In [None]:
import sqlite3
import pandas as pd

from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud
import re
import collections

In [None]:
# Open the SQLite database file 'twitter.sqlite' (the path is relative to the
# current working directory) and create the cursor that runs the query below.
conn = sqlite3.connect('twitter.sqlite')
c = conn.cursor()

In [None]:
# Select every row of the `tweets` table.
# The statement only reads, so there is nothing to commit; the rows are
# fetched straight after the query is executed.
c.execute('SELECT * FROM tweets')

# pull the complete result set into a list of row tuples
sql_results = c.fetchall()

# preview the first five rows
sql_results[:5]

In [None]:
# Close the database connection; the rows fetched earlier remain available
# in the `sql_results` list.
conn.close()

In [None]:
# Positions of the columns kept from the raw query result, mapped to the
# names they get in the frame.
kept_columns = {0: 'user_name', 3: 'timestamp_ms', 4: 'message'}

# Build the frame from the fetched rows, keeping only those columns ...
tweets = pd.DataFrame(sql_results).iloc[:, list(kept_columns)]

# ... and label them.
tweets.columns = list(kept_columns.values())

tweets.head()

In [None]:
def _ms_to_datetime(timestamp_ms):
    """Turn a millisecond epoch timestamp into a naive local-time datetime."""
    return datetime.fromtimestamp(int(timestamp_ms) / 1000)


# derive a `datetime` column from the millisecond timestamps
tweets['datetime'] = tweets['timestamp_ms'].map(_ms_to_datetime)
tweets.head()

In [None]:
# Count the messages per user, order the users by that count (highest first)
# and keep the first 20 rows.
top_20 = (
    tweets
    .groupby('user_name')['message']
    .count()
    .reset_index()
    .sort_values('message', ascending=False)
    .head(20)
)
top_20

In [None]:
# Bar chart of the number of tweets sent by each user in `top_20`.
# The axes are created explicitly and handed to seaborn, so the plot does not
# depend on whichever figure happens to be current.
fig, ax = plt.subplots()
sns.barplot(x='user_name', y='message', data=top_20, color='lightblue', ax=ax)
ax.set_title('Number of tweets sent')
ax.set_xlabel('Twitter user')
ax.set_ylabel('Sent tweets')

# rotate the user-name tick labels by 90 degrees
ax.tick_params(axis='x', labelrotation=90)

# render the figure without echoing a return value as cell output
plt.show()

In [None]:
# Load the Polish stop-word file (no header row) and turn its first column
# into a plain Python list.
stop_words_frame = pd.read_csv("polish_stopwords.txt", header=None)
pl_stop_words = stop_words_frame[0].tolist()

In [None]:
# Join all tweets into one string.
# str.join raises TypeError on anything that is not a string, so missing
# messages (None/NaN) are dropped and remaining values are converted with str().
my_text = " ".join(tweets['message'].dropna().astype(str))

# remove twitter nicks ("@" followed by word characters)
my_text = re.sub(r"@\w+", " ", my_text)

# remove urls (runs of non-space characters following www, http: or https:)
my_text = re.sub(r"(www|http:|https:)+[^\s]+[\w]", " ", my_text)

# lower-case everything
my_text = my_text.lower()

In [None]:
# Configure the cloud: white 1024x1024 canvas, at most 200 words, fonts capped
# at size 50, Polish stop words excluded.
wordcloud = WordCloud(
    background_color='white',
    width=1024,
    height=1024,
    max_words=200,
    max_font_size=50,
    stopwords=pl_stop_words,
)

# lay out the words found in `my_text`
wordcloud.generate(my_text)

# show the rendered cloud on a 10x10 inch figure
plt.figure(figsize=[10, 10], dpi=90)
plt.imshow(wordcloud, interpolation='bilinear')

In [None]:
# Join all tweets into one string again, this time without any cleaning, so
# the @mentions are still in the text. Missing messages (None/NaN) are skipped
# because str.join raises TypeError on anything that is not a string.
# NOTE: this rebinds `my_text` to the uncleaned text.
my_text = " ".join(tweets['message'].dropna().astype(str))

# every @mention found in the text, one list entry per occurrence
nicks = re.findall(r"@\w+", my_text)

# how many times each mention occurs
# NOTE(review): counting is case-sensitive ("@Foo" and "@foo" are separate
# keys) — confirm that this is intended.
nicks_count = collections.Counter(nicks)

# Word cloud built from the frequency dictionary. No `stopwords` argument is
# passed: WordCloud applies stop words only when it tokenizes raw text, not
# when it is handed ready-made frequencies.
wordcloud = WordCloud(width=1024, height=1024,
                      max_font_size=50,
                      max_words=50,
                      background_color='white').generate_from_frequencies(nicks_count)

plt.figure(figsize=[10, 10], dpi=90)
plt.imshow(wordcloud, interpolation='bilinear')