In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plot
import matplotlib.pyplot as plt

# wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# files: list the contents of the tweets input directory via a shell escape
!ls ../input/data-science-tweets/tweets/

In [None]:
# read the tweets CSV into df and report how long the load took
# perf_counter is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock, which can be adjusted mid-run
t1 = time.perf_counter()
df = pd.read_csv('../input/data-science-tweets/tweets/data_science.csv')
t2 = time.perf_counter()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# parse the date column into pandas datetimes (replaces the column in df)
df['date'] = pd.to_datetime(df['date'])
# derive calendar year and month of each tweet as separate columns
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

In [None]:
# show structure: print pandas' summary of df, which by this point also
# holds the derived year and month columns
df.info()

# General Explorations

In [None]:
# year distribution: tweet count per year, bars ordered by year
year_counts = df['year'].value_counts().sort_index()
ax = year_counts.plot(kind='bar')
ax.set_title('Year of Tweet')
ax.grid()
plt.show()

In [None]:
# month distribution: tweet count per calendar month, bars ordered by month
month_counts = df['month'].value_counts().sort_index()
ax = month_counts.plot(kind='bar')
ax.set_title('Month of Tweet')
ax.grid()
plt.show()

In [None]:
# language frequencies: tweet count per language, most frequent first
# (value_counts sorts descending); wide figure so all bars fit side by side
plt.figure(figsize=(16, 4))
language_counts = df['language'].value_counts()
ax = language_counts.plot(kind='bar')
ax.set_title('Language')
ax.grid()
plt.show()

In [None]:
# detailed counts: the numbers behind the language bar chart
df['language'].value_counts()

# Evaluate Text

In [None]:
# preview: display the tweet column
df['tweet']

In [None]:
# prepare text: all tweets concatenated into one blank-separated string
# missing values come out of read_csv as float NaN and str.join only accepts
# strings, so rows without a tweet are dropped before joining
text = " ".join(df['tweet'].dropna().astype(str))

# stopwords: wordcloud's built-in list plus user defined ones
# (presumably remnants of shortened links and HTML-escaped ampersands)
stopwords = set(STOPWORDS) | {'https', 't', 'co', '&amp;', 'amp'}

### Wordcloud

In [None]:
# show wordcloud: one cloud built from the text of all tweets
# n_words caps how many words are drawn; the per-year cell reads it as well
n_words = 250
wordcloud = WordCloud(
    background_color='white',
    width=600,
    height=400,
    max_words=n_words,
    max_font_size=50,
    stopwords=stopwords,
    collocations=False,
).generate(text)

# render the cloud as an image without axis ticks or frame
fig, ax = plt.subplots(figsize=(14, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

### Wordcloud by Year

In [None]:
# one word cloud per year
# iterate over the years that actually occur in the data instead of a fixed
# 2010-2021 range: a year without tweets yields an empty text, for which
# WordCloud.generate raises, and years outside the range would be left out
# groupby returns the groups sorted by year and skips rows with a missing year
for year, tweets_year in df.groupby('year')['tweet']:
    # int() keeps the label free of a trailing '.0' if the year column is float
    print('Year =', int(year), ':')
    # separate names so the corpus-wide `text` / `wordcloud` are not overwritten;
    # missing tweets are dropped because str.join only accepts strings
    text_year = ' '.join(tweets_year.dropna().astype(str))
    wordcloud_year = WordCloud(stopwords=stopwords, max_font_size=50, max_words=n_words,
                               width=600, height=400,
                               collocations=False,
                               background_color='white').generate(text_year)
    fig, ax = plt.subplots(figsize=(14,10))
    ax.imshow(wordcloud_year, interpolation='bilinear')
    ax.axis('off')
    plt.show()
    # release the figure so figures do not pile up in memory across iterations
    plt.close(fig)