In [None]:
import numpy as np
import pandas as pd 
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from iso3166 import countries
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

In [None]:
# Load the COVID-19 tweets CSV from the Kaggle input directory into the
# working frame used by every cell below.
df = pd.read_csv("/kaggle/input/covid19-tweets/covid19_tweets.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Percentage of missing values per column (two decimals), keeping only the
# columns that actually have gaps, ordered from least to most incomplete.
n_rows = len(df)
missed = pd.DataFrame({
    'column': df.columns,
    'percent': [
        round(100 * df[name].isnull().sum() / n_rows, 2)
        for name in df.columns
    ],
})
missed = missed.sort_values('percent').loc[lambda frame: frame['percent'] > 0]

fig = px.bar(
    missed,
    y="column", x='percent', orientation='h',
    width=600, height=400,
    title='Missed values percent for every column (percent > 0)',
)

fig.show()

In [None]:
# Tweets per self-reported user location, sorted ascending so that
# .tail(40) holds the 40 most frequent locations.
ds = df['user_location'].value_counts().reset_index()
ds.columns = ['user_location', 'count']
ds = ds.loc[ds['user_location'].ne('NA')].sort_values('count')

top_locations = ds.tail(40)
fig = px.bar(
    top_locations,
    y="user_location", x="count", orientation='h',
    height=800, width=800,
    title='Top 40 user locations by number of tweets',
)

fig.show()

In [None]:
def pie_count(data, field, percent_limit, title):
    """Draw a pie chart of how often each value of ``field`` occurs in ``data``.

    Missing values are counted under the label ``'NA'``. Every category whose
    share of the total is below ``percent_limit`` percent is merged into a
    single "Others" slice; that slice is only added when at least one row
    falls into it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to summarise. It is NOT modified.
    field : str
        Name of the column whose values are counted.
    percent_limit : int or float
        Minimum share (in percent) a category needs to get its own slice.
    title : str
        Title of the figure.

    Returns
    -------
    None
        The figure is rendered with plotly's ``iplot``.
    """
    # Fill missing values on a derived Series instead of assigning back into
    # `data`, so the caller's frame keeps its real NaNs. Working with the
    # counts as a Series also avoids depending on the column name that
    # `value_counts().to_frame()` produces, which changed in pandas 2.0.
    counts = data[field].fillna('NA').value_counts()

    total = counts.sum()
    percentage = 100 * counts / total

    main_counts = counts[percentage >= percent_limit]
    others_total = counts[percentage < percent_limit].sum()

    labels = main_counts.index.tolist()
    datavals = main_counts.tolist()

    # Append the aggregate slice only when something was actually merged;
    # otherwise the legend would show an empty "Others" entry.
    if others_total > 0:
        other_label = "Others(<" + str(percent_limit) + "% each)"
        labels.append(other_label)
        datavals.append(int(others_total))

    trace = go.Pie(labels=labels, values=datavals)

    layout = go.Layout(
        title=title,
        height=600,
        width=600
    )

    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)
    
# Share of tweets per user location; locations below 0.5% each are merged
# into a single "Others" slice.
pie_count(df, 'user_location', 0.5, 'Number of tweets per location')

In [None]:
# Rows with no hashtags are missing in the raw data; store them as '[]'.
df['hashtags'] = df['hashtags'].fillna('[]')

# The hashtag cell is a comma-separated string, so the tag count is the
# number of comma-separated pieces -- except for '[]', which holds none.
no_hashtags = df['hashtags'] == '[]'
pieces_per_tweet = df['hashtags'].str.split(',').str.len()
df['hashtags_count'] = pieces_per_tweet.where(~no_hashtags, 0)

df.head(10)

In [None]:
# Summary statistics for the number of hashtags per tweet.
df['hashtags_count'].describe()

In [None]:
# Parse the timestamps and put the tweets in chronological order.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['date'])

# Split the string form of each timestamp at the space: the first piece
# becomes the day, the second the time of day.
date_parts = df['date'].astype(str).str.split(' ', expand=True)
df['day'] = date_parts[0]
df['time'] = date_parts[1]
df.head()

In [None]:
# Number of tweets per calendar day, sorted ascending by count.
ds = df['day'].value_counts().reset_index()
ds.columns = ['day', 'count']
ds = (
    ds.sort_values('count')
      # A ':00:00:00' suffix is appended to every label -- presumably so the
      # axis shows plain category labels rather than parsed dates (TODO confirm).
      .assign(day=lambda frame: frame['day'].astype(str) + ':00:00:00')
)
fig = px.bar(
    ds,
    y="day", x='count', orientation='h',
    height=800, width=800,
    title='Tweets distribution over days present in dataset',
)
fig.show()

In [None]:
def split_hashtags(x):
    """Split a bracketed, comma-separated hashtag string into its raw pieces.

    The argument is converted with ``str`` first, every '[' and ']' is
    removed, and the remainder is split on commas. Pieces are returned
    untouched, so quotes and surrounding spaces are still present.
    """
    without_brackets = str(x).translate(str.maketrans('', '', '[]'))
    return without_brackets.split(',')

# One row per (tweet, hashtag): split the hashtag string and explode the list.
tweets_df = df.copy()
tweets_df['hashtag'] = tweets_df['hashtags'].apply(split_hashtags)
tweets_df = tweets_df.explode('hashtag')

# Normalise each tag: lower-case, without quotes or spaces.
hashtag_text = tweets_df['hashtag'].astype(str).str.lower()
hashtag_text = hashtag_text.str.replace("'", '').str.replace(" ", '')
tweets_df['hashtag'] = hashtag_text

# Tweets that had no hashtag end up with an empty string; label them explicitly.
is_blank = tweets_df['hashtag'] == ''
tweets_df.loc[is_blank, 'hashtag'] = 'NO HASHTAG'
tweets_df

In [None]:
# Frequency of every normalised hashtag, ascending so .tail(20) is the top 20.
ds = tweets_df['hashtag'].value_counts().reset_index()
ds.columns = ['hashtag', 'count']
ds = ds.sort_values('count')

top_hashtags = ds.tail(20)
fig = px.bar(
    top_hashtags,
    y='hashtag', x="count", orientation='h',
    height=700, width=800,
    title='Top 20 hashtags',
)
fig.show()

In [None]:
# Length of each tweet's text, in characters.
df['tweet_length'] = df['text'].str.len()

In [None]:
def build_wordcloud(df, title):
    """Render a word cloud of the most frequent words in a collection of texts.

    Parameters
    ----------
    df : pandas.Series, iterable of str, or str
        The texts to analyse (despite the name, callers pass a single column).
        Missing entries are ignored. A plain string is used as-is.
    title : str
        Title drawn above the cloud.

    Returns
    -------
    None
        The image is shown with matplotlib. If there is no text at all, a
        short note is printed and nothing is drawn.
    """
    # `str(series)` is only the truncated repr of the Series (a handful of
    # clipped rows plus index numbers and the "Name: ..., dtype: ..." footer),
    # so the cloud has to be built from the actual values joined together.
    if isinstance(df, str):
        text = df
    else:
        text = ' '.join(pd.Series(df, dtype=object).dropna().astype(str))

    # WordCloud cannot draw without words; report an empty selection (e.g. a
    # user filter that matched nothing) instead of failing or plotting noise.
    if not text.strip():
        print('No text available for word cloud: ' + str(title))
        return

    wordcloud = WordCloud(
        background_color='gray',
        stopwords=set(STOPWORDS),
        max_words=50,
        max_font_size=40,
        random_state=666
    ).generate(text)

    fig = plt.figure(1, figsize=(14,14))
    plt.axis('off')
    fig.suptitle(title, fontsize=16)
    fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
# Word cloud over the text column of the whole dataset.
build_wordcloud(df['text'], 'Prevalent words in tweets for all dataset')

In [None]:
# Word cloud restricted to the tweets of a single account.
test_df = df.loc[df['user_name'].eq('GlobalPandemic.NET')]
build_wordcloud(
    test_df['text'],
    'Prevalent words in tweets for GlobalPandemic.NET',
)

In [None]:
# Word cloud restricted to the tweets of a single account.
test_df = df.loc[df['user_name'].eq('covidnews.ch')]
build_wordcloud(
    test_df['text'],
    'Prevalent words in tweets for covidnews.ch',
)

In [None]:
# Word cloud restricted to the tweets of a single account.
test_df = df.loc[df['user_name'].eq('Open Letters')]
build_wordcloud(
    test_df['text'],
    'Prevalent words in tweets for Open Letters',
)

In [None]:
# Word cloud restricted to the tweets of a single account.
test_df = df.loc[df['user_name'].eq('Hindustan Times')]
build_wordcloud(
    test_df['text'],
    'Prevalent words in tweets for Hindustan Times',
)

In [None]:
# Word cloud restricted to the tweets of a single account.
test_df = df.loc[df['user_name'].eq('Blood Donors India')]
build_wordcloud(
    test_df['text'],
    'Prevalent words in tweets for Blood Donors India',
)

In [None]:
# Word cloud of the profile descriptions of every user in the dataset.
# The input is the whole `user_description` column, not one account's tweets,
# so the title has to say so.
build_wordcloud(df['user_description'], 'Prevalent words in user descriptions for all dataset')

In [None]:
# Use the second comma-separated piece of the free-form user location
# (whitespace trimmed) as the location key; rows without one stay missing.
location_parts = df['user_location'].str.split(',', expand=True)
df['location'] = location_parts[1].str.strip()

# Tweets per (day, location).
res = df.groupby(['day', 'location'])['text'].count().reset_index()

In [None]:
# Country name -> ISO 3166-1 alpha-3 code, taken from the iso3166 package.
country_dict = {c.name: c.alpha3 for c in countries}

# Spellings that are not official ISO country names but should still be
# attributed to a country.
country_list = ['England', 'United States', 'United Kingdom', 'London', 'UK']
gbr = ['England', 'UK', 'London', 'United Kingdom']
us = ['United States', 'NY', 'CA', 'GA']

# Work on a copy so the .loc assignments below never write into a slice.
res = res[res['location'].notnull()].copy()

# Translate official country names to their code; anything else keeps the
# raw location text for now.
res['alpha3'] = res['location'].map(country_dict).fillna(res['location'])

# Apply the manual aliases BEFORE filtering. Filtering first would discard
# 'NY', 'CA' and 'GA' (they are neither ISO names nor in `country_list`), so
# those rows would never reach the USA bucket.
res.loc[res['location'].isin(gbr), 'alpha3'] = 'GBR'
res.loc[res['location'].isin(us), 'alpha3'] = 'USA'

# Keep only rows that resolved to a country: the code is 'USA' (covers a
# literal 'USA' location), the location is a known alias, or the name was
# translated (code differs from the raw text).
res = res[
    (res['alpha3'] == 'USA') |
    (res['location'].isin(country_list)) |
    (res['location'] != res['alpha3'])
].copy()

# Collapse all aliases onto one display name per country, then re-aggregate
# so every (day, country) pair appears exactly once.
res.loc[res['alpha3'] == 'USA', 'location'] = 'USA'
res.loc[res['alpha3'] == 'GBR', 'location'] = 'United Kingdom'
plot = res.groupby(['day', 'location', 'alpha3'])['text'].sum().reset_index()
plot

In [None]:
# Animated world map: one frame per day, each country coloured by its
# number of tweets.
choropleth_options = dict(
    locations="alpha3",
    color="text",
    hover_name='location',
    animation_frame='day',
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Plasma,
    title='Tweets from different countries for every day',
    height=600,
    width=800,
)
fig = px.choropleth(plot, **choropleth_options)
fig.show()