<h1>Coronavirus COVID-19 Tweets</h1>


<img src="https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F769452%2F35db2dd68238bfd958efdabebc9fef8f%2Fcovid-19-4961257_1280-e1586986896105.jpg?generation=1595760042647275&alt=media" width="600"></img>


# Introduction


The dataset used here was collected through the Twitter API using the **tweepy** Python package.


# Data preparation

## Load packages

In [None]:
!pip install numpy
!pip install scipy
!pip install matplotlib
!pip install pandas
!pip install wordcloud
!pip install seaborn
!pip install sklearn
!pip install plotly

In [None]:
import numpy as np 
import pandas as pd
import matplotlib
import seaborn as sns
import plotly
import matplotlib.pyplot as plt
%matplotlib inline 
from wordcloud import WordCloud, STOPWORDS
pd.options.mode.chained_assignment = None 

## Load data

In [None]:
# Tweet-level records; the path is relative to the notebook's working directory.
tweets_df = pd.read_csv("../input/covid19-tweets/covid19_tweets.csv")
# Global confirmed-case and death time series, both read from the covid-cases input folder.
covid_confirmed_cases = pd.read_csv("../input/covid-cases/time_series_covid19_confirmed_global.csv")
covid_deaths = pd.read_csv("../input/covid-cases/time_series_covid19_deaths_global.csv")

# Data exploration


## Glimpse the data

In [None]:
print(f"data shape: {tweets_df.shape}")

In [None]:
tweets_df.info()

In [None]:
tweets_df.describe()

In [None]:
tweets_df.head()

### Missing data

In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per input column and three rows: 'Total' (number of
        nulls), 'Percent' (nulls as a percentage of the row count) and
        'Types' (the column dtype as a string).
    """
    null_counts = data.isnull().sum()
    null_percent = null_counts / len(data) * 100
    summary = pd.DataFrame({'Total': null_counts, 'Percent': null_percent})
    summary['Types'] = [str(dtype) for dtype in data.dtypes]
    return summary.T

In [None]:
missing_data(tweets_df)

### Unique values

In [None]:
def unique_values(data):
    """Report non-null and distinct value counts per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per input column and two rows: 'Total' (non-null
        count) and 'Uniques' (number of distinct non-null values).
    """
    summary = data.count().to_frame('Total')
    summary['Uniques'] = [data[name].nunique() for name in data.columns]
    return summary.T

In [None]:
unique_values(tweets_df)

### Most frequent values

In [None]:
def most_frequent_values(data):
    """Report the most frequent value of every column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per input column and four rows: 'Total' (non-null
        count), 'Most frequent item', 'Frequence' (how often that item
        occurs) and 'Percent from total' (frequency as a percentage of the
        non-null count, rounded to 3 decimals). A column holding only
        missing values yields NaN / 0 / NaN instead of raising.
    """
    total = data.count()
    items = []
    vals = []
    for col in data.columns:
        # value_counts() sorts by descending frequency; compute it once per column.
        counts = data[col].value_counts()
        if counts.empty:
            # Only missing values in this column: there is no "most frequent" item.
            items.append(np.nan)
            vals.append(0)
        else:
            items.append(counts.index[0])
            vals.append(counts.iloc[0])
    tt = pd.DataFrame({'Total': total})
    tt['Most frequent item'] = items
    tt['Frequence'] = vals
    tt['Percent from total'] = np.round(tt['Frequence'] / tt['Total'] * 100, 3)
    return tt.T

In [None]:
most_frequent_values(tweets_df)

## Visualize the data distribution

In [None]:
def plot_count(feature, title, df, size=1):
    """Bar chart of the 20 most frequent values of a column.

    Each bar is annotated with its share of all rows in ``df``.

    Parameters
    ----------
    feature : str
        Column of ``df`` to count.
    title : str
        Human-readable name inserted into the plot title.
    df : pandas.DataFrame
        Source data.
    size : int or float, default 1
        Width multiplier (figure width is ``4 * size``); x tick labels are
        rotated and shrunk when ``size > 2``.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4 * size, 4))
    total = float(len(df))
    top_values = df[feature].value_counts().index[:20]
    # Pass the vector as `x=`: newer seaborn releases no longer take it positionally.
    # Binding to `ax` keeps the bars on the axes created above.
    sns.countplot(x=df[feature], order=top_values, palette='Set3', ax=ax)
    ax.set_title("Number and percentage of {}".format(title))
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Label sits slightly above the bar, centred horizontally.
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")
    plt.show()

### User name

In [None]:
plot_count("user_name", "User name", tweets_df,4)

### User location

In [None]:
plot_count("user_location", "User location", tweets_df,4)

### Tweet source

In [None]:
plot_count("source", "Source", tweets_df,4)

In [None]:
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Render a word cloud for a collection of texts.

    Parameters
    ----------
    data : str or iterable of str (e.g. a pandas Series)
        Text to visualise. A plain string is used as-is; anything else is
        treated as a collection whose non-null items are joined with spaces.
    title : str, optional
        Figure title.
    """
    if isinstance(data, str):
        corpus = data
    else:
        # str(series) would only give pandas' display repr (truncated for long
        # Series, with index labels and the "Name: ..., dtype: ..." footer),
        # so build the corpus from the actual values instead.
        corpus = " ".join(pd.Series(data).dropna().astype(str))

    if not corpus.strip():
        # WordCloud.generate() cannot work with an empty corpus; skip the plot.
        print(f"No text available for word cloud: {title}")
        return

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=50,
        max_font_size=40,
        scale=5,
        random_state=1  # fixed seed so the layout is reproducible
    ).generate(corpus)

    fig = plt.figure(1, figsize=(10,10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
### Text wordclouds

In [None]:
show_wordcloud(tweets_df['text'], title = 'Prevalent words in tweets')

In [None]:
india_df = tweets_df.loc[tweets_df.user_location=="India"]
show_wordcloud(india_df['text'], title = 'Prevalent words in tweets from India')

In [None]:
us_df = tweets_df.loc[tweets_df.user_location=="United States"]
show_wordcloud(us_df['text'], title = 'Prevalent words in tweets from US')

In [None]:
# Named for what it holds, so the US subset in `us_df` is not silently overwritten.
uk_df = tweets_df.loc[tweets_df.user_location=="United Kingdom"]
show_wordcloud(uk_df['text'], title = 'Prevalent words in tweets from UK')

In [None]:
# Named for what it holds, so the US subset in `us_df` is not silently overwritten.
canada_df = tweets_df.loc[tweets_df.user_location=="Canada"]
show_wordcloud(canada_df['text'], title = 'Prevalent words in tweets from Canada')

In [None]:
# Named for what it holds, so the India subset in `india_df` is not silently overwritten.
south_africa_df = tweets_df.loc[tweets_df.user_location=="South Africa"]
show_wordcloud(south_africa_df['text'], title = 'Prevalent words in tweets from South Africa')

In [None]:
# Named for what it holds, so the India subset in `india_df` is not silently overwritten.
switzerland_df = tweets_df.loc[tweets_df.user_location=="Switzerland"]
show_wordcloud(switzerland_df['text'], title = 'Prevalent words in tweets from Switzerland')

In [None]:
# Named for what it holds, so the US subset in `us_df` is not silently overwritten.
london_df = tweets_df.loc[tweets_df.user_location=="London"]
show_wordcloud(london_df['text'], title = 'Prevalent words in tweets from London')

### Hashtags analysis

In [None]:
def plot_features_distribution(features, title, df, isLog=False):
    """Overlay kernel-density curves for one or more numeric columns.

    Parameters
    ----------
    features : iterable of str
        Column names of ``df`` to plot; each gets its own labelled curve.
    title : str
        Plot title.
    df : pandas.DataFrame
        Source data.
    isLog : bool, default False
        When True, values are transformed with ``log1p`` before plotting.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.set_title(title)
    for feature in features:
        values = df[feature].dropna()
        if isLog:
            values = np.log1p(values)
        # kdeplot replaces the deprecated distplot(kde=True, hist=False); the
        # former `bins` argument only affected the histogram that was never drawn.
        sns.kdeplot(values, label=feature, ax=ax)
    ax.set_xlabel('')
    ax.legend()
    plt.show()


In [None]:
# Tweets without hashtags are NaN in the raw data; fill them with a placeholder
# so every entry is a string.
NO_HASHTAGS_PLACEHOLDER = "['None']"
tweets_df['hashtags'] = tweets_df['hashtags'].fillna(NO_HASHTAGS_PLACEHOLDER)
# Strip literal "\N" markers (plain-text match, not a regex).
tweets_df['hashtags'] = tweets_df['hashtags'].str.replace('\\N', '', regex=False)
# Number of comma-separated entries per tweet; the placeholder means "no hashtags"
# and must count as 0 rather than 1.
tweets_df['hashtags_count'] = (
    tweets_df['hashtags'].str.split(',').str.len()
    .where(tweets_df['hashtags'] != NO_HASHTAGS_PLACEHOLDER, 0)
)
plot_features_distribution(['hashtags_count'], 'Hashtags per tweet (all data)', tweets_df)

In [None]:
from itertools import chain

# The entries look like stringified lists (the NaN placeholder is "['None']"), so a bare
# split(',') leaves brackets, quotes and spaces attached to the tokens and the same tag is
# counted under several spellings. Strip that decoration from every token.
tweets_df['hashtags_individual'] = tweets_df['hashtags'].apply(
    lambda x: [tag.strip(" []'\"") for tag in x.split(',')]
)
all_hashtags = set(chain.from_iterable(tweets_df['hashtags_individual']))
# 'None' is the placeholder for tweets without hashtags and '' comes from empty
# entries; neither is a real hashtag.
all_hashtags -= {'None', ''}
print(f"There are totally: {len(all_hashtags)}")

In [None]:
country_df = pd.read_csv("../input/country-code/datasets_403474_773844_wikipedia-iso-country-codes.csv")

In [None]:
country_df.columns = ["country", "alpha2", "alpha3", "numeric", "iso"]
country_df.head()

In [None]:
tweets_df['country'] = tweets_df['user_location']

In [None]:
# NOTE(review): merge() defaults to an inner join, so every tweet whose user_location is not
# exactly equal to a name in country_df is dropped from tweets_df from this cell onward
# (this also affects the later per-day counts) — confirm this is intended.
tweets_df = tweets_df.merge(country_df, on="country")

In [None]:
tweets_df.head(10)

In [None]:
# Non-null tweet texts per country; the ISO identifiers ride along as group keys.
tw_add_df = (
    tweets_df
    .groupby(["country", "iso", "alpha3"])["text"]
    .count()
    .rename("tweets")
    .reset_index()
)

In [None]:
import plotly.express as px

def plot_map(dd_df, title):
    """Draw a world choropleth of tweet counts per country.

    Parameters
    ----------
    dd_df : pandas.DataFrame
        Must contain the columns 'country', 'tweets', 'iso' and 'alpha3'.
        The frame is not modified; the hover text is added to a copy.
    title : str
        Figure title.
    """
    hover_text = [
        (f"country: {row['country']}<br>tweets: {row['tweets']}\
                          <br>country code: {row['iso']}<br>country alpha3: {row['alpha3']}")
        for row in dd_df.to_dict("records")
    ]
    # assign() returns a new frame, so the caller's data does not gain a column.
    plot_df = dd_df.assign(hover_text=hover_text)

    fig = px.choropleth(plot_df,
                        locations="alpha3",
                        hover_name='hover_text',
                        color="tweets",
                        projection="natural earth",
                        color_continuous_scale=px.colors.sequential.Plasma,
                        width=900, height=700)
    fig.update_geos(
        showcoastlines=True, coastlinecolor="DarkBlue",
        showland=True, landcolor="LightGrey",
        showocean=True, oceancolor="LightBlue",
        showlakes=True, lakecolor="Blue",
        showrivers=True, rivercolor="Blue",
        showcountries=True, countrycolor="DarkBlue"
    )
    fig.update_layout(title = title, geo_scope="world")
    fig.show()

In [None]:
plot_map(tw_add_df, "Tweets per country (where country is specified)")

In [None]:
tweets_df['datedt'] = pd.to_datetime(tweets_df['date'])


In [None]:
# Expand the parsed timestamp into calendar components, one column per component.
_datedt_accessor = tweets_df['datedt'].dt
for _part in ('year', 'month', 'day', 'dayofweek', 'hour', 'minute', 'dayofyear'):
    tweets_df[_part] = getattr(_datedt_accessor, _part)
# Plain date (time of day removed), used for the per-day aggregation.
tweets_df['date_only'] = _datedt_accessor.date

In [None]:
# Non-null tweet texts per calendar day.
tweets_agg_df = (
    tweets_df
    .groupby(["date_only"])["text"]
    .count()
    .rename("count")
    .reset_index()
)

In [None]:
def plot_time_variation(df, x='date_only', y='count', hue=None, size=1, title="", is_log=False):
    """Line plot of ``y`` against ``x``, optionally split by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    x, y : str
        Column names for the horizontal and vertical axes.
    hue : str, optional
        Column used to draw one line per group.
    size : int or float, default 1
        Scale factor; the figure is ``4*size`` wide and ``3*size`` tall.
    title : str, default ""
        Text appended to the generated plot title.
    is_log : bool, default False
        Use a logarithmic y axis.
    """
    _, ax = plt.subplots(1, 1, figsize=(4 * size, 3 * size))
    sns.lineplot(x=x, y=y, hue=hue, data=df)
    plt.xticks(rotation=90)
    heading = f'{y} grouped by {hue} | {title}' if hue else f'{y} | {title}'
    plt.title(heading)
    if is_log:
        ax.set(yscale="log")
    ax.grid(color='black', linestyle='dotted', linewidth=0.75)
    plt.show()

In [None]:
plot_time_variation(tweets_agg_df, title="Number of tweets / day of year",size=3)

### Sanitization/Re-formatting of Data

In [None]:
#Rename state columns
covid_deaths = covid_deaths.rename(columns={"Province/State":"state","Country/Region": "country"})
covid_confirmed_cases = covid_confirmed_cases.rename(columns={"Province/State":"state","Country/Region": "country"})

#Changing the conuntry names as required by pycountry_convert Lib
covid_deaths.loc[covid_deaths['country'] == "US", "country"] = 'USA'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "US", "country"] = "USA"
                          
covid_deaths.loc[covid_deaths['country'] == 'Korea, South', "country"] = 'South Korea'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "Korea, South", "country"] = "South Korea"

covid_deaths.loc[covid_deaths['country'] == 'Taiwan', "country"] = 'Taiwan'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "Taiwan*", "country"] = "Taiwan"
  
covid_deaths.loc[covid_deaths['country'] == 'Congo (Kinshasa)', "country"] = 'Democratic Republic of the Congo'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "Congo (Kinshasa)", "country"] = "Democratic Republic of the Congo"
    
covid_deaths.loc[covid_deaths['country'] == "Cote d'Ivoire", "country"] = 'Côte d Ivoire'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "Cote d'Ivoire", "country"] = "Côte d'Ivoire"

covid_deaths.loc[covid_deaths['country'] == "Reunion", "country"] = 'Réunion'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "Reunion", "country"] = "Réunion"
  
covid_deaths.loc[covid_deaths['country'] == 'Congo (Brazzaville)', "country"] = 'Republic of the Congo'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "Congo (Brazzaville)", "country"] = "Republic of the Congo"
  
covid_deaths.loc[covid_deaths['country'] == 'Bahamas, The', "country"] = 'Bahamas'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "Bahamas, The", "country"] = "Bahamas"

covid_deaths.loc[covid_deaths['country'] == 'Gambia, The', "country"] = 'Gambia'
covid_confirmed_cases.loc[covid_confirmed_cases['country'] == "Gambia, The", "country"] = "Gambia"

#Copy the death statistics (USA ONLY) to a new variable andstrip out the continent and latlng
covid_death_cases = covid_deaths.copy().drop(["Lat","Long","state"], axis=1)
covid_confirmed_cases = covid_confirmed_cases.copy().drop(["Lat","Long","state"], axis=1)

#Set the index of the pandas dataframe
covid_death_cases = covid_death_cases.set_index(["country"])
covid_confirmed_cases = covid_confirmed_cases.set_index(["country"])

#Convert column headers to date time
covid_death_cases.columns = pd.to_datetime(covid_death_cases.columns)
covid_confirmed_cases.columns = pd.to_datetime(covid_confirmed_cases.columns)


In [None]:
# Columns not needed for the per-country / per-day analysis below.
_unused_tweet_columns = [
    "user_name",
    "user_description",
    "user_created",
    "user_followers",
    "user_friends",
    "user_favourites",
    "user_verified",
    "text",
    "hashtags",
    "source",
    "is_retweet",
    "hashtags_count",
    "hashtags_individual",
]
# drop() returns a new frame, so tweets_df itself is left untouched.
trimmed_tweets = tweets_df.drop(columns=_unused_tweet_columns)

# Tokens that mark a location as being in the USA: the literal "USA" plus the
# two-letter codes of the 50 states (the original list was missing Vermont, "VT").
_USA_TOKENS = frozenset([
    "USA", "AL", "AK", "AZ", "AR",
    "CA", "CO", "CT", "DE", "FL",
    "GA", "HI", "ID", "IL", "IN",
    "IA", "KS", "KY", "LA", "ME",
    "MD", "MA", "MI", "MN", "MS",
    "MO", "MT", "NE", "NV", "NH",
    "NJ", "NM", "NY", "NC", "ND",
    "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT",
    "VT", "VA", "WA", "WV", "WI", "WY",
])


def user_in_usa(location):
    """Return True when a free-text location mentions "USA" or a US state code.

    Parameters
    ----------
    location : any
        Profile location; it is converted with ``str()`` first, so NaN or
        None simply yield False.

    Returns
    -------
    bool
        True if any token of the location is an exact, case-sensitive match
        for "USA" or a two-letter state code.
    """
    import re  # local import: the notebook's import cell does not bring `re` in

    # Split on runs of non-word characters so that "NY," or "Seattle,WA" are
    # tokenised as well as the space- and hyphen-separated forms.
    tokens = re.split(r"\W+", str(location))
    return any(token in _USA_TOKENS for token in tokens)

#Filter USA only tweets (vectorised comparison; .copy() gives an independent frame
#so the column assignment below does not write into a slice of trimmed_tweets)
usa_tweets = trimmed_tweets.loc[trimmed_tweets['alpha2'] == "US"].copy()

#Trim time away from date (text before the first space) and convert to date format
usa_tweets['date'] = pd.to_datetime(usa_tweets['date'].str.split(" ").str[0])

#Daily tweet count
usa_daily_counts = usa_tweets.groupby(usa_tweets['date'].dt.date).size()

#convert the series to a dataframe with a DatetimeIndex
usa_daily_counts = usa_daily_counts.to_frame()
usa_daily_counts.index = pd.to_datetime(usa_daily_counts.index)
usa_daily_counts

In [None]:
# Only the last expression of a cell is rendered automatically, so show both previews explicitly.
display(covid_death_cases.head())
display(covid_confirmed_cases.head())

In [None]:
#Transpose the dataframe so that rows are columns and the columns are rows, order the dataframe by column name
t_covid_death = covid_death_cases.T
t_covid_death.sort_index(axis=1, inplace=True)

t_covid_confirm = covid_confirmed_cases.T
t_covid_confirm.sort_index(axis=1, inplace=True)

# Resample the dataset and concatenate it so that data is aggregated weekly
t_covid_death_weekly = t_covid_death.resample('D').sum()
t_covid_confirmed_weekly = t_covid_confirm.resample('D').sum()
usa_daily_counts = usa_daily_counts.resample('D').sum()

#Isolate USA data
w_usa_deaths = t_covid_death_weekly['USA']
w_usa_confirmed = t_covid_confirmed_weekly['USA']

#Join the deaths and confirmed cases into one df
usa_data = pd.concat([w_usa_confirmed, w_usa_deaths, usa_daily_counts], axis=1, ignore_index=True)

#Rename the columns for readability
usa_data.columns = ['confirmed', 'deaths', 'tweets']

#Fill NAN with 0
usa_data = usa_data.fillna(0)
usa_data

In [None]:
t_covid_death.head()

### Correlation Analysis

In [None]:
usa_data.plot.line(logy=True)

# Last point in the dataset is an anomaly because the dataset is incomplete for that week

In [None]:
# Keep only rows dated after 20 July 2020 (strict comparison, so that timestamp itself is excluded).
usa_data_after_july = usa_data.loc[(usa_data.index > '2020-7-20')]
usa_data_after_july

In [None]:
usa_data_after_july.plot.line(logy=True)
