In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import STOPWORDS, WordCloud


In [None]:
# Load the COVID-19 tweets CSV into `df`, the frame every later cell works on.
df = pd.read_csv('../input/covid19-tweets/covid19_tweets.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Percentage of missing values per column, rounded to two decimals and
# ordered from least to most affected; columns without gaps are dropped.
missed = (df.isnull().sum() * 100 / len(df)).round(2).sort_values()
missed = missed[missed > 0]

# The value column is spelled 'percent' (it used to be 'precent'): plotly
# prints the column name as the axis label, so the typo showed up in the chart.
missed_df = pd.DataFrame({
    'feature_name': missed.index,
    'percent': missed.values,
})

fig = px.bar(
    missed_df,
    x='percent',
    y='feature_name',
    height=400,
    width=600,
    title='Missed values percent for every column (percent > 0)',
)
fig.show()

Let's look at the most active users, ranked by number of tweets.

In [None]:
# Tweets per user, ascending, so the most active accounts end up at the tail.
# `ds` keeps the columns ['user_name', 'tweets_count'] that the merge cell reads.
tweets_per_user = df['user_name'].value_counts()
ds = pd.DataFrame({
    'user_name': tweets_per_user.index,
    'tweets_count': tweets_per_user.values,
})
ds = ds.sort_values(by='tweets_count')

# Horizontal-looking bar chart of the 30 most active users.
top_users = ds.tail(30)
fig = px.bar(
    top_users,
    x='tweets_count',
    y='user_name',
    title='Top 30 users by number of tweets',
    width=800,
    height=800,
)
fig.show()

In [None]:
# Attach each author's total number of tweets to every tweet row.
#
# The counts are recomputed here instead of being read from `ds`, because
# later cells rebind `ds` to unrelated tables. Any existing `tweets_count`
# column is dropped first, so re-running this cell does not create
# `tweets_count_x` / `tweets_count_y` duplicates.
user_tweet_counts = (
    df['user_name']
    .value_counts()
    .rename_axis('user_name')
    .reset_index(name='tweets_count')
)
df = pd.merge(
    df.drop(columns='tweets_count', errors='ignore'),
    user_tweet_counts,
    on='user_name',
    how='inner',
)

And the users with the most followers.

In [None]:
# One row per user: sorting by followers (descending) before de-duplicating
# keeps each user's row with the highest follower count. The final ascending
# sort puts the largest accounts at the tail.
data = (
    df.sort_values('user_followers', ascending=False)
    .drop_duplicates(subset='user_name', keep="first")
    .loc[:, ['user_name', 'user_followers', 'tweets_count']]
    .sort_values('user_followers')
)

# Bars are coloured by how many tweets the user has in the dataset.
most_followed = data.tail(40)
fig = px.bar(
    most_followed,
    x='user_followers',
    y='user_name',
    color='tweets_count',
    title='Top 40 users by number of followers',
    width=800,
    height=800,
)
fig.show()

In [None]:
# Parse the account-creation column as datetimes; the result is only displayed
# as the cell output and is not stored anywhere.
pd.to_datetime(df['user_created'])

Let's see how the coronavirus pandemic affected the creation of new user accounts.

In [None]:
# Year in which each tweet's author created their account.
df['year_created'] = pd.to_datetime(df['user_created']).dt.year

# Count each user once, and only for creation years after 2006.
unique_users = df.drop_duplicates(subset='user_name', keep="first")
years_after_2006 = unique_users.loc[
    unique_users['year_created'] > 2006, 'year_created'
]
data = years_after_2006.value_counts().reset_index()
data.columns = ['year', 'number']

fig = px.bar(
    data,
    x="year",
    y="number",
    orientation='v',
    title='User created year by year',
    width=800,
    height=600,
)
fig.show()

Let's see the top 40 most popular locations by number of tweets.

In [None]:
# Tweets per self-reported user location, ascending by count.
tweets_per_location = df['user_location'].value_counts()
ds = pd.DataFrame({
    'user_location': tweets_per_location.index,
    'count': tweets_per_location.values,
})
ds = ds.sort_values(by='count')

# The 40 most frequent locations are the last 40 rows of the ascending table.
top_locations = ds.tail(40)
fig = px.bar(
    top_locations,
    x="count",
    y="user_location",
    orientation='h',
    title='Top 40 user locations by number of tweets',
    width=800,
    height=800,
)
fig.show()

Now it's time to check the last categorical feature: `source`. Let's see the top 40 sources by number of tweets.

In [None]:
# Tweets per posting source, ascending by count.
ds = (
    df['source']
    .value_counts()
    .rename_axis('source')
    .reset_index(name='count')
    .sort_values(by='count')
)

# The 40 most used sources are the last 40 rows of the ascending table.
top_sources = ds.tail(40)
fig = px.bar(
    top_sources,
    x="count",
    y="source",
    orientation='h',
    title='Top 40 user sources by number of tweets',
    width=800,
    height=800,
)
fig.show()

Next, split the tweet date into separate day and time columns.

In [None]:
# Parse the tweet timestamps and order the frame chronologically.
df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date')

# Split the timestamp's string form on the space once: part 0 becomes the
# day column, part 1 the time column.
date_parts = df['date'].astype(str).str.split(' ', expand=True)
df['day'] = date_parts[0]
df['time'] = date_parts[1]
df.head()

Now let's check how many tweets were posted on each day in the dataset.

In [None]:
# Tweets per calendar day, ascending by count.
ds = (
    df['day']
    .value_counts()
    .rename_axis('day')
    .reset_index(name='count')
    .sort_values(by='count')
)

# Append a ':00:00:00' suffix to every day label.
# NOTE(review): presumably this keeps plotly from treating the labels as
# dates on the y axis; confirm before changing the suffix.
ds = ds.assign(day=ds['day'].astype(str) + ':00:00:00')

fig = px.bar(
    ds,
    x='count',
    y="day",
    orientation='h',
    title='Tweets distribution over days present in dataset',
    width=800,
    height=800,
)
fig.show()

Let's do the same for hours.

In [None]:
# Hour of day at which each tweet was posted.
df['hour'] = df['date'].dt.hour

# Count tweets per hour and order the rows by hour, not by frequency.
# The labels below are strings, so plotly draws the bars in row order;
# without sort_index() the x axis would follow value_counts' count order
# instead of running chronologically.
ds = (
    df['hour']
    .value_counts()
    .sort_index()
    .rename_axis('hour')
    .reset_index(name='count')
)
ds['hour'] = 'Hour ' + ds['hour'].astype(str)

fig = px.bar(
    ds,
    x="hour",
    y="count",
    orientation='v',
    title='Tweets distribution over hours',
    width=800,
)
fig.show()

Let's create a new feature, `hashtags_count`, that shows how many hashtags each tweet contains.

In [None]:
# Tweets without hashtags are stored as missing values; represent them with
# the same '[]' text used for an empty list.
hashtags_filled = df['hashtags'].fillna('[]')
df['hashtags'] = hashtags_filled

# Number of comma-separated entries in the hashtag text, forced to zero for
# '[]' (which would otherwise count as one entry).
has_no_hashtags = hashtags_filled == '[]'
entries_per_tweet = hashtags_filled.map(lambda tags: len(tags.split(',')))
df['hashtags_count'] = entries_per_tweet.mask(has_no_hashtags, 0)

df.head(10)

In [None]:
# Summary statistics of the per-tweet hashtag count.
df['hashtags_count'].describe()

Now let's calculate the length of every tweet in the dataset.

In [None]:
# Length of each tweet's text, in characters.
tweet_text = df['text']
df['tweet_length'] = tweet_text.str.len()

# Histogram of tweet lengths over the whole dataset.
histogram_options = dict(
    x="tweet_length",
    nbins=80,
    title='Tweet length distribution',
    width=800,
    height=700,
)
fig = px.histogram(df, **histogram_options)
fig.show()

Let's see a general word cloud for the `text` column.

In [None]:
def build_wordcloud(df, title):
    """Draw a word cloud of the most frequent words in the given text.

    Parameters
    ----------
    df : str or iterable of str
        The text to visualise. Despite the parameter name, the expected input
        is a pandas Series of texts such as ``df['text']``; a single string is
        used as is. Missing values are skipped.
    title : str
        Title shown above the figure.

    Notes
    -----
    The texts are joined into one corpus instead of calling ``str()`` on the
    Series: the Series repr is truncated to a few head/tail rows and includes
    the ``Name: ... dtype: ...`` footer, so the cloud would not reflect the
    actual data.
    """
    if isinstance(df, str):
        corpus = df
    else:
        corpus = ' '.join(pd.Series(df).dropna().astype(str))

    # At most 50 words on a gray background, with the package's default stop
    # words removed and a fixed random_state.
    wordcloud = WordCloud(
        background_color='gray',
        stopwords=set(STOPWORDS),
        max_words=50,
        max_font_size=40,
        random_state=666
    ).generate(corpus)

    fig = plt.figure(1, figsize=(14, 14))
    plt.axis('off')
    fig.suptitle(title, fontsize=16)
    fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()


# The call lives at module level: indented inside the function body it was
# never executed by this cell and would have recursed endlessly if it had been.
build_wordcloud(df['text'], 'Prevalent words in tweets for all dataset')

In [None]:
# Learn a TF-IDF vocabulary from the tweet texts (English stop words removed)
# and vectorise the same texts in one call; `vec` stays fitted afterwards.
tweet_texts = df['text'].values
vec = TfidfVectorizer(stop_words="english")
features = vec.fit_transform(tweet_texts)

In [None]:
# Display the TF-IDF feature matrix built from the tweet texts.
features