In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings as ws
ws.filterwarnings("ignore")

In [None]:
# Load the IPL 2020 tweets dataset from the Kaggle input directory.
csv_path = "../input/ipl2020-tweets/IPL2020_Tweets.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to get a feel for the columns.
data.head(n=5)

In [None]:
# Keep the (rows, columns) tuple around and report it.
shape = data.shape
print(f"data shape {shape}")

# Dealing with missing values

In [None]:
# Percentage of missing values per column, as a two-column frame
# ("Columns", "missing value percentage") ready for plotting.
na_counts = data.isna().sum()
na_percent = na_counts / data.shape[0] * 100
check_na = na_percent.reset_index()
check_na = check_na.rename(columns={"index": "Columns", 0: "missing value percentage"})

In [None]:
# Bar chart of the share of missing values in each column.
plot = px.bar(
    check_na,
    x='Columns',
    y='missing value percentage',
    text='missing value percentage',
    title="Percent of missing values in the columns",
)
# Use a fixed-point label: the d3 "s" format type applies SI prefixes, which
# renders percentages below 1 as e.g. "430m" instead of "0.43".
plot.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
plot.show()

In [None]:
sns.set()
plt.figure(figsize=(10, 4))

# Five most frequent user locations. The label and value columns are named
# explicitly because the default column names produced by
# value_counts().reset_index() differ between pandas 1.x ("index", <series name>)
# and pandas 2.x (<series name>, "count").
top_locations = (
    data.user_location
    .value_counts()
    .head(5)
    .rename_axis("location")
    .reset_index(name="tweets")
)

sns.barplot(data=top_locations, x="location", y="tweets", palette="Spectral")
plt.ylabel("count of peoples")
plt.xlabel("Top Location in the data")
plt.title("Top user location in the data ", size = 10)
plt.show()

In [None]:
# For the purpose of EDA, missing user locations are replaced with "india".
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data.user_location` is a chained in-place update, which pandas deprecates
# and which leaves `data` unchanged when Copy-on-Write is enabled.
data["user_location"] = data["user_location"].fillna("india")

In [None]:
# Tweets without hashtags get the placeholder '[IPL2020]'.
# Assigned back explicitly rather than via a chained fillna(inplace=True),
# which pandas deprecates and which is a no-op under Copy-on-Write.
data["hashtags"] = data["hashtags"].fillna('[IPL2020]')

In [None]:
# Remaining null count per column after the fills above.
data.isna().sum(axis=0)

All NaN values are filled except the user description, which I preferred not to fill.

In [None]:
# Share of tweets coming from verified vs. non-verified accounts.
# The label and value columns are named explicitly because the default column
# names from value_counts().reset_index() differ between pandas 1.x and 2.x,
# so hard-coding 'index' / 'user_verified' only works on pandas 1.x.
temp = (
    data.user_verified
    .replace({True: "Verified", False: "Non-verfied"})
    .value_counts()
    .rename_axis("status")
    .reset_index(name="count")
)
fig = px.pie(
    temp,
    values="count",
    names="status",
    color_discrete_sequence=px.colors.sequential.RdBu,
    title="User Status",
)
fig.show()

In [None]:
# Followers (log-scaled x) against favourites, with marker size driven by
# favourites and colour by verification status.
scatter_options = dict(
    x="user_followers",
    y="user_favourites",
    size="user_favourites",
    color="user_verified",
    log_x=True,
    size_max=20,
)
fig = px.scatter(data_frame=data, **scatter_options)
fig.show()

In [None]:
# Share of retweets vs. original tweets.
# The flags are relabelled on the Series *before* counting: replacing
# True/False across the whole counted frame also targets the count column,
# where a count of 1 or 0 compares equal to True/False.
# Columns are named explicitly so the code does not depend on the
# pandas-version-specific defaults of value_counts().reset_index().
temp = (
    data.is_retweet
    .replace({False: "Not Retweeted", True: "ReTweeted"})
    .value_counts()
    .rename_axis("status")
    .reset_index(name="count")
)
fig = px.pie(
    temp,
    values="count",
    names="status",
    color_discrete_sequence=px.colors.sequential.RdBu,
    title="Retweeted or not",
)
fig.show()

In [None]:
# Nullity matrix of the whole frame.
# NOTE(review): this import lives mid-notebook rather than in the top import cell.
import missingno as mno
mno.matrix(df=data)

In [None]:
# Tweet counts per user location, most frequent first.
# Note: missing locations were filled with "india" earlier in the notebook, so
# that bucket also contains tweets whose location was not given.
ds = data['user_location'].value_counts().reset_index()
ds.columns = ['user_location', 'count']
ds = ds[ds['user_location'] != 'NA']
ds = ds.sort_values(['count'], ascending=False)

top_locations = ds.head(20)

# seaborn's barplot takes `orient`, not matplotlib's `orientation`; the latter
# is only forwarded to the underlying matplotlib call and is not part of the
# documented seaborn API.
ax = sns.barplot(data=top_locations, x="count", y="user_location", orient="h")
ax.set_title('Top 20 user locations by number of tweets')
plt.show()

In [None]:
# Calendar date (no time component) of every tweet; stored on `data` because
# later cells read it.
data['tweet_date'] = pd.to_datetime(data['date']).dt.date

# Number of tweets per day as a ('date', 'count') frame.
# rename_axis/reset_index(name=...) sets both column names explicitly: renaming
# {'index': 'date', 'tweet_date': 'count'} only works on pandas 1.x, while on
# pandas 2.x it produces two 'count' columns and no 'date' column.
tweet_date = (
    data['tweet_date']
    .value_counts()
    .rename_axis('date')
    .reset_index(name='count')
)
tweet_date['date'] = pd.to_datetime(tweet_date['date'])
tweet_date = tweet_date.sort_values('date', ascending=False)

In [None]:
# Line-and-marker trace of the daily tweet counts.
daily_trace = go.Scatter(
    x=tweet_date['date'],
    y=tweet_date['count'],
    mode='markers+lines',
    name="Submissions",
    marker=dict(color='dodgerblue'),
)
fig = go.Figure(data=daily_trace)


In [None]:
# First and last tweet dates for the title.
# min()/max() are used instead of `sort_values()[0]`: on an integer index,
# `[0]` is a label lookup and returns the date of the row labelled 0, not the
# earliest date in the sorted series.
tweet_days = pd.to_datetime(data['tweet_date'])
first_day = tweet_days.min().strftime("%d/%m/%Y")
last_day = tweet_days.max().strftime("%d/%m/%Y")

fig.update_layout(
    title_text='Tweets per Day : ({} - {})'.format(first_day, last_day),
    template="plotly_dark",
    title_x=0.5,
)

fig.show()
