In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

In [None]:
# Location of the tweets CSV, kept in one named place so it is easy to repoint.
TWEETS_CSV = '../input/pfizer-vaccine-tweets/vaccination_tweets.csv'

# Load the raw tweets into the frame that every later cell works from.
df = pd.read_csv(TWEETS_CSV)

In [None]:
# Preview the first rows of the loaded frame to sanity-check the parsed columns.
df.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the frame's columns.
df.describe()

In [None]:
# Report the dataset dimensions; `shape` is (rows, columns) for a DataFrame.
row_count, column_count = df.shape
print(f'There are {row_count} row and {column_count} columns in our dataset.')

In [None]:
# Tweet counts for the 15 most frequent `user_location` values.
user_loc = df.user_location.value_counts().head(15)

# Horizontal bar chart: counts on the x axis, location labels on the y axis.
# x/y are passed by keyword because recent seaborn releases made them
# keyword-only; the previous positional call raises a TypeError there.
plt.figure(figsize=(12,6))
sns.barplot(x=user_loc.values, y=user_loc.index, color='y')
plt.title('Top 15 countries with maximum tweets',fontsize=24)
plt.xlabel('Values', fontsize=15)
plt.ylabel('Countries', fontsize=12)
plt.show()

In [None]:
# Missing-value map: one cell per (row, column), coloured by whether it is null.
fig, ax = plt.subplots(figsize=(12, 6))
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='viridis', ax=ax)

In [None]:
def unique_values_funct(data_frame):
    """Summarise how many distinct values each column of a frame holds.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    pandas.DataFrame
        One row per input column with two columns: ``Features`` (the column
        name) and ``Uniques`` (the result of ``Series.nunique()`` for that
        column, which ignores missing values by default).
    """
    summary = pd.DataFrame()
    summary['Features'] = data_frame.columns
    summary['Uniques'] = [data_frame[name].nunique() for name in data_frame.columns]
    return summary

# Per-column distinct-value counts for the tweets frame.
udf = unique_values_funct(df)

# Bar chart of those counts, one bar per column, drawn on an explicit Axes.
f, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.barplot(x=udf['Features'], y=udf['Uniques'], alpha=0.8, ax=ax)
ax.set_title('Bar plot for unique values in each column', fontsize=14)
ax.set_ylabel('Unique values', fontsize=14)
ax.set_xlabel('Columns', fontsize=14)
# Column names are long, so rotate the tick labels to keep them legible.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Word count per tweet: split each text on whitespace and count the tokens.
# The previous `len(x)` measured the string length (characters), which did not
# match the column name or the plot title. The `.str` accessor also propagates
# missing text as NaN instead of raising.
df['number of words in text'] = df['text'].str.split().str.len()

# Density of the per-tweet word counts.
plt.figure(figsize=(12,6))
sns.kdeplot(x=df['number of words in text'], color='m')
plt.title('Number of words', fontsize=25)
plt.show()

In [None]:
# Tweet counts for the 20 most frequent `user_name` values.
user_n = df.user_name.value_counts().head(20)

# Horizontal bar chart: counts on the x axis, user names on the y axis.
# x/y are passed by keyword because recent seaborn releases made them
# keyword-only; the previous positional call raises a TypeError there.
plt.figure(figsize=(15,6))
sns.barplot(x=user_n.values, y=user_n.index, color='r')
plt.title('Top 20 user names with maximum number of tweets', fontsize=25, fontweight='bold')
plt.show()

In [None]:
# Turn the verification flag into readable labels for plotting.
# The earlier `df.replace(...)` calls discarded their result (replace returns a
# new frame), so the labels never reached the plot. Mapping into a separate
# Series applies them without mutating `df`.
# NOTE(review): boolean keys assume `user_verified` parsed as True/False; the
# string keys cover the case where it was read as text — confirm the dtype.
verified_labels = df['user_verified'].map({
    True: 'Verified',
    False: 'Not Verified',
    'True': 'Verified',
    'False': 'Not Verified',
})

# `x=` is passed by keyword: recent seaborn treats the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=verified_labels, palette='Set1')
plt.xlabel('Account status')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
# Tweet counts for the 10 most frequent `source` values.
src = df['source'].value_counts().head(10)

# Interactive bar chart of those counts.
# plotly express takes `data_frame` as its first positional argument and `x` as
# its second, so the old `px.bar(src.index, src.values)` put the counts on x and
# treated the labels as the data frame. Binding x/y by keyword fixes the axes.
# The matplotlib `plt.figure` call was dropped: plotly does not draw into it,
# so it only emitted an empty figure.
fig = px.bar(
    x=src.index,
    y=src.values,
    labels={'x': 'Source', 'y': 'Number of tweets'},
    title='Top 10 tweet sources',
)
fig.show()

In [None]:
# Tweet sources for rows whose `user_location` is exactly 'India'.
# `value_counts()` already returns counts in descending order, so the extra
# `sort_values(ascending=False)` was redundant.
pla = df.loc[df['user_location'] == 'India', 'source'].value_counts()
top_sources = pla.head(5)

# Wedge offsets: matplotlib requires `explode` to have one entry per wedge, so
# trim the template to however many sources actually exist (fewer than five
# used to raise a ValueError).
explode = (0, 0.1, 0, 0, 0.01)[:len(top_sources)]

if top_sources.empty:
    # Nothing to draw; avoid the plotting error on an empty selection.
    print('No tweets found for this location.')
else:
    plt.figure(figsize=(8,8))
    top_sources.plot(kind = 'pie', title = 'Most Tweet Sources used in India', autopct='%1.1f%%',shadow=True,explode = explode)
    plt.show()

In [None]:
# Tweet sources for rows whose `user_location` is exactly 'Malaysia'.
# `value_counts()` already returns counts in descending order, so the extra
# `sort_values(ascending=False)` was redundant.
pla = df.loc[df['user_location'] == 'Malaysia', 'source'].value_counts()
top_sources = pla.head(2)

# Wedge offsets: matplotlib requires `explode` to have one entry per wedge, so
# trim the template to however many sources actually exist (a single source
# used to raise a ValueError).
explode = (0, 0.1)[:len(top_sources)]

if top_sources.empty:
    # Nothing to draw; avoid the plotting error on an empty selection.
    print('No tweets found for this location.')
else:
    plt.figure(figsize=(8,8))
    top_sources.plot(kind = 'pie', title = 'Most Tweet Sources used in Malaysia', autopct='%1.1f%%',shadow=True,explode = explode)
    plt.show()

In [None]:
# Five most frequent values of the `hashtags` column, as {value: count}.
# `value_counts()` is already in descending order, so no further sort is needed.
hash_ = df.hashtags.value_counts().head(5).to_dict()

# Share of each of those five values among themselves (not among all tweets).
# Dict views are materialised into lists so matplotlib receives real sequences
# for both the wedge sizes and the labels.
plt.figure(figsize=(8,8))
plt.pie(list(hash_.values()), labels=list(hash_.keys()), autopct='%1.1f%%')
plt.title('Top 5 hashtag combinations')
plt.show()

In [None]:
# Parse the timestamp column, then derive calendar features with the
# vectorised `.dt` accessor instead of a row-wise `.apply`.
df["date"] = pd.to_datetime(df["date"])
df["Month"] = df["date"].dt.month

# pandas numbers weekdays 0 (Monday) to 6 (Sunday); map them to short names.
dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df["day"] = df["date"].dt.dayofweek.map(dmap)

# Tweets per weekday. `x=` is passed by keyword because recent seaborn treats
# the first positional argument as `data`. `order` keeps the bars in calendar
# order instead of order of first appearance.
fig, ax = plt.subplots()
sns.countplot(x=df["day"], order=list(dmap.values()), ax=ax)
ax.set_title("Day with maximun tweets")
plt.show()