In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Show every column when a DataFrame is displayed. Uses the documented
# top-level pandas API instead of the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
nlp = spacy.load('en_core_web_sm')

from textblob import Word, TextBlob

# The 'stopwords' and 'wordnet' corpora are already downloaded earlier in this
# cell, so the repeated nltk.download calls are not needed here.
stop_words = stopwords.words('english')  # English stop-word list from NLTK


In [None]:
# Load the vaccination tweets and preview the first rows
tweets_csv = '../input/pfizer-vaccine-tweets/vaccination_tweets.csv'
df = pd.read_csv(tweets_csv)
df.head()

In [None]:
# (rows, columns) of the tweet table as loaded
df.shape

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Drop columns that are not used in the analysis.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
df = df.drop(columns=['id', 'user_description', 'hashtags'], errors='ignore')
df.head()

In [None]:
# Create a function to clean the tweets
def cleanTxt(text):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in this order: @mentions, '#' characters (the hashtag word itself
    is kept), the retweet marker 'RT' followed by whitespace, and http/https
    links.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the noise removed. Whitespace around the removed
        pieces is left as it was.
    """
    # Handles consist of letters, digits and underscores. The previous pattern
    # contained a mis-encoded dash inside the character class, so only the
    # digits 0 and 9 matched and handles such as '@user123' were cut short.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Removing @mentions
    text = re.sub(r'#', '', text)  # Removing '#' hash tag
    # \b anchors 'RT' at a word start so words ending in 'RT' (e.g. 'ALERT ')
    # are not truncated.
    text = re.sub(r'\bRT\s+', '', text)  # Removing RT
    text = re.sub(r'https?://\S+', '', text)  # Removing hyperlink

    return text

# Run the cleaner over every tweet, overwriting the raw text
df['text'] = df['text'].map(cleanTxt)

# Preview the cleaned tweets
df.head()



In [None]:
# Create a function to get the subjectivity
def getSubjectivity(text):
    """Return the TextBlob subjectivity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Create a function to get the polarity
def getPolarity(text):
    """Return the TextBlob polarity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


In [None]:
# Score every tweet and store the results in the 'Subjectivity' and 'Polarity' columns
tweet_text = df['text']
df['Subjectivity'] = tweet_text.apply(getSubjectivity)
df['Polarity'] = tweet_text.apply(getPolarity)

df



In [None]:
#Segregating tweets to positive and negative
def getAnalysis(score):
    """Map a polarity score to a label.

    Scores below zero are 'Negative', a score equal to zero is 'Neutral',
    and anything else is 'Positive'.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

# Label every tweet from its polarity score
df['Analysis'] = df['Polarity'].map(getAnalysis)

# Display the labelled dataframe
df


# Word Cloud

In [None]:
# Word cloud built from the text of every tweet
from wordcloud import WordCloud

allWords = ' '.join(df['text'])
cloud_builder = WordCloud(width=500, height=300, random_state=21, max_font_size=110)
wordCloud = cloud_builder.generate(allWords)  # generate() returns the same WordCloud object

plt.figure(figsize=(18, 18))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()


# Sentiment Distribution plot

In [None]:
# Scatter plot of polarity (x) against subjectivity (y) for every tweet.
# The columns are passed to a single scatter call: this avoids creating one
# matplotlib artist per row and does not depend on the frame having a
# contiguous 0..n-1 index, as the previous df[col][i] loop did.
plt.figure(figsize=(15, 10))
plt.scatter(df["Polarity"], df["Subjectivity"], color='Red')
plt.title('Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()


# Sentiment Distribution

In [None]:
# Sentiment label counts for all tweets: printed sorted by label, plotted as a pie chart
counts = df['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


In [None]:
# Missing values per column before filling the categorical columns
df.isnull().sum()

In [None]:
# Replace missing values in categorical (object-dtype) columns with a new label.
# features_na lists the object-dtype columns that contain at least one NaN.
# (The previous `> 1` test skipped columns with exactly one missing value.)
features_na = [col for col in df.columns if df[col].isnull().any() and df[col].dtypes == 'O']
def replace(df, features_na):
    """Return a copy of `df` with NaNs in the `features_na` columns set to 'Missing'.

    The input frame is not modified.
    """
    filled = df.copy()
    patched_columns = filled[features_na].fillna('Missing')  # NaN -> 'Missing'
    filled[features_na] = patched_columns
    return filled

df = replace(df, features_na)

df[features_na].isnull().sum() # remaining NaN count in each filled column; every value should now be 0

  


In [None]:

# The 30 most frequent user_location values
location_counts = df['user_location'].value_counts()
location_counts.sort_values(ascending=False).head(30)


In [None]:
# Missing values per column after the categorical fill
df.isnull().sum()

In [None]:
# Drop every row that still has a missing value in any column
df = df.dropna()

In [None]:
# Confirm that no missing values remain
df.isnull().sum()

In [None]:
# Display the frame after the rows with missing values were dropped
df

# Verified vs Unverified user distribution

In [None]:
# Counts of each user_verified value: printed sorted by value, plotted as a pie chart
counts = df['user_verified'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df['user_verified'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


In [None]:
# Rows whose user_verified value equals True
verified_mask = df['user_verified'] == True
df_verified = df[verified_mask]

df_verified

In [None]:
# Rows whose user_verified value equals False
unverified_mask = df['user_verified'] == False
df_unverified = df[unverified_mask]

df_unverified

# Word Cloud for verified accounts

In [None]:
# Word cloud built from the tweets in df_verified
from wordcloud import WordCloud

allWords = ' '.join(df_verified['text'])
cloud_builder = WordCloud(width=500, height=300, random_state=21, max_font_size=110)
wordCloud = cloud_builder.generate(allWords)  # generate() returns the same WordCloud object

plt.figure(figsize=(20, 20))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()


# Word Cloud for unverified accounts

In [None]:
# Word cloud built from the tweets in df_unverified
from wordcloud import WordCloud

allWords = ' '.join(df_unverified['text'])
cloud_builder = WordCloud(width=500, height=300, random_state=21, max_font_size=110)
wordCloud = cloud_builder.generate(allWords)  # generate() returns the same WordCloud object

plt.figure(figsize=(20, 20))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()


# Sentiment Distribution for verified accounts

In [None]:
# Sentiment label counts for df_verified: printed sorted by label, plotted as a pie chart
counts = df_verified['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_verified['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


# Sentiment Distribution for unverified accounts

In [None]:
# Sentiment label counts for df_unverified: printed sorted by label, plotted as a pie chart
counts = df_unverified['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_unverified['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


In [None]:
# Tweets labelled 'Positive'
positive_mask = df['Analysis'] == 'Positive'
df_positive = df[positive_mask]

df_positive

In [None]:
# Tweets labelled 'Negative'
negative_mask = df['Analysis'] == 'Negative'
df_neg = df[negative_mask]

df_neg

In [None]:
# Tweets labelled 'Neutral'
neutral_mask = df['Analysis'] == 'Neutral'
df_neu = df[neutral_mask]

df_neu

# Word Cloud for Positive Tweets

In [None]:
# Word cloud built from the tweets in df_positive
from wordcloud import WordCloud

allWords = ' '.join(df_positive['text'])
cloud_builder = WordCloud(width=500, height=300, random_state=21, max_font_size=110)
wordCloud = cloud_builder.generate(allWords)  # generate() returns the same WordCloud object

plt.figure(figsize=(20, 20))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()


# Word Cloud for Negative Tweets

In [None]:
# Word cloud built from the tweets in df_neg
from wordcloud import WordCloud

allWords = ' '.join(df_neg['text'])
cloud_builder = WordCloud(width=500, height=300, random_state=21, max_font_size=110)
wordCloud = cloud_builder.generate(allWords)  # generate() returns the same WordCloud object

plt.figure(figsize=(20, 20))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()


# Word Cloud for Neutral Tweets

In [None]:
# Word cloud built from the tweets in df_neu
from wordcloud import WordCloud

allWords = ' '.join(df_neu['text'])
cloud_builder = WordCloud(width=500, height=300, random_state=21, max_font_size=110)
wordCloud = cloud_builder.generate(allWords)  # generate() returns the same WordCloud object

plt.figure(figsize=(20, 20))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()


In [None]:
# Distinct values present in the is_retweet column
df['is_retweet'].unique()

**Dropping is_retweet as all values are false**

In [None]:
# Drop the is_retweet column.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['is_retweet'], errors='ignore')

df

In [None]:
# Number of tweets per value of the source column
df['source'].value_counts()

# iPhone vs webApp vs android (Nothing conclusive)

In [None]:
# One subset per posting client, selected by exact match on the source column
tweet_source = df['source']
df_iPhone = df[tweet_source == 'Twitter for iPhone']
df_webApp = df[tweet_source == 'Twitter Web App']
df_android = df[tweet_source == 'Twitter for Android']



# iPhone

In [None]:
# Sentiment label counts for df_iPhone: printed sorted by label, plotted as a pie chart
counts = df_iPhone['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_iPhone['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


# Web App

In [None]:
# Sentiment label counts for df_webApp: printed sorted by label, plotted as a pie chart
counts = df_webApp['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_webApp['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


# Android

In [None]:
# Sentiment label counts for df_android: printed sorted by label, plotted as a pie chart
counts = df_android['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_android['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


In [None]:
# Preview the first rows of the current frame
df.head()

In [None]:
# Summary statistics for df_unverified; the follower-count split in the next cell is based on this output
df_unverified.describe()

In [None]:
# Split unverified accounts at the 75th percentile of follower count.
# The cut-off is computed from the data instead of being copied by hand from
# the describe() output, so it stays correct if the data set changes.
unverified_followers_q3 = df_unverified['user_followers'].quantile(0.75)
df_unverified_1 = df_unverified[df_unverified['user_followers'] < unverified_followers_q3]
df_unverified_2 = df_unverified[df_unverified['user_followers'] >= unverified_followers_q3]

In [None]:
# (rows, columns) of the lower-follower unverified subset
df_unverified_1.shape

In [None]:
# (rows, columns) of the higher-follower unverified subset
df_unverified_2.shape

# Sentiment distribution of unverified users with fewer followers (< 1360, i.e. below the 75th percentile)

In [None]:
# Sentiment label counts for df_unverified_1: printed sorted by label, plotted as a pie chart
counts = df_unverified_1['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_unverified_1['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


# Sentiment distribution of unverified users with more followers (≥ 1360, i.e. at or above the 75th percentile)

In [None]:
# Sentiment label counts for df_unverified_2: printed sorted by label, plotted as a pie chart
counts = df_unverified_2['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_unverified_2['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


In [None]:
# Summary statistics for df_verified; the follower-count split in the next cell is based on this output
df_verified.describe()

In [None]:
# Split verified accounts at the 75th percentile of follower count.
# The cut-off is computed from the data instead of using the rounded value
# (4.611270e+05) copied by hand from the describe() output.
verified_followers_q3 = df_verified['user_followers'].quantile(0.75)
df_verified_1 = df_verified[df_verified['user_followers'] < verified_followers_q3]
df_verified_2 = df_verified[df_verified['user_followers'] >= verified_followers_q3]

# Sentiment distribution of verified users with fewer followers (below the 75th percentile)

In [None]:
# Sentiment label counts for df_verified_1: printed sorted by label, plotted as a pie chart
counts = df_verified_1['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_verified_1['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


# Sentiment distribution of verified users with more followers (at or above the 75th percentile)

In [None]:
# Sentiment label counts for df_verified_2: printed sorted by label, plotted as a pie chart
counts = df_verified_2['Analysis'].value_counts().sort_index()
print(counts)

# The pie uses the frequency-ordered counts for both wedge sizes and labels
slice_sizes = df_verified_2['Analysis'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(slice_sizes, labels=slice_sizes.index, autopct="%.1f%%")

plt.legend()
plt.show()


# Sentiment Distribution based on location

In [None]:
# Per-sentiment subsets restricted to tweets with a known location, i.e.
# excluding rows whose user_location holds the 'Missing' placeholder.
# (The earlier .copy() assignments were overwritten immediately and had no
# effect, so they are removed; boolean filtering already returns new frames.)
df_positive_1 = df_positive[df_positive['user_location'] != 'Missing']
df_negative_1 = df_neg[df_neg['user_location'] != 'Missing']
df_neutral_1 = df_neu[df_neu['user_location'] != 'Missing']



In [None]:
# (rows, columns) of the positive tweets that have a known location
df_positive_1.shape

In [None]:
# (rows, columns) of the negative tweets that have a known location
df_negative_1.shape

In [None]:
# (rows, columns) of the neutral tweets that have a known location
df_neutral_1.shape

# Positive Sentiment Distribution based on location 

In [None]:

# Horizontal bar chart of the 20 locations with the most positive tweets.
# The x-axis counts tweets; the old "Number of outlets" label was left over
# from an unrelated notebook.
plt.figure(figsize=(10, 7))
top_locations = df_positive_1['user_location'].value_counts().head(20)  # top 20
sns.barplot(x=top_locations.values, y=top_locations.index)
plt.title("Location (Positive)")
plt.xlabel("Number of tweets")
plt.ylabel("User location")
plt.show()


# Negative Sentiment Distribution based on location 

In [None]:

# Horizontal bar chart of the 20 locations with the most negative tweets.
# The x-axis counts tweets; the old "Number of outlets" label was left over
# from an unrelated notebook.
plt.figure(figsize=(10, 7))
top_locations = df_negative_1['user_location'].value_counts().head(20)  # top 20
sns.barplot(x=top_locations.values, y=top_locations.index)
plt.title("Location (Negative)")
plt.xlabel("Number of tweets")
plt.ylabel("User location")
plt.show()


# Neutral Sentiment Distribution based on location 

In [None]:

# Horizontal bar chart of the 20 locations with the most neutral tweets.
# The x-axis counts tweets; the old "Number of outlets" label was left over
# from an unrelated notebook.
plt.figure(figsize=(10, 7))
top_locations = df_neutral_1['user_location'].value_counts().head(20)  # top 20
sns.barplot(x=top_locations.values, y=top_locations.index)
plt.title("Location (Neutral)")
plt.xlabel("Number of tweets")
plt.ylabel("User location")
plt.show()


In [None]:
# Correlation heatmap of the numeric and boolean columns.
# The frame also holds string columns (tweet text, location, source, ...);
# pandas >= 2.0 no longer drops those silently in DataFrame.corr() and raises
# instead, so the numeric/boolean columns are selected explicitly first.
numeric_columns = df.select_dtypes(include=['number', 'bool'])
correlations = numeric_columns.corr()

f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlations, annot=True, ax=ax)  # draw on the axes created above
ax.set_title('Correlation between numeric columns')
plt.show()
