In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# then print them one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

from textblob import TextBlob
from wordcloud import WordCloud

import re
import matplotlib.pyplot as plt

In [None]:
import codecs
path = '/kaggle/input/covid19-vaccine-tweets-with-sentiment-annotation/covid-19_vaccine_tweets_with_sentiment.csv'
# Built-in open() takes the encoding and error policy directly.
# errors='ignore' silently drops bytes that are not valid UTF-8 (deliberate
# best-effort decoding); newline='' hands line endings to the CSV parser
# untouched.
with open(path, 'r', encoding='utf-8', errors='ignore', newline='') as f:
    df = pd.read_csv(f)
# Preview the first two rows.
df.head(2)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
# Shared text-preprocessing helpers used by getCleanedText below.

# English stop words, held in a set for fast membership tests.
en_stop = set(stopwords.words('english'))

# Tokeniser that keeps runs of word characters (regex \w+).
tokenizer = RegexpTokenizer(r'\w+')

# Porter stemmer instance.
ps = PorterStemmer()

In [None]:

def getCleanedText(text):
    """Normalise a tweet: lower-case, tokenise, drop stop words, stem.

    Uses the notebook-level ``tokenizer``, ``en_stop`` and ``ps`` objects.

    Args:
        text: Raw tweet text. Any non-string value (for example the NaN that
            pandas uses for a missing cell, or None) is treated as empty text
            instead of raising AttributeError.

    Returns:
        str: The stemmed, non-stop-word tokens joined by single spaces. An
        empty string is returned when no token survives or when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return ""
    tokens = tokenizer.tokenize(text.lower())
    # Filter and stem in one pass; stop words are matched before stemming.
    stemmed_tokens = [ps.stem(token) for token in tokens if token not in en_stop]
    return " ".join(stemmed_tokens)

In [None]:
# Replace each tweet with its cleaned form, then preview the first rows.
# NOTE: this overwrites the column it reads from, so re-running the cell
# cleans text that has already been cleaned.
cleaned_tweets = df['tweet_text'].map(getCleanedText)
df['tweet_text'] = cleaned_tweets
df['tweet_text'].head(5)

In [None]:
# Helper: subjectivity score of a single tweet
def getSubjectivity(tweet_text):
    """Return the subjectivity component of TextBlob's sentiment for ``tweet_text``."""
    sentiment = TextBlob(tweet_text).sentiment
    return sentiment.subjectivity

In [None]:
# Helper: polarity score of a single tweet
def getPolarity(tweet_text):
    """Return the polarity component of TextBlob's sentiment for ``tweet_text``."""
    sentiment = TextBlob(tweet_text).sentiment
    return sentiment.polarity

In [None]:
# Score every tweet and store the results in two new columns.
# NOTE(review): an earlier cell has already stemmed 'tweet_text'; stemmed
# forms may match a sentiment lexicon less often than the original words.
# Confirm this is intended.
tweet_texts = df['tweet_text']
df['Subjectivity'] = tweet_texts.map(getSubjectivity)
df['Polarity'] = tweet_texts.map(getPolarity)

In [None]:
# Preview the dataframe with the new columns (first rows only rather than
# dumping the whole frame into the notebook output).
df.head()

In [None]:
# Plot the word cloud.
# Tweets are joined with a space so the last word of one tweet and the first
# word of the next stay separate tokens instead of fusing into one bogus word.
allWords = ' '.join(df['tweet_text'])
wordCloud = WordCloud(
    width=500,
    height=300,
    random_state=21,  # fixed seed so the layout is reproducible
    max_font_size=119,
).generate(allWords)
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Map a polarity score to a sentiment label
def getAnalysis(score):
    """Classify a polarity score.

    Returns 'Negative' for scores below zero, 'Neutral' for exactly zero and
    'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'
# Label every tweet from its polarity score, then preview the first rows
# (rather than dumping the whole frame into the notebook output).
df['Analysis'] = df['Polarity'].apply(getAnalysis)
df.head()

In [None]:
# Plot polarity against subjectivity.
plt.figure(figsize=(8, 6))
# A single vectorised call draws every tweet at once. This avoids creating one
# scatter artist per row and does not rely on the index being 0..n-1, as the
# per-row label lookup df[col][i] did.
plt.scatter(df['Polarity'], df['Subjectivity'], color='Blue')
plt.title('Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

In [None]:
# Share of tweets labelled 'Positive', as a percentage rounded to 2 decimals
is_positive = df['Analysis'] == 'Positive'
pos_tweets = df.loc[is_positive, 'tweet_text']
round((len(pos_tweets) / len(df)) * 100, 2)

In [None]:
# Show the value counts. A bare expression in the middle of a cell is not
# displayed, so print them explicitly.
sentiment_counts = df['Analysis'].value_counts()
print(sentiment_counts)

# Plot and visualize the counts. Title and labels are applied to the returned
# axes after plotting, so any axis label pandas sets itself cannot replace them.
ax = sentiment_counts.plot(kind='bar')
ax.set_title('Sentiment Analysis')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Counts')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# Train a random-forest classifier on bag-of-words features of the tweets.
y = df['label']

# Split the text BEFORE vectorising so the vocabulary is learned from the
# training tweets only; fitting the vectoriser on all tweets lets test-set
# information leak into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['tweet_text'], y, test_size=0.2, random_state=123
)

# Unigram + bigram counts. The matrices are kept sparse: converting them with
# .toarray() allocates a dense documents-by-vocabulary array, and the random
# forest accepts sparse input directly.
cv = CountVectorizer(ngram_range=(1, 2))
X_train = cv.fit_transform(text_train)
X_test = cv.transform(text_test)

# Features for the whole corpus under the training vocabulary, kept under the
# original names in case later cells refer to them (now sparse, not dense).
X_cv = cv.transform(df['tweet_text'])
X = X_cv

# Fixed seed so the reported accuracies are reproducible between runs.
RF = RandomForestClassifier(random_state=123)
RF_model = RF.fit(X_train, y_train)
print("Train Accuracy", RF_model.score(X_train, y_train))
print("Test Accuracy", RF_model.score(X_test, y_test))