In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Importing the Libraries**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## **Data Wrangling**

In [None]:
# Load the raw tweet export into a DataFrame; the following cells inspect it as `data_set`.
data_set = pd.read_csv('/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv')

In [None]:
# Preview the first five rows of the raw data.
data_set.head(5)

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
data_set.describe(include = 'all')

In [None]:
# Print the column dtypes and non-null counts of the raw data.
data_set.info()

In [None]:
# Number of missing values in each column.
data_set.isnull().sum()

In [None]:
# (rows, columns) of the raw data.
data_set.shape

In [None]:
# Working copy: keep only the columns used in the analysis and add the
# character length of each tweet as `text_length`.
dataset = data_set[[
    'user_created', 'user_followers', 'user_friends', 'user_favourites',
    'user_verified', 'date', 'text', 'source', 'retweets', 'favorites',
]].assign(text_length = data_set['text'].map(len))
dataset.head(5)

## **Exploratory Data Analysis**

### Source vs. Count

In [None]:
# Horizontal bar chart of the number of tweets per `source` value.
# The column and the frame now agree: previously the column was taken from
# `dataset` while the raw `data_set` frame was passed as `data`.
plt.figure(figsize = (12, 8))
sns.countplot(data = dataset, y = 'source')
plt.title("Source vs Count")
plt.show()

### Tweets per day

In [None]:
import datetime
# Reduce the timestamps to calendar dates so tweets can be counted per day.
dataset['date'] = pd.to_datetime(dataset['date']).dt.date

# Order the rows chronologically. The original index labels are kept, so from
# here on positional order and index labels no longer coincide.
dataset = dataset.sort_values('date')

plt.figure(figsize = (12, 8))
sns.countplot(data = dataset, y = 'date')
plt.title("Tweets per day")
plt.show()

### Heatmap (Correlations)

In [None]:
# Pairwise correlations between the selected user/engagement columns,
# drawn as an annotated heatmap.
plt.figure(figsize = (12, 8))
dataset_cor = dataset.loc[
    :, ['user_followers', 'user_friends', 'user_favourites', 'retweets', 'favorites', 'text_length']
].corr()
axes = sns.heatmap(
    data = dataset_cor,
    annot = True,
    cmap = 'coolwarm',
    linewidths = 1,
    linecolor = 'white',
)
axes.set_title('Heatmap')
plt.show()

### Top mentions (@)

In [None]:
# Collect every @mention (letters, digits, underscore) found in the tweets,
# visiting rows by index label 0 .. len(dataset) - 1.
mention = [
    handle
    for label in range(len(dataset))
    for handle in re.findall('@[a-zA-Z0-9_]+', dataset['text'][label])
]

mention

In [None]:
# Occurrences of each @mention; value_counts() puts the handles in the index,
# ordered from most to least frequent. Building from a Series (rather than
# DataFrame column 0) also works when no mentions were found.
df = pd.Series(mention, dtype = 'object').value_counts()

from nltk.probability import FreqDist

# Build the frequency distribution directly from the list of mentions.
# The earlier loop iterated over `df`, which yields the counts rather than
# the handles, and used `=+1`, which assigns 1 instead of incrementing.
freqdist = FreqDist(mention)

freqdist

In [None]:
# Plot the 20 most frequent mentions (`df` comes from value_counts(), so it is
# already ordered by count). Explicit positional slicing replaces the
# one-element tuple index `[:20,]`.
df = df.iloc[:20]
plt.figure(figsize = (12, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.barplot(x = df.values, y = df.index, alpha = 0.8)
plt.title("Top @(mention)")
plt.ylabel("Account Name")
plt.xlabel("Count")
plt.show()

### Top Hashtags used (#)

In [None]:
# Collect every #hashtag (letters, digits, underscore) found in the tweets,
# visiting rows by index label 0 .. len(dataset) - 1.
hashtags = [
    tag
    for label in range(len(dataset))
    for tag in re.findall('#[a-zA-Z0-9_]+', dataset['text'][label])
]

hashtags

In [None]:
# Occurrences of each hashtag; value_counts() puts the hashtags in the index,
# ordered from most to least frequent. Building from a Series (rather than
# DataFrame column 0) also works when no hashtags were found.
df1 = pd.Series(hashtags, dtype = 'object').value_counts()

from nltk.probability import FreqDist

# Build the frequency distribution directly from the list of hashtags.
# The earlier loop iterated over `df1`, which yields the counts rather than
# the hashtags, so the distribution was keyed by count values.
freqdist1 = FreqDist(hashtags)

freqdist1

In [None]:
# Plot the 20 most frequent hashtags (`df1` comes from value_counts(), so it is
# already ordered by count). Explicit positional slicing replaces the
# one-element tuple index `[:20, ]`.
df1 = df1.iloc[:20]
plt.figure(figsize = (12, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.barplot(x = df1.values, y = df1.index, alpha = 0.8)
plt.title("Top Hashtag Used")
plt.ylabel("Hashtags (#)")
plt.xlabel("Count")
plt.show()

## **Data Cleaning**

### Applying Sentiments on the tweets

TextBlob is a Python (2 and 3) library for processing textual data. It provides a simple API for common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, and translation.

In [None]:
from textblob import TextBlob

# Score every tweet with TextBlob. apply() returns a Series that shares the
# index of `dataset`, so each score stays attached to the row whose text
# produced it. The earlier loop paired a positional enumerate() counter with
# label-based `dataset[col][i] = ...` writes, which assigns scores to the
# wrong rows once the frame has been re-ordered, and it relied on chained
# assignment.
polarity = dataset['text'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)

# A polarity of exactly 0 (neutral) is labelled 'positive', as before.
dataset['sentiment'] = polarity.ge(0).map({True: 'positive', False: 'negative'})
dataset['polarity'] = polarity

In [None]:
# Preview the first five rows, now including the sentiment and polarity columns.
dataset.head(5)

In [None]:
# Number of tweets per sentiment label.
dataset['sentiment'].value_counts()

In [None]:
# Bar chart of the number of tweets per sentiment label.
plt.figure(figsize = (12, 8))
sns.countplot(data = dataset, x = 'sentiment')
plt.show()

### Tweets per day based on the Sentiments

In [None]:
# Daily tweet counts, split into one bar per sentiment label.
plt.figure(figsize = (12, 12))
sns.countplot(data = dataset, y = 'date', hue = 'sentiment')
plt.title("Tweets per Day based on the Sentiment")
plt.show()

In [None]:
# Replace the text label with indicator column(s): get_dummies with
# drop_first=True omits the column for the first category.
senti = pd.get_dummies(dataset['sentiment'], drop_first = True)
dataset = pd.concat([dataset.drop(columns = 'sentiment'), senti], axis = 1)
dataset.head(5)

In [None]:
# Two-column modelling frame: the tweet text and its 'positive' indicator.
data = dataset[['text', 'positive']].rename(columns = {'text': 'Text', 'positive': 'Sentiment'})
data.head(5)

### Creating a Corpus of Words (Clean text)

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# Load the stop-word list once and keep it as a set for fast membership
# tests; previously stopwords.words('english') was re-evaluated for every word.
english_stopwords = set(stopwords.words('english'))

corpus = []   # one cleaned, stemmed string per tweet, in index-label order
words = []    # every remaining (unstemmed) word across all tweets

for i in range(0, len(data)):
    review2 = data['Text'][i]
    # Strip doubled quotes, https links, @mentions and #hashtags, then
    # replace every remaining non-letter (including newlines) with a space.
    review2 = re.sub('""', ' ', review2)
    review2 = re.sub('https://[a-zA-Z0-9./]+', ' ', review2)
    review2 = re.sub('@[a-zA-Z0-9._]+', ' ', review2)
    review2 = re.sub('#[a-zA-Z0-9._]+', ' ', review2)
    review2 = re.sub('[^a-zA-Z]', ' ', review2)

    # Lower-case, tokenise on whitespace and drop English stop words.
    tokens = [word for word in review2.lower().split() if word not in english_stopwords]
    words.extend(tokens)

    # The corpus stores the stemmed tokens re-joined into one string.
    corpus.append(' '.join(ps.stem(word) for word in tokens))

### Top words used

In [None]:
# Occurrences of each cleaned word; value_counts() puts the words in the
# index, ordered from most to least frequent. Building from a Series (rather
# than DataFrame column 0) also works when the word list is empty.
df2 = pd.Series(words, dtype = 'object').value_counts()

from nltk.probability import FreqDist

# Build the frequency distribution directly from the word list.
# The earlier loop used `words` as its loop variable, which overwrote the
# word list with an integer (so this cell could not be re-run), and it
# iterated over the counts in `df2` rather than over the words.
freqdist2 = FreqDist(words)

freqdist2

In [None]:
# Plot the 20 most frequent words (`df2` comes from value_counts(), so it is
# already ordered by count). Explicit positional slicing replaces the
# one-element tuple index `[:20, ]`.
df2 = df2.iloc[:20]
plt.figure(figsize = (12, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.barplot(x = df2.values, y = df2.index, alpha = 0.8)
plt.title("Top Words Used", fontdict = {'fontsize' : 15})
plt.ylabel("Words")
plt.xlabel("Count")
plt.show()

In [None]:
# Put each tweet's sentiment flag next to its cleaned text. concat(axis=1)
# aligns the two pieces on their index labels.
dataset_final = pd.concat(
    [data['Sentiment'], pd.DataFrame({'Content': corpus})],
    axis = 1,
)
dataset_final

## **Applying Various Classification Models**

### Splitting the Data into Training and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the tweets for testing. A fixed random_state makes the
# split identical on every run; without it each run shuffled differently.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_final.Content,
    dataset_final.Sentiment,
    test_size = 0.25,
    random_state = 42,
)

# Copies of each piece with a fresh 0..n-1 index.
X_1 = X_train.reset_index(drop = True)
X_2 = X_test.reset_index(drop = True)

Y_1 = y_train.reset_index(drop = True)
Y_2 = y_test.reset_index(drop = True)

### Applying Tfidf Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer.
# Note: X_train / X_test are rebound here from text Series to the vectorized features.
X_train = vectorizer.fit_transform(X_1)
X_test = vectorizer.transform(X_2)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit logistic regression on the training features, predict the test set,
# then print accuracy, confusion matrix and the per-class report.
lr = LogisticRegression(max_iter = 300).fit(X_train, y_train)
y_pred_log = lr.predict(X_test)

log_accuracy = accuracy_score(y_test, y_pred_log) * 100
log_confusion = confusion_matrix(y_test, y_pred_log)

print("Accuracy of Logistic Regression is: {}%".format(log_accuracy))
print("Confusion Matrix of Logistic Regression is: \n{}".format(log_confusion))
print(classification_report(y_test, y_pred_log))

### Support Vector Classifier (SVC)

In [None]:
from sklearn.svm import SVC
# Fit an RBF-kernel support vector classifier, predict the test set, then
# print accuracy, confusion matrix and the per-class report.
svc = SVC(kernel = 'rbf').fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

svc_accuracy = accuracy_score(y_test, y_pred_svc) * 100
svc_confusion = confusion_matrix(y_test, y_pred_svc)

print("Accuracy of Support Vector Classifier is: {}%".format(svc_accuracy))
print("Confusion Matrix of Support Vector Classifier is: \n{}".format(svc_confusion))
print(classification_report(y_test, y_pred_svc))

### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit multinomial naive Bayes, predict the test set, then print accuracy,
# confusion matrix and the per-class report.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

mnb_accuracy = accuracy_score(y_test, y_pred_mnb) * 100
mnb_confusion = confusion_matrix(y_test, y_pred_mnb)

print("Accuracy of MultinomialNB is: {}%".format(mnb_accuracy))
print("Confusion Matrix of MultinomialNB is: \n{}".format(mnb_confusion))
print(classification_report(y_test, y_pred_mnb))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-criterion decision tree. random_state is fixed so that repeated
# runs on the same training data build the same tree.
dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

# Test-set accuracy, confusion matrix and per-class report.
print("Accuracy of Decision tree Classifier is: {}%".format(accuracy_score(y_test, y_pred_dt) * 100))
print("Confusion Matrix of Decision tree Classifier is: \n{}".format(confusion_matrix(y_test, y_pred_dt)))
print("{}".format(classification_report(y_test, y_pred_dt)))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 300 entropy-criterion trees. random_state is fixed so that
# repeated runs on the same training data build the same forest.
rf = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

# Test-set accuracy, confusion matrix and per-class report.
print("Accuracy of Random Forest Classifier is: {}%".format(accuracy_score(y_test, y_pred_rf) * 100))
print("Confusion Matrix of Random Forest Classifier is: \n{}".format(confusion_matrix(y_test, y_pred_rf)))
print("{}".format(classification_report(y_test, y_pred_rf)))

### SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent. random_state is
# fixed so that repeated runs on the same training data give the same model.
sgd = SGDClassifier(random_state = 42)
sgd.fit(X_train, y_train)

y_pred_sgd = sgd.predict(X_test)

# Test-set accuracy, confusion matrix and per-class report
# (the printed name previously read "Sochastic").
print("Accuracy of Stochastic Gradient Descent Classifier is: {}%".format(accuracy_score(y_test, y_pred_sgd) * 100))
print("Confusion Matrix of Stochastic Gradient Descent Classifier is: \n{}".format(confusion_matrix(y_test, y_pred_sgd)))
print("{}".format(classification_report(y_test, y_pred_sgd)))

### Top Favorited Tweets

In [None]:
# The five tweets with the highest `favorites` count, re-indexed from 0.
fav = dataset[['favorites', 'text']].sort_values('favorites', ascending = False)[:5].reset_index()
print("**Top 5 most Favourited tweets:**\n")
# Iterate over the rows actually present, so a frame with fewer than five
# tweets no longer raises a KeyError.
for j, tweet in enumerate(fav['text']):
    print(j, '.', tweet, '\n')

### Most Retweeted Tweets

In [None]:
# The five tweets with the highest `retweets` count, re-indexed from 0.
# The variable name `fav` is kept from the original cell.
fav = dataset[['retweets', 'text']].sort_values('retweets', ascending = False)[:5].reset_index()
# The header previously said "Favourited", copied from the favourites cell.
print("**Top 5 most Retweeted tweets:**\n")
# Iterate over the rows actually present, so a frame with fewer than five
# tweets no longer raises a KeyError.
for j, tweet in enumerate(fav['text']):
    print(j, '.', tweet, '\n')