In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt

In [2]:
sample_string = "This is a pretty good movie"

#Punkt is a tokenizer present in NLTK
nltk.download('punkt')
tokens = nltk.tokenize.word_tokenize(sample_string)
print(tokens)

#We can also manually tokenize
split_words = [words for words in sample_string.split()]
print(split_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['This', 'is', 'a', 'pretty', 'good', 'movie']
['This', 'is', 'a', 'pretty', 'good', 'movie']


In [3]:
#A corpus is a large and structured set of machine-readable texts that have been produced in a natural communicative setting.
#Stopwords are words that dont add much meaning to the sentence, that is, if they are removed then the sentence still manages
#pass on most if not all of its meaning
from nltk.corpus import stopwords
nltk.download('stopwords')

#Creating an array of stopwords present in the English language
stopwords = np.array(stopwords.words('english'))
print(stopwords)
print(stopwords.shape)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i' 'me' 'my' 'myself' 'we' 'our' 'ours' 'ourselves' 'you' "you're"
 "you've" "you'll" "you'd" 'your' 'yours' 'yourself' 'yourselves' 'he'
 'him' 'his' 'himself' 'she' "she's" 'her' 'hers' 'herself' 'it' "it's"
 'its' 'itself' 'they' 'them' 'their' 'theirs' 'themselves' 'what' 'which'
 'who' 'whom' 'this' 'that' "that'll" 'these' 'those' 'am' 'is' 'are'
 'was' 'were' 'be' 'been' 'being' 'have' 'has' 'had' 'having' 'do' 'does'
 'did' 'doing' 'a' 'an' 'the' 'and' 'but' 'if' 'or' 'because' 'as' 'until'
 'while' 'of' 'at' 'by' 'for' 'with' 'about' 'against' 'between' 'into'
 'through' 'during' 'before' 'after' 'above' 'below' 'to' 'from' 'up'
 'down' 'in' 'out' 'on' 'off' 'over' 'under' 'again' 'further' 'then'
 'once' 'here' 'there' 'when' 'where' 'why' 'how' 'all' 'any' 'both'
 'each' 'few' 'more' 'most' 'other' 'some' 'such' 'no' 'nor' 'not' 'only'
 'own' 'same' 'so' 'than' 't

In [4]:
#Separating the stopwords from the useful words
useful_words = [x for x in split_words if x not in stopwords]
print(useful_words)

['This', 'pretty', 'good', 'movie']


In [5]:
#Stemming is the process of keeping the root words from a list of similar words
#It may or may not retain any meaning
#For example, 'ban', 'banana', 'bankruptcy', 'banner' all start with 'ban-' but they are not related to each other at all
from nltk.stem import PorterStemmer
ps = PorterStemmer()

test_set = ['ban', 'banana', 'banner', 'bankruptcy']
stemmed = [ps.stem(words) for words in test_set]

#Stemming changes all words to lower case
print(stemmed)

['ban', 'banana', 'banner', 'bankruptci']


In [6]:
#Importing tweet data
data = pd.read_csv('https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv')
data.info()
data.head(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [7]:
import re
def remove_patterns(pattern, text):
  #Finds all matching strings that match the regex pattern provided
  occur = re.findall(pattern, text)
  #Substitutes every such pattern that was found with a empty string
  #and returns the string after every removal
  for match in occur:
    text = re.sub(match, "", text)
  return text

In [8]:
#Adds a new column containing the cleaned data to the original dataframe
data['cleaned'] = [remove_patterns('@[\w]*', sentence) for sentence in data['tweet']]
data.info()
data.head(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       31962 non-null  int64 
 1   label    31962 non-null  int64 
 2   tweet    31962 non-null  object
 3   cleaned  31962 non-null  object
dtypes: int64(2), object(2)
memory usage: 998.9+ KB


Unnamed: 0,id,label,tweet,cleaned
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow dannyâ¦
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here ! i'm it's so #gr8 !


In [9]:
#Further processing, ^ implies 'except' when within regex sets, so it removes everything
#except 'a-z', 'A-Z' and '#' which constitutes hashtags together with the alphabets
data['cleaned'] = data['cleaned'].str.replace("[^a-zA-Z#]", " ")
data.head(15)

Unnamed: 0,id,label,tweet,cleaned
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...,huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams ...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here i m it s so #gr


In [10]:
#Here, lambda expressions basically work on every item, x is the item,
#and then the operation is applied on x

#Tokenizing
data['cleaned'] = data['cleaned'].apply(lambda x : x.split())

#Removing stopwords
data['cleaned'] = data['cleaned'].apply(lambda x : [words for words in x if words not in stopwords])

#Stemming
data['cleaned'] = data['cleaned'].apply(lambda x : [ps.stem(words) for words in x])

#Converting each item back to sentence
data['cleaned'] = data['cleaned'].apply(lambda x : ' '.join(x))

#Showing current data
data.head(15)

Unnamed: 0,id,label,tweet,cleaned
0,1,0,@user when a father is dysfunctional and is s...,father dysfunct selfish drag kid dysfunct #run
1,2,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit use caus offer wheelchair v...
2,3,0,bihday your majesty,bihday majesti
3,4,0,#model i love u take with u all the time in ...,#model love u take u time ur
4,5,0,factsguide: society now #motivation,factsguid societi #motiv
5,6,0,[2/2] huge fan fare and big talking before the...,huge fan fare big talk leav chao pay disput ge...
6,7,0,@user camping tomorrow @user @user @user @use...,camp tomorrow danni
7,8,0,the next school year is the year for exams.ð...,next school year year exam think #school #exam...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,love land #allin #cav #champion #cleveland #cl...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcom #gr


In [11]:
#Count Vectorizer basically performs a count of every word present as a whole,
#It maps each word to a label and then counts the number of times the labeled words appear
#In each sentence, throughout the document as a whole
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_df=0.8, min_df=2, max_features=1000, stop_words='english')
vectorized_data = vectorizer.fit_transform(data['cleaned'])
vectorized_data = pd.DataFrame(vectorized_data.todense())
vectorized_data.info()
vectorized_data.head(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Columns: 1000 entries, 0 to 999
dtypes: int64(1000)
memory usage: 243.9 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
#Test train splitting of data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(vectorized_data, data['label'], test_size = 0.3)

In [13]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
LRClf = LogisticRegression(solver = 'lbfgs').fit(X_train, Y_train)
probab_est = LRClf.predict_proba(X_test)
print(probab_est.shape)
#accuracy_score returns the accuracy by comparing Y_predicted with Y_true
predictions = LRClf.predict(X_test)
accuracy = accuracy_score(predictions, Y_test)
print(accuracy)
#LogisticRegression.score() method does the exact same thing except in one line
print(LRClf.score(X_test, Y_test))

(9589, 2)
0.9478569193867974
0.9478569193867974


In [14]:
#f1_score is used when classes are largely unbalanced, in this case number of happy tweets
#largely outnumbers the number of sad tweets
from sklearn.metrics import f1_score
print(np.unique(predictions, return_counts=True))
f1_accuracy = f1_score(predictions, Y_test)
print(f1_accuracy)

(array([0, 1]), array([9262,  327]))
0.50199203187251


In [15]:
#SVM
from sklearn import svm
SVMClf = svm.SVC().fit(X_train, Y_train)
predictions = SVMClf.predict(X_test)
accuracy = accuracy_score(predictions, Y_test)
print(accuracy)
#f1 score from SVM
f1_accuracy = f1_score(predictions, Y_test)
print(f1_accuracy)

0.9504640734174575
0.5046923879040668


In [16]:
#TfidfVectorizer provides the same results as CountVectorizer followed by TfidfTranformer
#Tf stands for Term-frequency
#Idf stands for Inverse document frequency
#The picture at https://miro.medium.com/max/1050/1*qQgnyPLDIkUmeZKN2_ZWbQ.png explains the concept in a nutshell
#In TfidfVectorizer we consider overall document weightage of a word. It helps us in dealing with most frequent words.
#Using it we can penalize them. TfidfVectorizer weights the word counts by a measure of how often they appear in the documents.
#If a word appears in every document, it is not so unique hence its actual affect on the meaning must be low, but not zero

#Using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvectorizer = TfidfVectorizer(max_df=0.8, min_df=2, max_features=1000, stop_words='english')
tfidfvectorized_data = tfidfvectorizer.fit_transform(data['cleaned'])
tfidfvectorized_data = pd.DataFrame(tfidfvectorized_data.todense())
X_train, X_test, Y_train, Y_test = train_test_split(tfidfvectorized_data, data['label'], test_size = 0.3)

#SVM using default kernel
TfidfSVMClf = svm.SVC().fit(X_train, Y_train)
predictions = TfidfSVMClf.predict(X_test)
accuracy = accuracy_score(predictions, Y_test)
print(accuracy)
#f1 score from SVM
f1_accuracy = f1_score(predictions, Y_test)
print(f1_accuracy)

#SVM with linear kernel
LinKernSVMClf = svm.SVC(kernel='linear').fit(X_train, Y_train)
predictions = LinKernSVMClf.predict(X_test)
accuracy = accuracy_score(predictions, Y_test)
print(accuracy)
#f1 score from SVM
f1_accuracy = f1_score(predictions, Y_test)
print(f1_accuracy)

0.9557826676400042
0.5583333333333333
0.95213265199708
0.5311542390194075
