# Sentiment Analysis

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Data Preprocessing

# Lemmatization
It turns the words to its original form. It is a very common thing we want to do, because we do not want to confuse our model that run and running are different.

In [3]:
#running, ran --> run
import spacy

nlp = spacy.load("en_core_web_sm")

In [4]:
doc = nlp("run ran running")

In [5]:
for token in doc:
    print(token.text, token.lemma_)

run run
ran run
running run


In [6]:
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = list(STOP_WORDS)

In [7]:
doc = nlp("Sam is going to Disney Land to eat with his best friend Peter.")

In [8]:
clean_tokens = []

for token in doc:
    if token.text not in stopwords:
        clean_tokens.append(token.text)

clean_tokens

['Sam', 'going', 'Disney', 'Land', 'eat', 'best', 'friend', 'Peter', '.']

# Punctuations

In [9]:
doc = nlp("Lameesa, the NLP instructor, $ / @ # Omdena !!!???? likes to eat pasta.")

In [10]:
token_no_punct = []

for token in doc:
    if token.pos_ != "PUNCT" and token.pos_ != "SYM":
        token_no_punct.append(token.text)

token_no_punct

['Lameesa',
 'the',
 'NLP',
 'instructor',
 '@',
 '#',
 'Omdena',
 'likes',
 'to',
 'eat',
 'pasta']

# Lowercasing and Unnecessary Spaces

In [11]:
stripped_lowercase_tokens = []

for token in doc:
    stripped_lowercase_tokens.append(token.text.lower().strip())

stripped_lowercase_tokens

['lameesa',
 ',',
 'the',
 'nlp',
 'instructor',
 ',',
 '$',
 '/',
 '@',
 '#',
 'omdena',
 '!',
 '!',
 '!',
 '?',
 '?',
 '?',
 '?',
 'likes',
 'to',
 'eat',
 'pasta',
 '.']

# Combining everything

In [12]:
def preprocessing(sentence):
    stopwords = list(STOP_WORDS)
    doc = nlp(sentence)
    clean_tokens = []

    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != "SYM" and \
            token.pos_ != "SPACE":
                clean_tokens.append(token.text)

    return clean_tokens

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Loading Data

In [14]:
data_yelp   = pd.read_csv('/content/yelp_labelled.txt',   sep='\t', header=None, names=['Review', 'Sentiment'])
data_amazon = pd.read_csv('/content/amazon_labelled.txt', sep='\t', header=None, names=['Review', 'Sentiment'])
data_imdb   = pd.read_csv('/content/imdb_labelled.txt',   sep='\t', header=None, names=['Review', 'Sentiment'])

In [15]:
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [16]:
data_yelp.shape, data_amazon.shape, data_imdb.shape

((1000, 2), (1000, 2), (748, 2))

# Exploratory Data Analysis

In [17]:
data = pd.concat([data_yelp, data_amazon, data_imdb], ignore_index=True)

In [18]:
data.shape

(2748, 2)

In [19]:
#check imbalances
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [20]:
data.isna().sum()

Review       0
Sentiment    0
dtype: int64

# Countvectorizer

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer(tokenizer=preprocessing)

#examples
corpus = [
    'Deep learning is fun',
    'Spacy is cool and fun',
    'please hashtag #spacy'
]

result = countvec.fit_transform(corpus)

print(countvec.get_feature_names_out()) #list of tokens

print(result.toarray())
#rows are sentences
#columns are unique words

['#' 'cool' 'deep' 'fun' 'hashtag' 'learning' 'spacy']
[[0 0 1 1 0 1 0]
 [0 1 0 1 0 0 1]
 [1 0 0 0 1 0 1]]




In [22]:
import numpy as np

neg_cond = data.Sentiment == 0
pos_cond = data.Sentiment == 1

neg_df   = data[neg_cond]
pos_df   = data[pos_cond]

In [23]:
neg_result = countvec.fit_transform(neg_df.Review)
neg_vocabs = countvec.get_feature_names_out()

pos_result = countvec.fit_transform(pos_df.Review)
pos_vocabs = countvec.get_feature_names_out()

In [24]:
neg_result.shape, pos_result.shape

((1362, 3158), (1386, 3115))

In [25]:
neg_counts = np.sum(neg_result, axis = 0)
pos_counts = np.sum(pos_result, axis = 0)

In [26]:
df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0, ascending=False)

In [27]:
df.head(10)

Unnamed: 0,0
1,103
bad,96
movie,95
0,92
phone,78
film,72
like,67
food,66
time,62
good,57


# TfidfVectorizer
In NLP, generally countvectorizer is not used because it makes very frequent words a prominent feature. We want to normalize it, whereas normalized(countvectorizer) ==> tfidvectorizer.

In [28]:
tfidvec = TfidfVectorizer(tokenizer=preprocessing)

neg_result = tfidvec.fit_transform(neg_df.Review)
neg_vocabs = tfidvec.get_feature_names_out()
pos_result = tfidvec.fit_transform(pos_df.Review)
pos_vocabs = tfidvec.get_feature_names_out()

neg_counts = np.sum(neg_result, axis = 0)
pos_counts = np.sum(pos_result, axis = 0)

neg_count_df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0, ascending=False)
pos_count_df = pd.DataFrame(pos_counts, columns = pos_vocabs).T.sort_values(by=0, ascending=False)



In [29]:
neg_count_df.head(10)

Unnamed: 0,0
bad,27.35239
phone,21.732597
service,21.427283
food,20.67629
movie,18.446768
time,18.194969
place,17.811345
good,16.930496
like,16.753157
waste,15.143746


In [30]:
pos_count_df.head(10)

Unnamed: 0,0
great,56.691299
good,47.769436
phone,30.258919
food,22.290479
place,22.060052
service,21.79469
works,21.240647
film,20.164936
movie,19.952642
excellent,19.037113


# Modeling and Training

In [31]:
from sklearn.svm import LinearSVC

classifer = LinearSVC()
tfidvec   = TfidfVectorizer()

X = data["Review"]
y = data["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=333)
print(X_test.shape)

(825,)


In [32]:
clf = Pipeline([('tfidf', tfidvec), ('clf', classifer)])

In [33]:
clf.fit(X_train, y_train)

In [34]:
yhat = clf.predict(X_test)

In [35]:
print(classification_report(yhat, y_test))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83       418
           1       0.82      0.85      0.83       407

    accuracy                           0.83       825
   macro avg       0.83      0.83      0.83       825
weighted avg       0.83      0.83      0.83       825



In [36]:
confusion_matrix(yhat, y_test)

array([[342,  76],
       [ 62, 345]])

# Real-world Inferencing

In [37]:
clf.predict(['Mastering NLP is pretty difficult.'])

array([0])