#   
<span style="font-family:Arial; font-weight:Bold; font-size:2.3em; color:#00b3e5;"> NLP Project: Sentiment Analysis   

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> loading libraries  

In [1]:
import numpy  as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import re, string, unicodedata
from   bs4 import BeautifulSoup

from   textblob  import TextBlob, Word
from   wordcloud import WordCloud, STOPWORDS
import spacy

from sklearn.svm           import SVC
from sklearn.naive_bayes   import MultinomialNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model  import LogisticRegression, SGDClassifier
from sklearn.metrics       import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
import nltk   # pip install --upgrade nltk

nltk.download('punkt') 
nltk.download('wordnet')
nltk.download('stopwords') 

from nltk.corpus          import stopwords
from nltk.stem.porter     import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize        import word_tokenize, sent_tokenize
from nltk.stem            import LancasterStemmer, WordNetLemmatizer

[nltk_data] Downloading package punkt to C:\Users\EZ-
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\EZ-
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\EZ-
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> loading dataset

In [3]:
comments = pd.read_csv('sentiment_sentences.txt',delimiter='\t')
comments.sample(3)

Unnamed: 0,sentence_index,sentence
6165,6166,Certainly .
3503,3504,"The filmmakers try to balance pointed , often ..."
6406,6407,"In the end , The Weight of Water comes to rese..."


In [4]:
sentiments = pd.read_csv('sentiment_labels.txt',delimiter='\t')

sentiments.sample(3)

Unnamed: 0,phrase ids|sentiment values
25613,25613|0.5
187485,187485|0.055556
133466,133466|0.375


In [5]:
# splitting

df = sentiments.copy()
ft = 'phrase ids|sentiment values'

#phrase_id      = df[ft].str.split('|', expand=True)[0]
sentiment_value = df[ft].str.split('|', expand=True)[1]

In [6]:
data = comments[['sentence']]
data['sentiment_value'] = (sentiment_value).astype(float)
data.sample(3)

Unnamed: 0,sentence,sentiment_value
4935,Both a successful adaptation and an enjoyable ...,0.5
4624,"As averse as I usually am to feel-good , follo...",0.69444
438,This examination of aquatic life off the shore...,0.375


In [7]:
def num2sent(x):
    if   x>=0.5: return 'positive'
    elif x<0.5 : return 'negative'
    else       : return x

data['sentiment'] = data['sentiment_value'].apply(num2sent)
data.sample(3)

Unnamed: 0,sentence,sentiment_value,sentiment
10195,"High Crimes is a cinematic misdemeanor , a rou...",0.5,positive
9448,Anyone who gets chills from movies with giant ...,0.55556,positive
2510,"You emerge dazed , confused as to whether you ...",0.77778,positive


In [8]:
# correcting column names
data.columns = ['Comment','Sentiment_value','Sentiment']
data.sample(3)

Unnamed: 0,Comment,Sentiment_value,Sentiment
6157,"There 's a lot of good material here , but the...",0.5,positive
7847,Verbinski implements every hack-artist trick t...,0.58333,positive
865,At times funny and at other times candidly rev...,0.61111,positive


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> EDA

In [9]:
data.shape

(11855, 3)

In [10]:
data.describe()

Unnamed: 0,Sentiment_value
count,11855.0
mean,0.502388
std,0.172716
min,0.0
25%,0.40278
50%,0.5
75%,0.59722
max,1.0


In [11]:
data = data.dropna()
data.isnull().sum()

Comment            0
Sentiment_value    0
Sentiment          0
dtype: int64

In [12]:
data['Sentiment'].value_counts()

Sentiment
positive    7387
negative    4468
Name: count, dtype: int64

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> NLP Preprocessing

In [13]:
# Tokenization of the text
tokenizers = ToktokTokenizer()

In [14]:
# Removing the noisy text
def noiseremoval_text(text):
    soup = BeautifulSoup(text,'html.parser')
    text = soup.get_text()
    text = re.sub( '\[[^]]*\]', '', text )
    return text

In [15]:
# Apply function on review column
data['Comment'] = data['Comment'].apply(noiseremoval_text)



In [16]:
data.head()

Unnamed: 0,Comment,Sentiment_value,Sentiment
0,The Rock is destined to be the 21st Century 's...,0.5,positive
1,The gorgeously elaborate continuation of `` Th...,0.5,positive
2,Effective but too-tepid biopic,0.44444,negative
3,If you sometimes like to go to the movies to h...,0.5,positive
4,"Emerges as something rare , an issue movie tha...",0.42708,negative


In [17]:
# Stemming
def stemmer(text):
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [18]:
# Apply function on review column
data['Comment'] = data['Comment'].apply(stemmer)

In [19]:
data.head()

Unnamed: 0,Comment,Sentiment_value,Sentiment
0,the rock is destin to be the 21st centuri 's n...,0.5,positive
1,the gorgeous elabor continu of `` the lord of ...,0.5,positive
2,effect but too-tepid biopic,0.44444,negative
3,if you sometim like to go to the movi to have ...,0.5,positive
4,"emerg as someth rare , an issu movi that 's so...",0.42708,negative


In [20]:
# Setting English StopWords
stopwords  = set( nltk.corpus.stopwords.words('english') )

In [21]:
# removing the stopwords
def removing_stopwords( text, into_lower_case=False ):
    # tokenization of text
    tokenizers = ToktokTokenizer()
    # setting english stopwords
    tokens = tokenizers.tokenize( text )
    tokens = [i.strip() for i in tokens]
    if into_lower_case:
        filtokens = [i for i in tokens if i.lower() not in stopwords]
    else:
        filtokens = [i for i in tokens if i         not in stopwords]
    filtered_text = ' '.join(filtokens)
    return filtered_text

In [22]:
# Apply function on review column
data['Comment'] = data['Comment'].apply(removing_stopwords)

In [23]:
data.sample(5)

Unnamed: 0,Comment,Sentiment_value,Sentiment
2988,"witherspoon put rest valley-girl imag , ' denc...",0.43056,negative
10240,"feel aimless much run time , late film tidal w...",0.75,positive
9926,"' dull , spiritless , silli monoton : ultra-lo...",0.16667,negative
750,reinforc often forgotten fact world ' remark v...,0.69444,positive
9601,tediou parabl honesti good sportsmanship .,0.52778,positive


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Train-Valid-Test Split (Comments)

In [24]:
# Percents
n  = data.shape[0]

Train_Comment = data.Comment[          :int(.8*n) ]
Valid_Comment = data.Comment[ int(.8*n):int(.9*n) ]
Test_Comment  = data.Comment[ int(.9*n):          ]

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Label Encoding

In [25]:
# Labeling the sentiment data
label = LabelBinarizer()

In [26]:
# transformed sentiment data
Sentiment_Value = label.fit_transform(data['Sentiment'])

In [27]:
print( Sentiment_Value.shape )

(11855, 1)


In [28]:
Sentiment_Value

array([[1],
       [1],
       [0],
       ...,
       [1],
       [1],
       [1]])

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Train-Valid-Test Split (Sentiment)

In [29]:
# Percents
n  = data.shape[0]

Train_Sentiment = data.Sentiment[          :int(.8*n) ]
Valid_Sentiment = data.Sentiment[ int(.8*n):int(.9*n) ]
Test_Sentiment  = data.Sentiment[ int(.9*n):          ]

In [30]:
Test_Sentiment.sample(3)

10779    positive
11269    positive
10807    positive
Name: Sentiment, dtype: object

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Bag of Words

In [31]:
#Count vectorizer for bag of words
cv=CountVectorizer( min_df=0, max_df=1, binary=False, ngram_range=(1,3) )

In [32]:
# transformed Data
CV_Train = cv.fit_transform( Train_Comment )
CV_Valid = cv.transform(     Valid_Comment )
CV_Test  = cv.transform(     Test_Comment  )

In [33]:
print('BOW CV_Train:', CV_Train.shape)
print('BOW CV_Valid:', CV_Valid.shape)
print('BOW CV_Test: ', CV_Test.shape )

BOW CV_Train: (9484, 150206)
BOW CV_Valid: (1185, 150206)
BOW CV_Test:  (1186, 150206)


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> TF-IDF

In [34]:
# TF-IDF vectorizer  
tf = TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))

In [35]:
# transformed Data
TF_Train = tf.fit_transform( Train_Comment )
TF_Valid = tf.transform(     Valid_Comment )
TF_Test  = tf.transform(     Test_Comment  )

In [36]:
print('BOW TF_Train:', TF_Train.shape)
print('BOW TF_Valid:', TF_Valid.shape)
print('BOW TF_Test: ', TF_Test.shape )

BOW TF_Train: (9484, 150206)
BOW TF_Valid: (1185, 150206)
BOW TF_Test:  (1186, 150206)


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Logistic Regression Model

In [37]:
model = LogisticRegression( penalty='l2', max_iter=500, C=1, random_state=7 )

<span style="font-family:Arial; font-weight:Bold; font-size:1.5em; color:#00b3e5;"> Model with BOW

In [38]:
# Fitting
BOW_Model = model.fit( CV_Train, Train_Sentiment )
print(BOW_Model)

LogisticRegression(C=1, max_iter=500, random_state=7)


In [39]:
# Predicting
BOW_Pred = BOW_Model.predict( CV_Valid )
BOW_Pred

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype=object)

In [40]:
# Scoring (Accuracy)  
BOW_Score = accuracy_score( BOW_Pred, Valid_Sentiment )
print("BOW_Score :",BOW_Score)

BOW_Score : 0.6362869198312237


<span style="font-family:Arial; font-weight:Bold; font-size:1.5em; color:#00b3e5;"> Model with TF-IDF

In [41]:
# Fitting
TF_Model = model.fit( TF_Train, Train_Sentiment )
print(TF_Model)

LogisticRegression(C=1, max_iter=500, random_state=7)


In [42]:
# Predicting
TF_Pred = TF_Model.predict( TF_Valid )
TF_Pred

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype=object)

In [43]:
# Scoring (Accuracy)  
TF_Score = accuracy_score( TF_Pred, Valid_Sentiment )
print("TF_Score :",TF_Score)

TF_Score : 0.6362869198312237


#####   

#####   