In [1]:
import pandas as pd
df=pd.read_csv('labeledTrainData.tsv',sep = '\t')

In [2]:

df.shape

(25000, 3)

In [3]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
df.columns


Index(['id', 'sentiment', 'review'], dtype='object')

In [5]:
#descriptive statistics
df.describe()


Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


DATA CLEANING AND PREPROCESSING

In [7]:
df.isna().sum() #Finding any null values in each column


id           0
sentiment    0
review       0
dtype: int64

In [8]:
import re                 ##imports the re module, which provides regular expression matching operations for pattern matching and text manipulation.
import nltk               ##imports the nltk library,stands for Natural Language Toolkit.NLTK is a popular library for natural language processing tasks,such as tokenization,stemming,lemmatization, and more.
import spacy              ##imports the spacy library,another powerful library for natural language processing.
import string
from nltk.tokenize import word_tokenize ##imports the word_tokenize function from the nltk.tokenize module.


In [9]:
from nltk.corpus import stopwords  ##'stopwords' module allows you to access and use predefined stopword lists for various languages.
from nltk.stem import WordNetLemmatizer,PorterStemmer ##'WordNetLemmatizer' class from NLTK's stem module provides functionality for lemmatization,'PorterStemmer' is a widely used stemming algorithm that applies a set of rules to perform stemming.


In [10]:
nltk.download('stopwords')  ##download the necessary stopwords
nltk.download('wordnet')  ##it fetches the WordNet data from the NLTK repository.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
x = df[["review"]]

In [13]:
x["review"] = x["review"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["review"] = x["review"].astype(str)


In [14]:
def clean_text(text):
    def remove_mentions(text):
        # Regular expression pattern to match mentions
        mention_pattern = r'@[\w_]+'

        # Remove mentions using regular expression substitution
        cleaned_text = re.sub(mention_pattern,'', text)

        return cleaned_text

    # Remove mentions from the text
    text = remove_mentions(text)

    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Handling contractions
    contractions = {
        "n't": "not",
        "'s": "is",
        "'re": "are",
        "'ve": "have"

    }
    tokens = [contractions[token] if token in contractions else token for token in tokens]

    # Removing stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

x['cleaned_review'] = x['review'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['cleaned_review'] = x['review'].apply(clean_text)


In [15]:
x['cleaned_review'].head()

0    [stuff, going, moment, mj, started, listening,...
1    [\the, classic, war, worlds\, '', timothy, hin...
2    [film, start, manager, nicholas, bell, giving,...
3    [must, assumed, praised, film, \the, greatest,...
4    [superbly, trashy, wondrously, unpretentious, ...
Name: cleaned_review, dtype: object

In [16]:
df['cleaned_review'] = x["cleaned_review"]
df.head()

Unnamed: 0,id,sentiment,review,cleaned_review
0,5814_8,1,With all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[\the, classic, war, worlds\, '', timothy, hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"[film, start, manager, nicholas, bell, giving,..."
3,3630_4,0,It must be assumed that those who praised this...,"[must, assumed, praised, film, \the, greatest,..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, wondrously, unpretentious, ..."


In [17]:
df_new = df
df_new.drop(['review'], axis=1, inplace=True) #dropping actual column because cleaned_full_text has cleaned text
df_new.head()

Unnamed: 0,id,sentiment,cleaned_review
0,5814_8,1,"[stuff, going, moment, mj, started, listening,..."
1,2381_9,1,"[\the, classic, war, worlds\, '', timothy, hin..."
2,7759_3,0,"[film, start, manager, nicholas, bell, giving,..."
3,3630_4,0,"[must, assumed, praised, film, \the, greatest,..."
4,9495_8,1,"[superbly, trashy, wondrously, unpretentious, ..."


SENTIMENT ANALYSIS

In [18]:
from textblob import TextBlob

In [19]:
def polarity(text):
    return TextBlob(text).sentiment.polarity

df_new['polarity_score'] = df_new['cleaned_review'].apply(lambda x : polarity(str(x)))

def sentiment(x):
    if x<=0:
        return 'negative'
    else:
        return 'positive'

df_new['polarity'] = df_new['polarity_score'].map(lambda x: sentiment(x))

In [20]:
import plotly.graph_objects as go
fig = go.Figure(data=[go.Pie(labels=df_new['polarity'].value_counts().index.tolist(),
                             values=df_new['polarity'].value_counts().tolist(),
                            marker=dict(colors=['#006400','#8B0001','#add8e3']))])

fig.update_layout(title_text='Proportion of Sentiments',title_x=0.5,
                  template='plotly_white')
fig.show()



Observation:The above pie chart shows us the proportion of "positive," and "negative" reviews from customers.

Clustering the reviews into various themes such as Positive,Negative & Neutral

In [21]:
df_new

Unnamed: 0,id,sentiment,cleaned_review,polarity_score,polarity
0,5814_8,1,"[stuff, going, moment, mj, started, listening,...",-0.075058,negative
1,2381_9,1,"[\the, classic, war, worlds\, '', timothy, hin...",0.193137,positive
2,7759_3,0,"[film, start, manager, nicholas, bell, giving,...",-0.051991,negative
3,3630_4,0,"[must, assumed, praised, film, \the, greatest,...",0.155347,positive
4,9495_8,1,"[superbly, trashy, wondrously, unpretentious, ...",-0.017872,negative
...,...,...,...,...,...
24995,3453_3,0,"[seems, like, consideration, gone, imdb, revie...",-0.104861,negative
24996,5064_1,0,"[believe, made, film, completely, unnecessary,...",0.104788,positive
24997,10905_3,0,"[guy, loser, ca, get, girl, need, build, picke...",0.132576,positive
24998,10194_3,0,"[30, minute, documentary, buñuel, made, early,...",0.028750,positive


In [22]:
def get_data(df_new,senti):
    senti_df = df_new[df_new['polarity']==senti].reset_index()
    return senti_df
p_corpus = get_data(df_new,'positive')
p_corpus=pd.DataFrame(p_corpus)

n_corpus = get_data(df_new,'negative')
n_corpus=pd.DataFrame(n_corpus)

nt_corpus = get_data(df_new,'neutral')
nt_corpus=pd.DataFrame(nt_corpus)

In [23]:
p_corpus.shape,n_corpus.shape,nt_corpus.shape


((18394, 6), (6606, 6), (0, 6))

In [24]:
df_new.drop('polarity_score',axis=1,inplace=True)
df_new.rename(columns={"polarity": "Polarity_Sentiment"},inplace=True)
df_new.head()

Unnamed: 0,id,sentiment,cleaned_review,Polarity_Sentiment
0,5814_8,1,"[stuff, going, moment, mj, started, listening,...",negative
1,2381_9,1,"[\the, classic, war, worlds\, '', timothy, hin...",positive
2,7759_3,0,"[film, start, manager, nicholas, bell, giving,...",negative
3,3630_4,0,"[must, assumed, praised, film, \the, greatest,...",positive
4,9495_8,1,"[superbly, trashy, wondrously, unpretentious, ...",negative


In [25]:
df["Polarity_Sentiment"].value_counts()

positive    18394
negative     6606
Name: Polarity_Sentiment, dtype: int64

In [27]:
# Label Encoding "Sentiment" Column

df_new['Polarity_Sentiment']=df_new['Polarity_Sentiment'].replace(to_replace=['negative', 'neutral', 'positive'],value=[0,1,2])
df_new.head()

Unnamed: 0,id,sentiment,cleaned_review,Polarity_Sentiment
0,5814_8,1,"[stuff, going, moment, mj, started, listening,...",0
1,2381_9,1,"[\the, classic, war, worlds\, '', timothy, hin...",2
2,7759_3,0,"[film, start, manager, nicholas, bell, giving,...",0
3,3630_4,0,"[must, assumed, praised, film, \the, greatest,...",2
4,9495_8,1,"[superbly, trashy, wondrously, unpretentious, ...",0


MODEL BUILDING

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

In [29]:
corpus=[]
for i in df_new['cleaned_review']:
  review = i
  review=' '.join(review)                # joining the words to rearrage to form the sent without stop words
  corpus.append(review)

1. Using Bag of Words

In [31]:
# Converting the Words to Vector using Bag of words

from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,ngram_range=(1,3)) # top 2500 features are taken
X=cv.fit_transform(corpus).toarray()
y=df_new['Polarity_Sentiment']
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=1,stratify=y)

In [32]:
#Naive Bayes Model

from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()
nb.fit(x_train,y_train)
MultinomialNB()
train_pred=nb.predict(x_train)
test_pred=nb.predict(x_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.78      0.63      0.70      1629
           2       0.84      0.91      0.87      3371

    accuracy                           0.82      5000
   macro avg       0.81      0.77      0.79      5000
weighted avg       0.82      0.82      0.82      5000



In [33]:
#Random Forest Model

from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)
RandomForestClassifier()
train_pred=rfc.predict(x_train)
test_pred=rfc.predict(x_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.51      0.85      0.63       788
           2       0.97      0.84      0.90      4212

    accuracy                           0.85      5000
   macro avg       0.74      0.85      0.77      5000
weighted avg       0.89      0.85      0.86      5000



In [34]:
from sklearn.svm import LinearSVC
SVCmodel = LinearSVC()
SVCmodel.fit(x_train, y_train)
LinearSVC()
train_pred=SVCmodel.predict(x_train)
test_pred = SVCmodel.predict(x_test)
print(classification_report(test_pred,y_test))


Liblinear failed to converge, increase the number of iterations.



              precision    recall  f1-score   support

           0       0.81      0.80      0.81      1342
           2       0.93      0.93      0.93      3658

    accuracy                           0.90      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.90      0.90      0.90      5000



Conclusion: Here we get a very good Model using 'Bag Of words' but Still the Semantic meaning will be missing So, to overcome this problem we are further going for TF-IDF Technique

2: Using Term Frequency-Inverse Document Frequency

In [35]:
# Converting the Words to Vector using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(ngram_range=(1,3),max_features=3000)
X=tf.fit_transform(corpus).toarray()
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=1,stratify=y)

In [36]:
# Naive Bayes Model

nb.fit(x_train,y_train)
MultinomialNB()
train_pred=nb.predict(x_train)
test_pred=nb.predict(x_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.40      0.85      0.55       631
           2       0.97      0.82      0.89      4369

    accuracy                           0.82      5000
   macro avg       0.69      0.83      0.72      5000
weighted avg       0.90      0.82      0.85      5000



In [37]:
# Random Forest Model

rfc.fit(x_train,y_train)
RandomForestClassifier(n_estimators=[200],criterion='entropy',max_depth=[5,10,20,25])
train_pred=rfc.predict(x_train)
test_pred=rfc.predict(x_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.53      0.85      0.65       826
           2       0.97      0.85      0.91      4174

    accuracy                           0.85      5000
   macro avg       0.75      0.85      0.78      5000
weighted avg       0.89      0.85      0.86      5000



In [39]:
SVCmodel.fit(x_train, y_train)
LinearSVC()
train_pred=SVCmodel.predict(x_train)
test_pred = SVCmodel.predict(x_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83      1253
           2       0.95      0.93      0.94      3747

    accuracy                           0.91      5000
   macro avg       0.88      0.89      0.89      5000
weighted avg       0.92      0.91      0.91      5000



Conclusion: Here we get a better **accuracy** of **91%** with SVC model  using TF-IDF.so we can choose this model