### Steps for building a NLP model
    1. Import the libraries
    2. Import the data
    3. Preprocess the data
    4. Get the bag of words model
    5. Do the train test split
    6. Train the naive bayes model on the training data
    7. Evaluate on the test set

In [1]:
# Review Sentiment Classification using NLTK

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [10]:
df = pd.read_csv("../IMDB Dataset.csv").sample(768)
df.head()

Unnamed: 0,review,sentiment
21066,What a loss the passing of director Emile Ardo...,positive
34437,"The idea of young girl, who gets pregnant at t...",positive
32807,I will give it a second chance but was very di...,negative
25502,I usually steer clear of TV movies because of ...,positive
44237,"I love the premise, but it's replay value is o...",positive


In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['sentiment'] = le.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment
21066,What a loss the passing of director Emile Ardo...,1
34437,"The idea of young girl, who gets pregnant at t...",1
32807,I will give it a second chance but was very di...,0
25502,I usually steer clear of TV movies because of ...,1
44237,"I love the premise, but it's replay value is o...",1


In [12]:
df.shape

(768, 2)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 21066 to 28486
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     768 non-null    object
 1   sentiment  768 non-null    int32 
dtypes: int32(1), object(1)
memory usage: 15.0+ KB


In [14]:
df['sentiment'].value_counts()

0    403
1    365
Name: sentiment, dtype: int64

In [18]:
# Cleaning the data 
import nltk
nltk.download('wordnet')
corpus = []

for i in range(len(df)):
    tokens = word_tokenize(df.iloc[i, 0])
    lower = [l.lower() for l in tokens]
    alphas = [t for t in lower if t.isalpha()]
    stops = [t for t in alphas if t not in stopwords.words('english')]
    wordnet = WordNetLemmatizer()
    lemmatized = [wordnet.lemmatize(t) for t in stops]
    review = ' '.join(lemmatized)
    corpus.append(review)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vlekkala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
corpus[:10]

['loss passing director emile ardolino could take light script right casting editing put twinkle make shine like star particular star may brightest sky great romance go definitely one keep tuned end really want know thing going work br br script perfect cybill shepherd time needed capitalize moonlighting success new generation fortunately probably unaware many big screen major dud promising start film every bit back form widow living vicariously daughter mary stuart masterson cusp stardom would peak fried green tomato two year later may looked young role work well way story unfolds film overstep bound br br shepherd graciously allows robert downey carry much film show mature comic flair previous film point ample support ryan best role year christopher macdonald masterson natural charm pretty much coast either way making character seem like breath fresh air every br br ardolino make good use cast sex appeal way dirty dancing film quite sizzling could still watch parent happened room use

In [21]:
# Lets produce a sparse matrix of reviews with counts or tfidf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = df['sentiment'].values

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [28]:
# fit the model 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

model = MultinomialNB()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.85      0.80        74
           1       0.85      0.75      0.79        80

    accuracy                           0.80       154
   macro avg       0.80      0.80      0.80       154
weighted avg       0.80      0.80      0.80       154



In [27]:
# fit the model 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

model = LinearSVC()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.81      0.79        74
           1       0.82      0.79      0.80        80

    accuracy                           0.80       154
   macro avg       0.80      0.80      0.80       154
weighted avg       0.80      0.80      0.80       154



In [29]:
# Using spacy

# import spacy libraries
import spacy
import string
nlp = spacy.load('en_core_web_sm')

In [30]:
# text cleaning
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
punct = string.punctuation

def text_cleaning(sentence):
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        
        tokens.append(temp)
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [32]:
tfidf = TfidfVectorizer(tokenizer = text_cleaning)

In [33]:
from sklearn.pipeline import Pipeline

clf = Pipeline([('tfidf', tfidf), ('clf', LinearSVC())])

X = df['review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.84      0.79        74
           1       0.83      0.74      0.78        80

    accuracy                           0.79       154
   macro avg       0.79      0.79      0.79       154
weighted avg       0.79      0.79      0.79       154



In [47]:
clf.predict(['movie was very long but made sense'])

array([1])