In [20]:
import spacy
from spacy import displacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC

In [22]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [23]:
import string

In [24]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(len(stopwords)) #List of all stopwords in spacy

326


In [25]:
data_imdb = pd.read_csv("imdb_review.txt",sep='\t', header= None)
columnName = ['Review','Sentiment']
data_imdb.columns = columnName
data_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [26]:
print(data_imdb.shape)

(748, 2)


In [27]:
data_imdb.Sentiment.value_counts()

1    386
0    362
Name: Sentiment, dtype: int64

In [28]:
data_imdb.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [29]:
punct = string.punctuation
print(punct) #returns set of all punctuations in English

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [30]:
#Removing stopwords and puncutations from the tokens 
def dataCleaning(sentence):
  doc = nlp(sentence)
  tokens = []
  for token in doc:
    if token.lemma_ != '-PRON-':
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_
    tokens.append(temp)
  clean_tokens = []
  for token in tokens:
    if token not in punct and token not in stopwords:
      clean_tokens.append(token)
  return clean_tokens

**Now, on passing a setence through this function as argument, punctuation and stopword free text is returned which will be suitable for sentiment analysis**

In [31]:
dataCleaning("The movie was not up the mark, even though there were some brilliant performances, it didn't make any sense in the end.")

['movie', 'mark', 'brilliant', 'performance', 'sense', 'end']

In [33]:
# Spillting the data into train and test data
X = data_imdb['Review']
y = data_imdb['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print(X_train.shape,y_test.shape)

(598,) (150,)


In [34]:
#preparing the model
tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = LinearSVC()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

In [35]:
# Training the model
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function dataCleaning at 0x00000221DF9A1048>)),
                ('svm', LinearSVC())])

In [36]:
# Testing on the test dataset
y_pred = pipe.predict(X_test)

In [37]:
# Printing the classification report and the confusion matrix
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.79      0.81        71
           1       0.82      0.85      0.83        79

    accuracy                           0.82       150
   macro avg       0.82      0.82      0.82       150
weighted avg       0.82      0.82      0.82       150




[[56 15]
 [12 67]]


**Now having trained the model, we test the model on random samples of text**

In [38]:
pipe.predict(["The movie was quite engrossing"])

array([1], dtype=int64)

**"1" represents given input has positive sentiment**

In [40]:
pipe.predict(["I regret wasting my money on this movie"])

array([0], dtype=int64)

**"0" represents negative sentiment**