In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the IMDB movie-review CSV into a DataFrame.
# Later cells read its 'review' (text) and 'sentiment' (label) columns.
# The path is relative to the notebook's working directory.
movie_data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [None]:
# Preview the first rows of the freshly loaded dataset
movie_data.head()

In [None]:
# Summary statistics for the columns of the raw dataset
movie_data.describe()

In [None]:
# Import NLTK and download the 'vader_lexicon' resource, which the VADER
# sentiment analyzer created in a later cell relies on.
import nltk
nltk.download('vader_lexicon')

In [None]:
# Optional install of NLTK with its "twitter" extra.
# Left disabled on purpose; uncomment the line below to run it.
#!pip3 install -U nltk[twitter]

**VADER Sentiment Analysis.** VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon- and rule-based sentiment
analysis tool that is specifically attuned to sentiments expressed in social media, and it also works well on texts from other domains.

In [None]:
# Create one VADER analyzer instance; later cells reuse it to score every review.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [None]:
# Score every review with VADER. Each entry of the new 'scores' column is the
# dict returned by polarity_scores for that review; its 'compound' entry is
# pulled out into its own column in a later cell.
movie_data['scores'] = movie_data['review'].apply(sia.polarity_scores)

In [None]:
# Inspect the frame again now that the 'scores' column has been added
movie_data.head()

In [None]:
# Keep only the 'compound' entry of each score dict in a dedicated column;
# it is the single value the 0/1 prediction is derived from.
movie_data['compound'] = [score['compound'] for score in movie_data['scores']]

In [None]:
# Turn the compound score into a binary prediction:
# 1 when the score is zero or above, 0 when it is negative.
movie_data['comp_score'] = (movie_data['compound'] >= 0).astype('int64')

In [None]:
# Encode the target as 0 for negative and 1 for positive.
#
# A plain dict-based .map() turns every value that is not a dict key into NaN,
# so running this cell a second time would wipe the already-encoded column.
# Falling back to the current value leaves existing 0/1 labels untouched, which
# makes the cell safe to re-run. (Labels other than 'positive'/'negative' are
# likewise left as they are instead of becoming NaN.)
sentiment_codes = {'positive': 1, 'negative': 0}
movie_data['sentiment'] = movie_data['sentiment'].map(
    lambda label: sentiment_codes.get(label, label)
)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score

In [None]:
# Compare the rule-based VADER predictions with the true labels.
# Nothing is trained in this approach, so the metrics cover the whole dataset.
f1 = f1_score(
    y_true=movie_data['sentiment'],
    y_pred=movie_data['comp_score'],
    pos_label=1,
)
accuracy = accuracy_score(
    y_true=movie_data['sentiment'],
    y_pred=movie_data['comp_score'],
)

print(f"Validation F1 Score  : {f1} and Accuracy Score {accuracy}")

With VADER we get an accuracy of approximately 0.69 and an F1 score of approximately 0.74.

In [None]:
# Second approach: use spaCy to obtain a vector for each review.
# Loads the 'en_core_web_lg' pipeline; it must already be installed in the environment.
import spacy
nlp = spacy.load('en_core_web_lg')

In [None]:
# Collect the review texts into a plain Python list, in DataFrame order,
# ready to be fed to the spaCy pipeline.
review_list = movie_data['review'].tolist()

In [None]:
# Run every review through the spaCy pipeline and keep the resulting Doc objects.
# NOTE(review): this processes the whole dataset in one go and is presumably the
# slowest cell in the notebook.
review_processed = [doc for doc in nlp.pipe(review_list)]

In [None]:
# Features: one document vector per review, in the same order as the reviews.
X = [doc.vector for doc in review_processed]

# Targets: the sentiment labels as a plain Python list.
y = movie_data['sentiment'].tolist()

In [None]:
# Split the vectors and labels into train and test sets.
# test_size=0.3 holds out 30% of the samples for evaluation; the fixed
# random_state makes the shuffle, and therefore the split, repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

In [None]:
# Using DecisionTreeClassifier to predict the data
from sklearn.tree import DecisionTreeClassifier

# random_state pins the randomness used while growing the tree, so a fresh
# Restart & Run All reproduces the same model and scores.
# 77 matches the seed already used for the train/test split.
clf = DecisionTreeClassifier(random_state=77)

In [None]:
# Train the tree on the training vectors and predict labels for the test set.
# fit() returns the estimator itself, so the two steps can be chained; clf still
# refers to the same (now fitted) classifier afterwards.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [None]:
# Evaluate the decision-tree predictions on the held-out test set.
f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label=1)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Validation F1 Score  : {f1} and Accuracy Score {accuracy}")

With the decision tree on spaCy document vectors we get an accuracy of approximately 0.67 and an F1 score of approximately 0.67.

In [None]:
# Using GradientBoostingClassifier to predict the data
from sklearn.ensemble import GradientBoostingClassifier

# random_state seeds the estimator's internal randomness so repeated runs give
# the same model and scores. 77 matches the seed used for the train/test split.
gbc = GradientBoostingClassifier(random_state=77)

In [None]:
# Train the gradient-boosting model on the same training vectors as the decision tree
gbc.fit(X_train, y_train)

In [None]:
# Predict test-set labels; this overwrites the decision tree's y_pred
y_pred = gbc.predict(X_test)

In [None]:
# Evaluate the gradient-boosting predictions on the held-out test set.
f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label=1)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Validation F1 Score  : {f1} and Accuracy Score {accuracy}")

With gradient boosting on the same spaCy vectors we get an accuracy of approximately 0.82 and an F1 score of approximately 0.82.

In [None]:
# Using TF-IDF and LinearSVC to predict the data

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# TF-IDF features feeding a linear SVM, bundled into one estimator so the
# vectorizer is fitted together with the classifier on whatever data fit() receives.
# random_state pins LinearSVC's internal data shuffling so repeated runs give the
# same model; 77 matches the seed already used for the train/test splits.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=77)),
])

In [None]:
# Re-check the frame: 'sentiment' has been mapped to 0/1 and the VADER columns were added earlier
movie_data.head()

In [None]:
# The raw review text is the input and the sentiment label is the target;
# the pipeline's TfidfVectorizer turns the text into features.
X1, y1 = movie_data['review'], movie_data['sentiment']

# Same split settings (30% test, seed 77) as the spaCy-vector experiment.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.3, random_state=77
)

In [None]:
# Fit the whole pipeline (vectorizer + classifier) on the training reviews only
text_clf.fit(X1_train, y1_train)

In [None]:
# Predict sentiment labels for the held-out review texts
predictions = text_clf.predict(X1_test)

In [None]:
# Evaluate the TF-IDF + LinearSVC pipeline on the held-out reviews.
f1 = f1_score(y_true=y1_test, y_pred=predictions, pos_label=1)
accuracy = accuracy_score(y_true=y1_test, y_pred=predictions)

print(f"Validation F1 Score  : {f1} and Accuracy Score {accuracy}")

With TF-IDF features and LinearSVC we get an accuracy of approximately 0.90 and an F1 score of approximately 0.90.


Do let me know how the models could be improved further for better predictions.