# SENTIMENT ANALYSIS ON IMDB DATASET (TASK 5)

## Loading Dataset

In [1]:
import os

import pandas as pd

# Location of the IMDB reviews CSV. Set the IMDB_DATASET_PATH environment
# variable to run this notebook on another machine; when it is unset the
# original hard-coded location is used, so existing behaviour is unchanged.
DATASET_PATH = os.environ.get(
    "IMDB_DATASET_PATH",
    "C:\\Users\\Sanskriti\\Documents\\IMDB_dataset.csv",
)
df = pd.read_csv(DATASET_PATH)

In [2]:
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer instance; stemmer_tokenize calls porter.stem on each token.
porter = PorterStemmer()

## Stemming

In [3]:
def stemmer_tokenize(text):
    """Split ``text`` on whitespace and stem every resulting token.

    Parameters
    ----------
    text : str
        Document to tokenize.

    Returns
    -------
    list
        The result of ``porter.stem`` for each whitespace-separated token,
        in the order the tokens appear in ``text``.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [4]:
# Quick sanity check of the tokenizer on a short sentence
sample_sentence = 'coders like coding and thus they code'
stemmer_tokenize(sample_sentence)

['coder', 'like', 'code', 'and', 'thu', 'they', 'code']

In [5]:
import nltk

## TF-IDF Vectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# TF-IDF vectorizer that delegates tokenization to stemmer_tokenize.
tfidf = TfidfVectorizer(
    strip_accents=None,          # leave accented characters untouched
    lowercase=False,             # no lowercasing step inside the vectorizer
    tokenizer=stemmer_tokenize,  # whitespace split + Porter stemming
    use_idf=True,                # weight term counts by inverse document frequency
    norm='l2',                   # L2-normalise each document vector
    smooth_idf=True,             # smoothed IDF
)

In [8]:
# Target vector: the sentiment label of every review
Y = df["sentiment"].values
# Feature matrix: the vectorizer is fitted on, and applied to, every review
# NOTE(review): fitting happens on the whole corpus before the train/test
# split further down, so the IDF statistics also see the test documents.
# Consider fitting on the training reviews only and calling transform on the rest.
X = tfidf.fit_transform(df["review"])

## Splitting Dataset

In [9]:
from sklearn.model_selection import train_test_split
# Hold out half of the vectorized reviews for testing; the fixed seed keeps
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=1,
)

## Document classification using Logistic Regression

In [11]:
import pickle
from sklearn.linear_model import LogisticRegressionCV #Cross-Validation

# Train a cross-validated logistic regression on the training split
# (5 folds, scored on accuracy). verbose=2 prints progress lines while fitting.
clf = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=1,
    verbose=2,
    max_iter=300,
).fit(x_train, y_train)

# Persist the fitted model with pickle. The context manager closes the file
# even if pickle.dump raises, so the handle is never leaked.
with open('saved_model.sav', 'wb') as saved_model:
    pickle.dump(clf, saved_model)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.0min finished


## Model Evaluation

In [12]:
filename = 'saved_model.sav'
# Reload the pickled classifier; the context manager closes the file handle
# as soon as loading finishes (or fails).
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# were written by this notebook or come from a trusted source.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)
# Score the reloaded model on the held-out test split
saved_clf.score(x_test, y_test)

0.89072

In [14]:
# Predicted labels for the test split, using the in-memory classifier `clf`
# (not the copy reloaded from disk)
pred = clf.predict(x_test)

In [15]:
from sklearn.metrics import accuracy_score
# Fraction of test reviews whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

In [16]:
# Display the accuracy computed by accuracy_score in the previous cell
accuracy

0.89072