# Sentiment Analysis

A logistic regression classifier using scikit-learn to identify the sentiment of a movie review given a data set of 50000 IMDb reviews. 

Also includes cleaning and pre-processing the text data, performing feature extraction with the Natural Language Toolkit (NLTK), tuning model hyperparameters, and evaluating model accuracy.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the review CSV into a DataFrame; the relative path assumes the
# dataset is mounted under ../input (TODO confirm for other environments).
df=pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
# Preview the first rows to sanity-check what was loaded.
df.head()

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer created with scikit-learn's default settings.
count = CountVectorizer()
# The triple-quoted text below is a disabled toy example kept as a bare
# string literal so that it does not run: it would fit `count` on three short
# sentences and store the resulting document-term matrix in `bag`.
"""docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs) """

In [None]:
#print(count.vocabulary_)

In [None]:
#print(bag.toarray())

In [None]:
# Limit NumPy's printed floating-point output to a precision of 2 digits.
np.set_printoptions(precision=2)

In [None]:
# Display the raw text of the review stored under row label 0.
df.loc[0,'review']


In [None]:
import re
def preprocessor(text):
    """Normalise a raw review into lowercase words followed by its emoticons.

    Markup tags (anything between ``<`` and ``>``) are removed, emoticons
    such as ``:)``, ``;-(`` or ``=D`` are collected, the remaining text is
    lowercased with every run of non-word characters collapsed to a single
    space, and the collected emoticons (with any ``-`` nose dropped) are
    appended at the end, separated by spaces.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Emoticons, if any, are always separated from the
        preceding word by a space.
    """
    # Raw strings keep the regex escapes out of Python's own string-escape
    # handling, which otherwise warns about invalid escape sequences.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect emoticons first: the punctuation stripping below destroys them.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    faces = ' '.join(emoticons).replace('-', '')
    # When the text ends in a word character there is no trailing space, so
    # add one to stop the first emoticon from fusing with the last word.
    if faces and not words.endswith(' '):
        words += ' '
    return words + faces

In [None]:
# Run the cleaner on the review at row label 0 to inspect its output.
preprocessor(df.loc[0, 'review'])

In [None]:
# Overwrite the 'review' column with the cleaned version of every review.
df['review'] = df['review'].apply(preprocessor)

In [None]:
from nltk.stem.porter import PorterStemmer

# One shared Porter stemmer instance; tokenizer_porter below calls its stem().
porter = PorterStemmer()

def tokenizer(text):
    """Return the whitespace-separated pieces of *text* as a list."""
    words = text.split()
    return words


def tokenizer_porter(text):
    """Split *text* on whitespace and return each piece passed through porter.stem."""
    pieces = text.split()
    return list(map(porter.stem, pieces))

In [None]:
# Plain whitespace split of a sample sentence, for comparison with the stemmed version.
tokenizer('runners like running and thus they run')

In [None]:
# Same sample sentence, but with every word reduced to its Porter stem.
tokenizer_porter('runners like running and thus they run')

In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list (the stopwords corpus must be available locally).
stop = stopwords.words('english')
# Stem a sample sentence, then keep the stems among its last ten tokens that
# are not stop words.
sample_stems = tokenizer_porter('a runner likes running and runs a lot')
[stem for stem in sample_stems[-10:] if stem not in stop]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the reviews, tokenised with the Porter-stemming
# tokenizer. lowercase=False presumably because the reviews were already
# lowercased by preprocessor — confirm that cell has been run first.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    tokenizer=tokenizer_porter,
    # With a custom tokenizer the default token_pattern is never used; passing
    # None states that explicitly and avoids scikit-learn's
    # unused-parameter warning.
    token_pattern=None,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)

# NOTE(review): the vectorizer is fitted on every review before the data is
# split into train and test, so the held-out reviews contribute to the
# vocabulary and IDF weights. Fitting on the training rows only would give a
# cleaner accuracy estimate.
x = tfidf.fit_transform(df.review)
y = df.sentiment.values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out half of the rows for testing. shuffle=False makes the split
# positional (no reordering of rows); random_state only controls shuffling,
# so it has no effect here.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    shuffle=False,
    random_state=1,
)

In [None]:
import pickle
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression: 5 folds, scored by accuracy, with
# n_jobs=-1 for parallel fitting and verbose=3 for progress output.
clf = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
).fit(x_train, y_train)

# Persist the fitted model. The context manager flushes and closes the file
# even if pickling fails. `saved_model` stays bound to the (closed) file
# object afterwards, so a later saved_model.close() is a harmless no-op.
with open('saved_model.sav', 'wb') as saved_model:
    pickle.dump(clf, saved_model)


In [None]:
# Close the handle used to write the pickled model so the file is flushed to disk.
saved_model.close()

In [None]:
    # Reload the classifier that was pickled to disk earlier in the notebook.
    file='saved_model.sav'
    # NOTE: unpickling can execute arbitrary code, so only load a file this
    # notebook produced itself. The context manager closes the handle as soon
    # as loading finishes instead of leaving it open.
    with open(file,'rb') as model_file:
        saved_clf=pickle.load(model_file)


In [None]:
# Predict labels for the held-out split with the reloaded model.
y_predict=saved_clf.predict(x_test)
# Report the reloaded model's score on the same held-out split.
saved_clf.score(x_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Axis labels come from the data, in sorted order, and are passed explicitly
# so the rows and columns of the matrix are guaranteed to match the tick
# labels. Hard-coding ['positive', 'negative'] mislabels the axes whenever the
# sorted order differs (for string labels, 'negative' sorts before 'positive').
class_labels = np.unique(np.concatenate((y_test, y_predict)))
cm = confusion_matrix(y_test, y_predict, labels=class_labels)
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

plt.figure(figsize=(6, 6))
# fmt='d' prints the cell counts as plain integers; the default format would
# show large counts in scientific notation.
sns.heatmap(cm_df, annot=True, fmt='d')

# Rows are the true labels and columns the predicted labels.
plt.title('Logistic Regression')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()