# **1. Importing data**


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the IMDB 50k review dataset into a DataFrame and preview the first rows.
df = pd.read_csv(
    '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
)
df.head()

In [None]:
# Encode the label as an integer column: 1 where the text label is
# 'positive', 0 for every other value.
df['Sentiment'] = np.where(df['sentiment'] == 'positive', 1, 0)
# The textual label is redundant once the numeric one exists.
df.drop(columns='sentiment', inplace=True)
# NOTE: plain assignment binds a second name to the SAME DataFrame; no data
# is copied, so any later in-place change to `df` is visible via `df_copy`.
df_copy = df

In [None]:
# Preview the frame after the label column was replaced by numeric 'Sentiment'.
df.head()

In [None]:
# Display the raw text of the review stored under index label 3.
df.loc[3, 'review']

# Data preparation

* Removed punctuation and HTML tags.
* Moved emoticons to the end of the document.
* Converted all text to lower case.

In [None]:
import re
def preprocess(text):
    """Clean a raw review string for bag-of-words style tokenization.

    Steps, in order:
      1. Strip HTML-like tags (anything between '<' and '>').
      2. Collect emoticons such as ':)', ';-(' or '=D' from the tag-free text.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with any '-' nose removed) at the end.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by the space-separated emoticons.
    """
    # Raw string literals keep the regex backslashes intact without relying on
    # Python passing unknown escape sequences through (which emits a warning).
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons are searched before lower-casing so ':D' / ':P' are still found.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # When the cleaned text ends in a word character there is no trailing
    # space, and the first emoticon would be glued onto the last word
    # (e.g. 'movie:)'); insert the separator in that case only.
    if emoticon_suffix and cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [None]:
# Demonstrate preprocess on a short sentence containing punctuation and emoticons.
print('this is an example')
preprocess('Hello world !!! :) :( . Wishing you a very good morning!!')

In [None]:
# Overwrite the 'review' column with its cleaned version (one preprocess call per row).
df['review']=df['review'].apply(preprocess)

# Tokenization of documents

In [None]:
from nltk.stem.porter import PorterStemmer

# Single shared stemmer instance, used by the tokenizer function defined in this notebook.
porter = PorterStemmer()

In [None]:
def tokenizer_stem(text):
    """Split `text` on whitespace and return the Porter stem of each token.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list
        One stemmed token per whitespace-separated word, in original order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

# Transforming text data into TF-IDF Vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Configure the TF-IDF vectorizer. Tokenization (and stemming) is delegated to
# tokenizer_stem; lowercase=False because the reviews were already lower-cased
# by preprocess.
tfidf=TfidfVectorizer(strip_accents=None,
                     lowercase=False,
                     preprocessor=None,
                     tokenizer=tokenizer_stem,
                     # A custom tokenizer makes the default token_pattern
                     # unused; None states that explicitly and avoids
                     # scikit-learn's "token_pattern will not be used" warning.
                     token_pattern=None,
                     use_idf=True,
                     norm='l2',
                     smooth_idf=True)

# Target vector: the integer 'Sentiment' column.
y=df.Sentiment.values
# NOTE(review): the vectorizer is fitted on every review before the data is
# split into train and test sets, so vocabulary and IDF weights are learned
# from test documents as well. Consider fitting on the training portion only.
X=tfidf.fit_transform(df.review)

# Document classification using Logistic regression

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

In [None]:
import pickle 
from sklearn.linear_model import LogisticRegressionCV

# Fit a logistic-regression classifier with 5-fold cross-validation on the
# training split, using all available cores (n_jobs=-1).
clf= LogisticRegressionCV(cv=5,
                         random_state=0,
                         n_jobs=-1,
                         verbose=3,
                         max_iter=300).fit(X_train,y_train)

# Persist the fitted model. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open('saved_model1.sav', 'wb') as saved_model:
    pickle.dump(clf,saved_model)


The model is saved as 'saved_model1.sav' using the pickle library.

# Model evaluation

In [None]:
filename='saved_model1.sav'
# SECURITY: unpickling can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
# The context manager closes the file handle, which a bare open() left dangling.
with open(filename,'rb') as model_file:
    saved_clf=pickle.load(model_file)

Loaded the saved model.

In [None]:
# Score the reloaded model on the held-out test split.
saved_clf.score(X_test,y_test)

**Accuracy of model on test set is about 90%.**

# Creating word clouds for good and bad reviews

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Materialize the wordcloud package's STOPWORDS as a set for use when building clouds.
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Draw a word cloud for `data` with matplotlib.

    Parameters
    ----------
    data : str or iterable
        Either a single text, or an iterable of texts (for example a pandas
        Series of reviews) whose items are joined with spaces. Any other
        object falls back to its ``str()`` representation.
    title : str, optional
        Figure title; when omitted no title is drawn.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    # str() of a long pandas Series is its *display repr*: only the first and
    # last few rows, each clipped, plus a "Name/Length/dtype" footer. Joining
    # the items feeds the full text to the cloud instead of that summary.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(map(str, data))
        except TypeError:
            # Not iterable: keep the previous behaviour.
            text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=100,
        max_font_size=40, 
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(text)

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()


In [None]:
# Partition the reviews by label: 1 = positive, 0 = negative.
sentiment_label = df_copy['Sentiment']
positiveReview = df_copy[sentiment_label == 1]
negativeReview = df_copy[sentiment_label == 0]

In [None]:
# Word cloud built from the reviews labelled 0 (negative).
print('Bad review')
show_wordcloud(negativeReview['review'])

In [None]:
# Word cloud built from the reviews labelled 1 (positive).
print('Good review')
show_wordcloud(positiveReview['review'])