# **START**

## **#1** (Loading dataset)

In [1]:
import pandas as pd

# Load the IMDB reviews CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(
    '../input/imdb-dataset/IMDB Dataset.csv'
)
df.head()

## **#2** (Transforming documents into feature vectors)

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy sentences used to illustrate the bag-of-words representation.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])
# Learn the vocabulary and build the document-term count matrix in one step.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [3]:
# Print the vocabulary learned by the fitted CountVectorizer.
print(count.vocabulary_)

In [4]:
# Print the document-term count matrix converted to a dense array.
print(bag.toarray())

## **#3** (Word relevancy using term frequency-inverse document frequency)

In [5]:
# Limit NumPy's printed floating-point precision to two decimals.
np.set_printoptions(precision=2)

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with smoothed, L2-normalised tf-idf and print
# the dense result.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
term_counts = count.fit_transform(docs)
print(tfidf.fit_transform(term_counts).toarray())

## **#4** (Calculate tf-idf)

In [7]:
# Manual tf-idf for the term "is", using the smoothed idf
# log((1 + n_docs) / (1 + document frequency)).
n_docs = 3
tf_is = 3
doc_freq_is = 3
idf_is = np.log((1 + n_docs) / (1 + doc_freq_is))
tfidf_is = (1 + idf_is) * tf_is
print('tf-idf of term "is" = %.2f' % tfidf_is)

In [8]:
# Repeat the transform with normalisation switched off and keep the row for
# the last document so its raw tf-idf values can be inspected.
tfidf = TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)
unnormalised = tfidf.fit_transform(count.fit_transform(docs)).toarray()
raw_tfidf = unnormalised[-1]
raw_tfidf

In [9]:
# L2-normalise the raw vector: divide by the square root of its summed squares.
squared_sum = np.sum(raw_tfidf ** 2)
l2_tfidf = raw_tfidf / np.sqrt(squared_sum)
l2_tfidf

## **#5** (Preparation of data)

In [10]:
# Show the last 50 characters of the review stored at index label 0.
df.loc[0, 'review'][-50:]

In [11]:
import re
def preprocessor(text):
    """Strip markup and punctuation from ``text`` while keeping emoticons.

    HTML-like tags are removed, the text is lower-cased, and every run of
    non-word characters collapses into a single space. Emoticons found in the
    tag-free text are appended at the end with their ``-`` nose removed.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text followed by the space-separated emoticons.
    """
    # Raw strings stop the regex escapes from being parsed as (invalid) string
    # escapes, which newer Python versions warn about.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    smileys = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued onto the last word
    # whenever the cleaned text ends in a word character.
    if smileys and cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + smileys

In [12]:
# Run the cleaner on the last 50 characters of the review at index label 0.
preprocessor(df.loc[0, 'review'][-50:])

In [13]:
# Exercise the cleaner on a hand-written string containing a tag and emoticons.
preprocessor("</a>This :) is :( a test :-)!")

In [14]:
# Clean every review; the 'review' column is overwritten with the cleaned text.
df['review'] = df['review'].apply(preprocessor)

## **#6** (Tokenization of documents)

In [15]:
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance shared by the stemming helpers in this notebook.
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [16]:
# Plain whitespace tokenisation of a sample sentence.
tokenizer('runners like running and thus they run')

In [17]:
# The same sample sentence with each token passed through the Porter stemmer.
tokenizer_porter('runners like running and thus they run')

In [18]:
import nltk

# Download NLTK's 'stopwords' resource so it can be loaded from nltk.corpus.
nltk.download('stopwords')

In [19]:
from nltk.corpus import stopwords

# English stop-word list from the NLTK corpus.
stop = stopwords.words('english')
# Stem a sample sentence, take at most its last ten stems, and keep only the
# ones that are not stop words.
sample_stems = tokenizer_porter('a runner likes running and runs a lot')[-10:]
[w for w in sample_stems if w not in stop]

## **#7** (Document classification via a logistic regression model)

In [20]:
def getStemmedReview(review):
    """Lower-case a review, drop stop words and return the stemmed text.

    Double ``<br />`` tags are replaced with a space, the text is split with
    ``tokenizer``, tokens found in ``stop`` are removed, and the remaining
    tokens are stemmed with ``porter`` and re-joined with single spaces.
    """
    text = review.lower().replace("<br /><br />", " ")
    kept = (token for token in tokenizer(text) if token not in stop)
    return ' '.join(porter.stem(token) for token in kept)

In [21]:
# Stem the reviews into a new Series. The original call discarded the result
# of .apply(), so the stemming never reached the training data.
# NOTE(review): text sent to the saved vectorizer at inference time needs the
# same preprocessing (preprocessor + getStemmedReview) -- confirm in the app.
reviews_stemmed = df['review'].apply(getStemmedReview)

# Split by position. Label-based df.loc[:25000] is end-inclusive, which put
# row 25000 in both the training and the test set.
TRAIN_SIZE = 25000
X_train = reviews_stemmed.iloc[:TRAIN_SIZE].values
y_train = df['sentiment'].iloc[:TRAIN_SIZE].values
X_test = reviews_stemmed.iloc[TRAIN_SIZE:].values
y_test = df['sentiment'].iloc[TRAIN_SIZE:].values

In [22]:
# Print one training review (sliced, so it prints as a one-element array)
# followed by the label stored at the same position.
print("Review: ", X_train[1:2], "\n")
print("Sentiment: ", y_train[1])

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the tf-idf vocabulary on the training reviews only, then encode both
# splits with that fitted vectorizer.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    decode_error='ignore',
)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [24]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier (liblinear solver) on the vectorised
# training reviews and their sentiment labels.
model=LogisticRegression(solver='liblinear')
model.fit(X_train,y_train)

In [25]:
# Predict the label and the class probabilities for the first test review.
# Only a cell's last expression is displayed, so the original silently dropped
# the predict() output; a tuple shows both results together.
first_review = X_test[0]
model.predict(first_review), model.predict_proba(first_review)

## **#8** (Model Accuracy)

In [26]:
# Score the fitted model on the held-out test split and print the value.
result = model.score(X_test, y_test)
print(result)

## **#9** (Generate Pickle/Joblib Files from the Model for Web Application Deployment)

In [27]:
import pickle

In [28]:
# save the model to disk
# Save the model to disk. The context manager closes the file handle even if
# pickling fails; the original left the handle returned by open() unclosed.
filename = 'saved_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [29]:
# Reload the pickled model, closing the file handle afterwards.
# pickle.load can execute arbitrary code: only load files written by this
# notebook or another trusted source.
filename = 'saved_model.sav'
with open(filename, 'rb') as model_file:
    gs_lr_tfidf = pickle.load(model_file)

In [31]:
from pathlib import Path

import joblib

# Build the output directory portably. The original backslash-separated string
# literals only formed a directory path on Windows and relied on unrecognised
# escapes such as "\p" and "\l" being left untouched by the parser.
pkl_dir = Path('Sentiment-Analyzer') / 'pkl_objects'
# Create the target directory so the dumps do not fail on a fresh checkout.
pkl_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(model, pkl_dir / 'logreg.joblib')
joblib.dump(vectorizer, pkl_dir / 'vectorizer.joblib')

# **END**