# TfIdf model (Movie reviews)

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Read data

In [2]:
df = pd.read_csv('../moviereviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
#np.bincount(df.label)
df['label'].value_counts()

pos    1000
neg    1000
Name: label, dtype: int64

In [4]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df.shape

(1965, 2)

## Split data into train and test

In [7]:
def split_data(data,y,length,split_mark=0.8):
    if split_mark > 0. and split_mark<1.0:
        n = int(split_mark*length)
    else:
        n = int(split_mark)
    xtrain = data[:n].copy()
    xtest = data[n:].copy()
    ytrain = y[:n].copy()
    ytest = y[n:].copy()
    return xtrain, xtest, ytrain, ytest

In [8]:
xtrain, xtest, ytrain, ytest = split_data(df.review, df.label, len(df))
print(xtrain.shape, xtest.shape)

(1572,) (393,)


## Separate Steps

### Feature Extraction

In [9]:
vectorizer = TfidfVectorizer()

In [10]:
xtrain_tfidf = vectorizer.fit_transform(xtrain)
xtest_tfidf = vectorizer.transform(xtest)

In [11]:
xtrain_tfidf.shape

(1572, 35629)

In [12]:
feature_names = vectorizer.get_feature_names()

In [13]:
feature_names[19500:19520]

['mastering',
 'masterless',
 'mastermind',
 'masterminded',
 'masterminds',
 'masterpeice',
 'masterpiece',
 'masterpieces',
 'masters',
 'masterson',
 'masterwork',
 'mastery',
 'mastrantonio',
 'masturbates',
 'masturbation',
 'masturbatory',
 'masur',
 'mat',
 'matador',
 'matarazzo']

In [14]:
xtrain.shape

(1572,)

In [15]:
xtrain_tfidf.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.03284895, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

### Model Training & Prediction

In [16]:
parameters = {'C':[0.001,0.01,0.1,1,10], 'max_iter':[50, 75, 100, 150]}
gs_clf = GridSearchCV(estimator=LogisticRegression(),param_grid=parameters, n_jobs=-1, cv=5)

In [17]:
scores = cross_val_score(estimator=gs_clf, X=xtrain_tfidf, y=ytrain ,cv=5)
print("MEan score:{:.2f}".format(np.mean(scores)))

MEan score:0.84


In [18]:
log_model = gs_clf.fit(xtrain_tfidf, ytrain)

In [19]:
preds = log_model.predict(xtest_tfidf)

In [20]:
accuracy_score(ytest, preds)

0.8320610687022901

### Single data prediction

In [21]:
raw = df['review'][2]
data = vectorizer.transform([raw])

In [22]:
# 35629 unique words as features
data.shape

(1, 35629)

In [23]:
log_model.predict(data)

array(['pos'], dtype=object)

In [24]:
df['label'][2]

'pos'

Now, let's summarise all these steps as in a pipeline object.

## Pipeline methodology

In [25]:
t = Pipeline([
    ('vect',TfidfVectorizer()),
    ('clf',LogisticRegression())])

In [26]:
t.fit(xtrain, ytrain)

Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', LogisticRegression())])

In [27]:
#np.mean(preds==ytest)
t.score(xtest,ytest)

0.8040712468193384

### Hyper-parameter Tuning

In [28]:
pipe = Pipeline([
    ('vect',TfidfVectorizer()),
    ('gs_clf',gs_clf)])

In [29]:
gs_clf

GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'max_iter': [50, 75, 100, 150]})

In [30]:
tuned_model = pipe.fit(xtrain,ytrain)

In [31]:
tuned_model.score(xtest,ytest)

0.8320610687022901

## Model Evaluation

In [32]:
preds = tuned_model.predict(xtest)

In [33]:
accuracy_score(ytest, preds)

0.8320610687022901

In [34]:
print(classification_report(ytest, preds))

              precision    recall  f1-score   support

         neg       0.82      0.87      0.84       201
         pos       0.85      0.80      0.82       192

    accuracy                           0.83       393
   macro avg       0.83      0.83      0.83       393
weighted avg       0.83      0.83      0.83       393



### Raw data prediction
We havent performed any text-preprocessing hence, can directly predict over the pipeline model

In [35]:
tuned_model.predict([" Not best"])

array(['pos'], dtype=object)

# END