# NLP - vectorizer methods

In [1]:
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
data = pd.read_csv("data/reviews.csv")
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


## Preprocessing

In [10]:
def remove_punctuation(text):
    import string 
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

def lower_text(text):
    return text.str.lower()

In [11]:
data['reviews'] = remove_punctuation(data['reviews'])
data['reviews'] = lower_text(data['reviews'])
y = data['target']

## Vectorizer and model

In [13]:
#Tfid
pipeline_1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

parameters_1 = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'nb__alpha': (0.1,1),}

grid_search_1 = GridSearchCV(pipeline_1, parameters_1, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search_1.fit(data.reviews,y)

print(grid_search_1.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'nb__alpha': 0.1, 'tfidf__ngram_range': (2, 2)}


In [14]:
#Bag of words
pipeline_2 = Pipeline([
    ('bag', CountVectorizer()),
    ('nb', MultinomialNB()),
])

parameters_2 = {
    'bag__ngram_range': ((1,1), (2,2)),
    'nb__alpha': (0.1,1),}

grid_search_2 = GridSearchCV(pipeline_2, parameters_2, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search_2.fit(data.reviews,y)

print(grid_search_2.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'bag__ngram_range': (2, 2), 'nb__alpha': 0.1}


In [15]:
print(grid_search_1.best_score_)
print(grid_search_2.best_score_)

0.836
0.8320000000000001
