# Vectorizer Tuning

In [1]:
import pandas as pd

# Load the movie-review dataset from a CSV file in the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# an index that was saved with the file; `index_col=0` would likely absorb it —
# confirm before changing.
data = pd.read_csv("reviews.csv")

# Preview the first rows.
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [2]:
import string

def punc_lower(data):
    """Lowercase a review and replace every punctuation character with a space.

    Parameters
    ----------
    data : str
        Raw review text.

    Returns
    -------
    str
        The lowercased text in which each character of ``string.punctuation``
        has been replaced by a single space. Runs of whitespace are not
        collapsed.
    """
    # str.lower() returns a new string (strings are immutable), so the result
    # has to be assigned; calling it without assignment lowercases nothing.
    data = data.lower()
    # A single translate pass replaces the per-character .replace() loop.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    return data.translate(punctuation_to_space)

# Store a cleaned copy of every review in a new column.
data["clean_reviews"] = data["reviews"].apply(punc_lower)
    

In [3]:
# Preview again to check the new `clean_reviews` column next to the raw text.
data.head()

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go to a church party ...
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastard s quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest for camelot is warner bros firs...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis a mentally unstable man undergoing ...


In [5]:
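# Scratch cell (left disabled): `pipeline` is only created in the Tuning cell
# below, so these calls cannot run at this point in the notebook.
# pipeline.fit(data.clean_reviews,data.target)
# pipeline.score(data.clean_reviews,data.target)
# NOTE: Pipeline has no `para` method; `get_params()` was probably intended.
# pipeline.para(data.clean_reviews,data.target)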

## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

# Create Pipeline: TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("nb", MultinomialNB()),
    ]
)

# Set parameters to search (model and vectorizer).
# Keys follow scikit-learn's `<step name>__<parameter name>` convention.
parameters = {
    "tfidf__ngram_range": ((1, 1), (2, 2)),  # unigrams only vs. bigrams only
    "nb__alpha": (0.1, 1),  # additive smoothing strength
}

# Perform grid search on pipeline: 2 x 2 candidates, 5-fold CV, accuracy scoring.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(data["clean_reviews"], data["target"])

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 1),
                         'tfidf__ngram_range': ((1, 1), (2, 2))},
             scoring='accuracy', verbose=1)

In [10]:
# Parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'nb__alpha': 0.1, 'tfidf__ngram_range': (2, 2)}

⚠️ Please push the exercise once you are done 🙃

## 🏁 

In [9]:
# Mean cross-validated accuracy obtained with the best parameter combination.
grid_search.best_score_

0.834