# Fast sentiment analysis using the hashing trick

The hashing trick lowers the memory requirement because the entire vocabulary no longer needs to be stored in memory.

Logistic Regression will also have fewer weights (one per hashed feature instead of one per vocabulary term), which further reduces memory.

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
# Register tqdm's pandas integration so that `progress_apply` (used later when
# cleaning the reviews) is available on pandas objects.
tqdm.pandas()

In [None]:
from nltk.stem import WordNetLemmatizer

# Single module-level lemmatizer instance; lemmatize_sentence reuses it for every token.
lemmatizer = WordNetLemmatizer()


def lemmatize_sentence(sentence):
    """Lemmatize every whitespace-separated token of *sentence*.

    The sentence is split on whitespace, each token is passed through the
    module-level ``lemmatizer``, and the results are joined back together
    with single spaces.
    """
    tokens = sentence.split()
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)

In [None]:
def simple_clean(text):
    """Normalize raw review text before vectorization.

    Lower-cases *text*, replaces each run of characters other than ``a``-``z``
    and the space character with a single space, and then lemmatizes the
    result with ``lemmatize_sentence``.
    """
    letters_only = re.sub(r"[^a-z ]+", " ", text.lower())
    return lemmatize_sentence(letters_only)

In [None]:
def encode_sentiment(text):
    """Map a sentiment label to its integer class.

    Args:
        text: Sentiment label, expected to be ``"positive"`` or ``"negative"``.

    Returns:
        ``1`` for ``"positive"`` and ``0`` for ``"negative"``.

    Raises:
        ValueError: If *text* is any other value.
    """
    if text == "positive":
        return 1
    if text == "negative":
        return 0
    # Fail loudly: printing and implicitly returning None would silently place
    # a missing label into the training target.
    raise ValueError(f"Unknown sentiment label: {text!r}")
def decode_sentiment(number):
    """Map an integer class back to its sentiment label.

    Returns ``"positive"`` when *number* equals 1, ``"negative"`` when it
    equals 0, and ``None`` for anything else.
    """
    # Compared in the same order as the class codes are listed: 1 first, then 0.
    for code, label in ((1, "positive"), (0, "negative")):
        if number == code:
            return label
    return None

In [None]:
# Load the review dataset from CSV; later cells read its "review" and "sentiment" columns.
df = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [None]:
# Convert each string label in "sentiment" to an integer class via encode_sentiment.
df["sentiment_value"] = df["sentiment"].apply(encode_sentiment)
# Clean every review with simple_clean; progress_apply is the tqdm-enabled
# variant of apply, so a progress bar is shown while this runs.
df["review_clean"] = df["review"].progress_apply(simple_clean)

In [None]:
# Hold out 20% of the cleaned reviews (and their labels) as a test set;
# random_state=42 makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(df["review_clean"].values,df["sentiment_value"].values, test_size=0.2, random_state=42)

In [None]:
# pipeline1: unigram+bigram features hashed into a fixed 8192-column space,
# followed by a liblinear logistic regression.
pipeline1 = Pipeline(steps=[('featurehash',HashingVectorizer(ngram_range=(1, 2),n_features=8192)),('logisticregression',LogisticRegression(solver='liblinear'))])
# pipeline2: the same classifier on CountVectorizer unigram+bigram counts,
# used as the comparison baseline.
pipeline2 = Pipeline(steps=[('countvectorizer',CountVectorizer(ngram_range=(1, 2))),('logisticregression',LogisticRegression(solver='liblinear'))])

In [None]:
%%time
# Fit the hashing-vectorizer pipeline on the training split (timed by the cell magic).
pipeline1.fit(X_train,y_train)

In [None]:
%%time
# Fit the count-vectorizer pipeline on the training split (timed by the cell magic).
pipeline2.fit(X_train,y_train)

In [None]:
%%time
# Predict test-set labels with the hashing-vectorizer pipeline (timed by the cell magic).
y_pred_1 = pipeline1.predict(X_test)

In [None]:
%%time
# Predict test-set labels with the count-vectorizer pipeline (timed by the cell magic).
y_pred_2 = pipeline2.predict(X_test)

In [None]:
# Print each model's title followed by its classification report on the test split.
for _title, _predictions in (("Hashing Vectorizer", y_pred_1), ("Count Vectorizer", y_pred_2)):
    print(_title)
    print(classification_report(y_test, _predictions))

In [None]:
# The fitted vocabulary_ dict maps each term to its column index, so its length
# is the number of features. It is used instead of get_feature_names(), which
# was deprecated in scikit-learn 1.0 and removed in 1.2, and it avoids building
# the full list of feature names just to count them.
print("Total number of features for count vectorizer: ",len(pipeline2['countvectorizer'].vocabulary_))

In [None]:
import joblib
# Serialize both pipelines to .joblib files in the working directory; their
# on-disk sizes are reported in the following cell.
joblib.dump(pipeline1, 'pipeline1.joblib')
joblib.dump(pipeline2,'pipeline2.joblib')

In [None]:
import os

# Report the on-disk size of each serialized pipeline (byte count divided by 1,000,000).
hash_bytes = os.stat('pipeline1.joblib').st_size
print(f"Size of hash vectorizer {hash_bytes/1000000}mb")
count_bytes = os.stat('pipeline2.joblib').st_size
print(f"Size of count vectorizer {count_bytes/1000000}mb")