In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
data = pd.read_csv("Reviews.csv", nrows = 568400, usecols = ["Score", "Text"])
df = pd.DataFrame(data)

In [3]:
cat = []
for x in tqdm(df.Score):
    if x < 3:
        cat.append("bad")
    elif x > 3:
        cat.append("good")
    elif x == 3:
        cat.append("neutral")

100%|██████████| 568400/568400 [00:00<00:00, 1096144.71it/s]


In [4]:
df["Category"] = cat

In [5]:
# cleaning the texts
corpus = []

for i in tqdm(range(0, 568400)):
    review = re.sub('[^a-zA-Z]', ' ',df['Text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

100%|██████████| 568400/568400 [1:35:26<00:00, 99.25it/s] 


In [6]:
# Create Bag of Words model
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

In [7]:
y = df["Category"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [9]:
mul_lr = LogisticRegression(multi_class="multinomial", solver = "newton-cg")
mul_lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
print(mul_lr.score(X_test, y_test))

0.850237508797


In [11]:
y_test_log = mul_lr.predict(X_test)

In [None]:
for j in y_test_log:
    print(j)

In [13]:
import pickle

In [15]:
with open('model_pickle','wb') as f:
    pickle.dump(mul_lr,f)

In [16]:
with open('model_pickle', 'rb') as f:
    mp = pickle.load(f)

In [17]:
from sklearn.externals import joblib

In [18]:
joblib.dump(mul_lr,'model_joblib')

['model_joblib']

In [19]:
mj = joblib.load('model_joblib')