### Sentiment Analysis using Logistic Regression

In [0]:
import pandas as pd
import numpy as np
import pickle 

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load data
PATH = '/content/drive/My Drive/Big Data Analytics - Project/Data/'
data = pd.read_csv(PATH + 'AmazonCustomerReviewsK3.csv', delimiter=',', index_col=None, error_bad_lines=False)
data.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'body_polarity',
       'label'],
      dtype='object')

In [0]:
data['label'].value_counts()

2    632337
1    232244
0    135419
Name: label, dtype: int64

In [0]:
# balance dataset - 250000 samples for each class
from sklearn.utils import resample

N_samples = 250000
# Separate majority and minority classes
data_2 = data[data.label==2]  # majority

data_1 = data[data.label==1]  # minority
data_0 = data[data.label==0]  # minority
 
# Downsample majority class
data_2 = resample(data_2, 
                  replace=False, # sample without replacement
                  n_samples=N_samples,
                  random_state=8)

# Upsample minority classes
data_1 = resample(data_1, 
                  replace=True, # sample with replacement
                  n_samples=N_samples,
                  random_state=8)
data_0 = resample(data_0, 
                  replace=True, # sample with replacement
                  n_samples=N_samples,
                  random_state=8)
 
# Combine classes
data = pd.concat([data_2, data_1, data_0])

In [0]:
data['label'].value_counts()

2    250000
1    250000
0    250000
Name: label, dtype: int64

In [0]:
# split into train and test data
from sklearn.utils import shuffle
N_split = 600000
data_X = pd.DataFrame(data, columns = ['review_body']).fillna('')
data_y = pd.DataFrame(data, columns = ['label']).fillna('')
data_X = data_X.values.flatten()
data_y = data_y.values.flatten()
data_X, data_y = shuffle(data_X, data_y)
X_train, y_train = data_X[:N_split], data_y[:N_split]
X_test, y_test = data_X[N_split:], data_y[N_split:]

In [0]:
# Convert reviews to BoW using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(f'Vocabulary size: {len(vectorizer.vocabulary_)}')
print(f'X_train:\n{repr(X_train)}')
print(f'X_test: \n{repr(X_test)}')

Vocabulary size: 279028
X_train:
<600000x279028 sparse matrix of type '<class 'numpy.int64'>'
	with 13977977 stored elements in Compressed Sparse Row format>
X_test: 
<150000x279028 sparse matrix of type '<class 'numpy.int64'>'
	with 3459236 stored elements in Compressed Sparse Row format>


In [0]:
feature_names = vectorizer.get_feature_names()
print(f'Number of features: {len(feature_names)}')

Number of features: 279028


####Logistic Regression

In [0]:
# perform grid search and cross validate data to prevent over-fitting
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {'C': [0.01, 0.1, 1]}
grid = GridSearchCV(LogisticRegression(max_iter=500, class_weight='balanced'), param_grid, cv=5)
grid.fit(X_train, y_train)

print(f'Best cross-validation score: {round(grid.best_score_)}')
print(f'Best parameters: {grid.best_params_}' )
print(f'Best estimator: {grid.best_estimator_}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Best cross-validation score: 1.0
Best parameters: {'C': 1}
Best estimator: LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
LogReg_clf = grid.best_estimator_
LogReg_clf.fit(X_train, y_train)
LogReg_clf.predict(X_test)

print(f'Score: {round(LogReg_clf.score(X_test, y_test))}')

Score: 1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
# Pickle classifier
def create_pickle(c, file_name): 
    save_classifier = open(file_name, 'wb')
    pickle.dump(c, save_classifier)
    save_classifier.close()

# save model and vectorizer
create_pickle(LogReg_clf, PATH + 'models/LogReg_SA_K3.pickle')
create_pickle(vectorizer, PATH + 'models/LogReg_SA_K3_vectorizer.pickle')

In [0]:
# Load model
def load_model(file_path):
  clf_f = open(file_path, 'rb')
  clf = pickle.load(clf_f)
  clf_f.close()
  return clf

# Logistic Regression Classifier
LogReg_clf = load_model(PATH + 'models/LogReg_SA_K3.pickle')

# Vectorizer
with open(PATH + 'models/LogReg_SA_K3_vectorizer.pickle', 'rb') as f:
    vectorizer = pickle.load(f)

In [0]:
# Sentiment analysis

neg = ['This item is very bad', 'This item sucks']
pos = ['I loved the product', 'did not work, hate it']
neutral = ['product was okay, expected more']

print(f'Negative comments: {LogReg_clf.predict(vectorizer.transform(neg))}')
print(f'Positive comments: {LogReg_clf.predict(vectorizer.transform(pos))}')
print(f'Neutral comments: {LogReg_clf.predict(vectorizer.transform(neutral))}')

Negative comments: [0 0]
Positive comments: [2 0]
Neutral comments: [1]
