<a href="https://colab.research.google.com/github/poffertje/TextMining/blob/master/code/fake_classifier/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
##### This code is strongly based on the Amazon review classifier (https://t-lanigan.github.io/amazon-review-classifier/)

In [None]:
# Mount Google Drive (Colab only) so the paths under /content/gdrive used below are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import re
import os
import sys
import string
import joblib
import warnings
import numpy as np
import pandas as pd
import scipy as scipy
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from six.moves import cPickle as pickle
from time import time

# Show at most 15 rows when a DataFrame is displayed.
pd.options.display.max_rows = 15
# Print NumPy floats with 4 decimals and without scientific notation.
np.set_printoptions(precision = 4, suppress=True)

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas chained-assignment and scikit-learn convergence warnings.
warnings.filterwarnings('ignore')

In [None]:
from pathlib import Path

### Dataset import

In [None]:
# Path (on the mounted Google Drive) of the CSV file holding the classifier dataset.
FakeReviews = "/content/gdrive/Shareddrives/Minecraft/Datasets/final_classifier_dataset.csv"

In [None]:
# Import the dataset; the first CSV column is used as the DataFrame index.
dataset = pd.read_csv(FakeReviews,index_col=0)

In [None]:
# Show the dataset (the display is truncated according to pd.options.display.max_rows).
display(dataset)

Unnamed: 0,userID,productID,rating,label,date,review,review_length,average_product_rating,average_user_rating,extreme_count_ratio,exclaim_cnt,all_cap,sub_cnt
0,5045,0,1.0,0,2014-09-08,This was the worst experience I've ever had a ...,248,3.643678,1.000000,1.000000,0,4,9
1,5046,0,3.0,0,2013-10-06,This is located on the site of the old Spruce ...,50,3.643678,3.250000,0.000000,0,1,0
2,5047,0,5.0,0,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...,233,3.643678,5.000000,0.000000,2,9,11
3,5048,0,5.0,0,2014-08-28,I love Toast! The food choices are fantastic -...,152,3.643678,5.000000,0.000000,2,3,3
4,5049,0,5.0,0,2013-07-16,The egg on an English muffin (their take on eg...,73,3.643678,5.000000,0.000000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,6805,839,4.0,1,2014-01-14,"the menu is small, but the dishes are well pre...",71,3.507634,2.666667,0.500000,1,0,1
159996,22381,839,5.0,1,2014-01-13,You have to get the nachos. They are our of th...,52,3.507634,4.583333,0.000000,0,2,4
159997,10265,839,1.0,1,2014-01-09,I revoke my previous praise for happy hour at ...,81,3.507634,3.862069,0.017241,1,5,7
159998,69957,839,2.0,1,2014-01-09,"I had pretty high expectations for El Rey, but...",92,3.507634,2.500000,0.000000,0,5,7


### Dataset splitting

In [None]:
# Split the dataset into a training part (80%) and a test part (20%) with a fixed random seed.
# NOTE: the whole frame is passed as X, so x_train / x_test still contain the 'label' column;
# the target vectors y_train / y_test are taken from that same column.
x_train, x_test, y_train, y_test = train_test_split(dataset, dataset['label'],test_size=0.2, random_state=25)

In [None]:
# Check the label distribution of the training set (number of rows per 'label' value).
x_train.value_counts("label")

label
0    64047
1    63953
dtype: int64

### Creating correlation matrix

In [None]:
# Generate the correlation table from the numeric columns of the training data.
# The columns are selected explicitly because x_train also holds text columns
# ('date', 'review'): older pandas dropped those silently inside corr(), whereas
# pandas >= 2.0 raises when it meets a non-numeric column.
correlation_table = x_train.select_dtypes(include=["number", "bool"]).corr().round(decimals=2)

In [None]:
# Export the correlation table as CSV to the shared model folder on Google Drive.
correlation_table.to_csv("/content/gdrive/Shareddrives/Minecraft/Our_Models/LogisticRegression/correlation_table.csv")

In [None]:
#define our own tokenizing function that we will pass into the TFIDFVectorizer. We will also stem the words here.
def tokens(x):
    """Split `x` on whitespace and return the list of stemmed words.

    Uses the notebook-level `stemmer` object, which must be defined before
    this function is called for the first time.
    """
    # Build the result directly instead of using a comprehension for its side effects.
    return [stemmer.stem(word) for word in x.split()]

### Creating feature space for LR

In [None]:
#create a stemmer
stemmer = SnowballStemmer("english")

os.chdir('/content/gdrive/Shareddrives/Minecraft/Our_Models/LogisticRegression/')
#loads pickle if exists, extracts and pickles if it doesn't
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# written by this notebook into the shared model folder.
if os.path.exists('features.pickle') and os.path.exists('vectorizer.pickle'):
    print ('Pickled file already present, loading...')
    # Context managers make sure the file handles are closed again.
    with open("features.pickle", "rb") as features_file:
        features = pickle.load(features_file)
    with open("vectorizer.pickle", "rb") as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    print ('Pickle file loaded.')
else:
    #define the vectorizer
    vectorizer = TfidfVectorizer(tokenizer = tokens, stop_words = 'english', ngram_range=(1, 3), min_df = 0.01)
    # Lower-case the reviews and strip punctuation on a separate Series:
    #  - the punctuation helper of this notebook is defined in a later cell, so it
    #    cannot be called here on a fresh kernel;
    #  - x_train is left untouched, so its content no longer depends on whether
    #    the cache files happened to exist.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned_reviews = (
        x_train['review']
        .str.lower()
        .apply(lambda text: text.translate(punctuation_table))
    )
    #fit the vectorizers to the data.
    features = vectorizer.fit_transform(cleaned_reviews.values.astype(str))
    # Append the two numeric columns as extra feature columns behind the TF-IDF block.
    length = np.array(list(x_train.review_length)).reshape(features.shape[0], 1)
    xtreme_ratio = np.array(list(x_train.extreme_count_ratio)).reshape(features.shape[0],1)
    features = scipy.sparse.hstack((
        features,
        scipy.sparse.csr_matrix(length),
        scipy.sparse.csr_matrix(xtreme_ratio),
    ))
    features = scipy.sparse.csr_matrix(features)
    with open("features.pickle", "wb") as features_file:
        pickle.dump(features, features_file)
    with open("vectorizer.pickle", "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

features

Pickled file already present, loading...
Pickle file loaded.


<128000x856 sparse matrix of type '<class 'numpy.float64'>'
	with 4158003 stored elements in Compressed Sparse Row format>

### Defining helper methods



In [None]:
# helper function for removing punctuation
def remove_punctuation(text):
  """Return `text` with every character listed in string.punctuation deleted."""
  # Mapping a character to None in a translation table deletes it.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  stripped = text.translate(deletion_table)
  return stripped

In [None]:
# count number of exclamation marks
def count_exlaim(review):
    """Return how many '!' characters occur in `review`."""
    # Built-in counting replaces the manual index loop.
    return review.count('!')

# count number of capital words
def count_caps(review):
    """Return the number of whitespace-separated words for which str.isupper() is true."""
    return sum(1 for word in review.split() if word.isupper())

In [None]:
# additional columns to be added to input data
def adding_columns(data, extreme_ratings=(1.0,)):
  """Return a copy of `data` extended with the derived feature columns.

  Columns added:
    review_length           number of whitespace-separated words in 'review'
    average_product_rating  mean 'rating' per 'productID'
    average_user_rating     mean 'rating' per 'userID'
    extreme_count_ratio     per 'userID', the share of that user's rows whose
                            'rating' is one of `extreme_ratings`
    nr_of_reviews           number of rows per 'userID'

  Parameters:
    data: DataFrame with the columns 'review', 'productID', 'userID', 'rating'.
    extreme_ratings: rating values that count as "extreme".

  About the default of `extreme_ratings`: the previous implementation compared
  with `x == (1.0 or 5.0)`. In Python `1.0 or 5.0` evaluates to `1.0`, so only
  1-star ratings were ever counted. The default keeps exactly that behaviour,
  because the dataset shown in this notebook carries the same values (a user
  with only 5.0 ratings has extreme_count_ratio 0.0) and the stored model was
  trained on it. Pass `extreme_ratings=(1.0, 5.0)` to count both ends of the
  scale once the training data has been regenerated the same way.

  The input frame is not modified; the returned frame has a fresh RangeIndex
  (it is the result of a merge).
  """
  # Work on a copy so the caller's DataFrame is not changed as a side effect.
  data = data.copy()
  data["review_length"] = data['review'].str.split().str.len()
  data["average_product_rating"] = data.groupby('productID')['rating'].transform('mean')
  data["average_user_rating"] = data.groupby('userID')['rating'].transform('mean')
  # The mean of a boolean flag per user equals (number of extreme rows) / (number of rows).
  is_extreme = data['rating'].isin(list(extreme_ratings))
  extreme_count = is_extreme.groupby(data['userID']).mean().reset_index(name="extreme_count_ratio")
  data = pd.merge(data, extreme_count, how='left', on = 'userID')
  data["nr_of_reviews"] = data.groupby('userID')["userID"].transform('count')
  return data

In [None]:
# convert input data to features (input for the model)
def features_conversion(data):
  """Turn a review DataFrame into the sparse feature matrix used by the model.

  Rows containing any NaN are dropped, the 'review' text is lower-cased and
  stripped of punctuation, and the stored vectorizer ("vectorizer.pickle" in
  the model folder) transforms it. The 'review_length' and
  'extreme_count_ratio' columns are appended as two extra feature columns.
  Returns a scipy CSR matrix.

  NOTE(review): because rows with NaN are dropped here, the result can have
  fewer rows than `data`; callers have to keep their label vector aligned.
  """
  os.chdir('/content/gdrive/Shareddrives/Minecraft/Our_Models/LogisticRegression/')
  # drop NaN rows
  complete_rows = data.dropna()
  complete_rows.loc[:, 'review'] = complete_rows['review'].str.lower()
  complete_rows['review'] = complete_rows['review'].apply(remove_punctuation)
  fitted_vectorizer = joblib.load("vectorizer.pickle")
  text_block = fitted_vectorizer.transform(complete_rows['review'])
  row_count = text_block.shape[0]
  # Each numeric column becomes a single-column sparse block.
  length_block = scipy.sparse.csr_matrix(
      np.array(list(complete_rows.review_length)).reshape(row_count, 1))
  ratio_block = scipy.sparse.csr_matrix(
      np.array(list(complete_rows.extreme_count_ratio)).reshape(row_count, 1))
  # Column order: TF-IDF features, review length, extreme-rating ratio.
  stacked = scipy.sparse.hstack((text_block, length_block, ratio_block))
  return scipy.sparse.csr_matrix(stacked)

### Convert data to features (compatible with feature space)

In [None]:
# Convert the training data to features.
# NOTE: x_train is rebound from a DataFrame to a sparse matrix here, so this cell
# cannot be re-run without re-running the dataset split first.
x_train = features_conversion(x_train)

In [None]:
# Convert the testing data to features.
# NOTE: x_test is rebound from a DataFrame to a sparse matrix here, so this cell
# cannot be re-run without re-running the dataset split first.
x_test = features_conversion(x_test)

### Training classifier and export model

In [None]:
# to train (and store) classifier model
def train_classifier(clf, X_train, y_train, store=False,name=None):
    ''' Fits a classifier to the training data and optionally stores it.

    Parameters:
        clf: estimator object exposing fit(X, y).
        X_train, y_train: training features and targets passed on to clf.fit.
        store: when truthy, the fitted classifier is written with joblib.dump.
        name: target file name for the stored model; required when store is truthy.

    Raises:
        ValueError: if store is truthy but no name was given. This is checked
        before training so a long fit is not wasted on a missing file name.

    Prints the training time in seconds; returns None.
    '''
    if store and not name:
        raise ValueError("train_classifier: 'name' is required when store=True")

    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()

    if store:
        joblib.dump(clf, name)
    # Print the results
    print ("Trained model in {:.4f} seconds".format(end - start))

In [None]:
# Train a logistic regression classifier on the training features and store the fitted
# model as "finalized_model.sav" in the shared model folder.
# NOTE(review): warnings are suppressed globally in this notebook, so a solver
# convergence warning (if any) would not be visible here.
os.chdir('/content/gdrive/Shareddrives/Minecraft/Our_Models/LogisticRegression/')
train_classifier(LogisticRegression(), x_train,y_train,True,"finalized_model.sav")

Trained model in 3.0388 seconds


In [None]:
# Load the stored model from the shared model folder.
# NOTE: joblib.load unpickles the file; only load model files written by this notebook.
os.chdir('/content/gdrive/Shareddrives/Minecraft/Our_Models/LogisticRegression/')
final_model = joblib.load("finalized_model.sav")

### Classification report on test set

In [None]:
# Generate and show the classification report (precision / recall / F1 per label)
# of the loaded model's predictions on the test features.
print(classification_report(y_test, final_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.67      0.71      0.69     15953
           1       0.69      0.66      0.68     16047

    accuracy                           0.68     32000
   macro avg       0.68      0.68      0.68     32000
weighted avg       0.68      0.68      0.68     32000



### Processing production test set

In [None]:
# Path of the CSV file containing the production test set.
# NOTE: the name test_1814 is rebound to the loaded DataFrame in the next cell.
test_1814 = "/content/gdrive/Shareddrives/Minecraft/Datasets/8April_sample_production_set.csv"

In [None]:
# Read in the file. test_1814 changes from a path string to a DataFrame here, so this
# cell cannot be re-run on its own; re-run the path cell above first.
test_1814 = pd.read_csv(test_1814)

In [None]:
# Mapping used to re-encode the labels: -1 becomes 0, 1 stays 1.
encode_label = {-1 : 0, 1 : 1}

In [None]:
# Apply the label encoding to the 'label' column.
# NOTE: values that are not keys of encode_label become NaN, so running this cell a
# second time would turn the already encoded 0 labels into NaN.
test_1814['label'] = test_1814['label'].map(encode_label)

In [None]:
# Check the distribution of the target label in the production test set.
test_1814.value_counts("label")

label
1    167
0     33
dtype: int64

In [None]:
# Add the derived columns so the production set is interpretable by the model
# (the columns are computed from the production set itself).
test_1814 = adding_columns(test_1814)

In [None]:
# Convert the dataframe to features.
# NOTE(review): features_conversion drops rows containing NaN, so the feature matrix
# only lines up with test_1814["label"] when the frame has no missing values.
test_1814_features = features_conversion(test_1814)

### Classification report on production test set

In [None]:
# Generate and show the classification report of the loaded model on the production
# test set, comparing its predictions against the encoded 'label' column.
print(classification_report(test_1814["label"], final_model.predict(test_1814_features)))

              precision    recall  f1-score   support

           0       0.28      0.64      0.39        33
           1       0.90      0.68      0.78       167

    accuracy                           0.68       200
   macro avg       0.59      0.66      0.59       200
weighted avg       0.80      0.68      0.71       200

