<a href="https://colab.research.google.com/github/poffertje/TextMining/blob/master/code/fake_classifier/FeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering

## Mounting the Drive (Google Colab)

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [1]:
import re
import string
import warnings
import numpy as np
import pandas as pd
import scipy as scipy

from pathlib import Path

# notebook-wide display settings: truncate long frames, print compact floats
pd.set_option("display.max_rows", 15)
np.set_printoptions(suppress=True, precision=4)

# silence every warning for the rest of the session
warnings.filterwarnings('ignore')

## Resolving Paths

### Local Repository

In [7]:
# Path().resolve() is the absolute current working directory; this is the
# notebook's own folder when the kernel is started from there
CUR_DIR = Path().resolve()
# use this for local repository: "datasets" sits two levels above CUR_DIR
PATH_TO_DATASETS = CUR_DIR.parents[1] / "datasets"
print(PATH_TO_DATASETS)
print("Does path exist? ->", PATH_TO_DATASETS.exists())

# same for colab and local repository
PATH_TO_YELP = PATH_TO_DATASETS / "processed_yelp.csv"
print(PATH_TO_YELP)
print("Does path exist? ->", PATH_TO_YELP.exists())

C:\Users\lmps\github\TextMining2\datasets
Does path exist? -> True
C:\Users\lmps\github\TextMining2\datasets\processed_yelp.csv
Does path exist? -> True


# **Data processing**

### Importing and reading the dataset

In [8]:
# load the processed Yelp reviews CSV into a DataFrame
yelp_merged = pd.read_csv(filepath_or_buffer=PATH_TO_YELP)

In [9]:
# display to see structure
# (the rendered table is truncated by the pandas max_rows display option set at the top of the notebook)
display(yelp_merged)

Unnamed: 0,userID,productID,rating,label,date,review,sentiment label
0,5045,0,1.0,-1,2014-09-08,This was the worst experience I've ever had a ...,0
1,5046,0,3.0,-1,2013-10-06,This is located on the site of the old Spruce ...,0
2,5047,0,5.0,-1,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...,1
3,5048,0,5.0,-1,2014-08-28,I love Toast! The food choices are fantastic -...,1
4,5049,0,5.0,-1,2013-07-16,The egg on an English muffin (their take on eg...,1
...,...,...,...,...,...,...,...
598412,119664,5039,4.0,1,2013-01-20,When I first moved to the area I must say I wa...,1
598413,56277,5039,2.0,1,2012-11-12,Kind of pricey. I guess I expected a ridiculou...,0
598414,265320,5039,1.0,1,2012-08-22,"Stopped by this restaurant yesterday, we just ...",0
598415,161722,5039,4.0,1,2011-05-11,Finally checked out The Best Subs in Claremont...,1


### Defining helper methods

In [10]:
# helper function for removing punctuation
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # mapping a character to None in a translation table deletes it
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

In [11]:
# count number of exclamation marks
def count_exlaim(review):
    """Return how many '!' characters occur in ``review``.

    Args:
        review: the review text (a ``str``).

    Returns:
        int: the number of exclamation marks; 0 for an empty string.
    """
    # str.count does the whole scan in one built-in call instead of an index loop
    return review.count('!')

# count number of capital words
def count_caps(review):
    """Count the whitespace-separated tokens of ``review`` for which ``str.isupper()`` is true."""
    # booleans add up as 0/1, so the sum is the number of upper-case tokens
    return sum(token.isupper() for token in review.split())


### Dataframe refinement and processing

In [12]:
# drop every row that contains at least one missing value
yelp_merged = yelp_merged.dropna()

In [13]:
# remove the test restaurant (productID 1814) and renumber the remaining rows from 0
yelp_merged = yelp_merged[yelp_merged['productID'].ne(1814)].reset_index(drop=True)

In [14]:
# encoding the labels from -1 and 1 to 0 and 1
encode_label = {
    -1: 0,  # original label -1 becomes class 0
    1: 1,   # original label 1 stays class 1
}

In [15]:
# replace the raw -1/1 labels with their 0/1 encoding
# NOTE: run once per kernel - values that are not keys of encode_label become NaN,
# so mapping the already-encoded column again would turn the 0s into NaN
yelp_merged = yelp_merged.assign(label=yelp_merged['label'].map(encode_label))

In [16]:
# class balance check: number of rows per encoded label
yelp_merged.value_counts("label")

label
1    518921
0     79496
dtype: int64

In [17]:
# check for empty reviews (rows whose review is exactly the empty string)
yelp_merged[yelp_merged["review"].eq('')]

Unnamed: 0,userID,productID,rating,label,date,review,sentiment label


In [18]:
# add review length column: number of whitespace-separated tokens in each review
yelp_merged = yelp_merged.assign(
    review_length=yelp_merged['review'].str.split().str.len()
)

In [19]:
# add average product rating column: mean rating of the reviewed product, repeated on each of its rows
yelp_merged = yelp_merged.assign(
    average_product_rating=yelp_merged.groupby('productID')['rating'].transform('mean')
)

In [20]:
# add average user rating column: mean rating given by the reviewing user, repeated on each of their rows
yelp_merged = yelp_merged.assign(
    average_user_rating=yelp_merged.groupby('userID')['rating'].transform('mean')
)

In [21]:
# count number of instances per user (one row per userID, count stored as float)
nr_rows = (
    yelp_merged
    .groupby('userID')
    .size()
    .astype(float)
    .reset_index(name="nr of rows")
)

In [22]:
# get the number of extreme reviews (1 or 5 stars) per user
# NOTE: the column is named "extreme_count_ratio" although it holds a raw count at
# this point; the next cell divides it by the user's number of reviews.
# `isin` is required here: `x == (1.0 or 5.0)` collapses to `x == 1.0`, because
# `1.0 or 5.0` evaluates to its first truthy operand, so 5-star reviews were never counted.
extreme_count = (
    yelp_merged.groupby('userID')['rating']
    .apply(lambda ratings: ratings.isin([1.0, 5.0]).sum())
    .reset_index(name="extreme_count_ratio")
)

In [23]:
# turn the per-user count of extreme reviews into a ratio of that user's reviews
# (positional division: relies on nr_rows and extreme_count listing the users in the same order)
extreme_count["extreme_count_ratio"] = (
    extreme_count["extreme_count_ratio"].astype(float) / nr_rows["nr of rows"].to_numpy()
)

In [24]:
# attach each user's extreme-review ratio to every one of their rows (left join on userID)
yelp_merged = yelp_merged.merge(extreme_count, on='userID', how='left')

In [25]:
# add number of reviews per user to the dataset (repeated on each of the user's rows)
yelp_merged = yelp_merged.assign(
    nr_of_reviews=yelp_merged.groupby('userID')["userID"].transform('count')
)

In [26]:
# add the number of exclamation marks and the number of all-caps words per review
yelp_merged = yelp_merged.assign(
    exclaim_cnt=yelp_merged['review'].map(count_exlaim),
    all_cap=yelp_merged['review'].map(count_caps),
)

### Final dataset creation and export

In [27]:
# to create class balance in the datasets, take up to the first 80k non-empty reviews of each label
# (label 0 has only 79,496 rows in the counts shown above, so sub_0 ends up below 80k)
sub_0 = (
    yelp_merged[yelp_merged["review"].ne('') & yelp_merged["label"].eq(0)]
    .iloc[:80000]
    .reset_index(drop=True)
)
sub_1 = (
    yelp_merged[yelp_merged["review"].ne('') & yelp_merged["label"].eq(1)]
    .iloc[:80000]
    .reset_index(drop=True)
)

In [28]:
# stack the two per-label subsets (label 0 first, then label 1) under a fresh 0..n-1 index
final_classifier_dataset = pd.concat([sub_0, sub_1], ignore_index=True)

In [29]:
# additional check for empty cells: show the rows that still contain a missing value
final_classifier_dataset.loc[final_classifier_dataset.isna().any(axis="columns")]

Unnamed: 0,userID,productID,rating,label,date,review,sentiment label,review_length,average_product_rating,average_user_rating,extreme_count_ratio,nr_of_reviews,exclaim_cnt,all_cap


In [30]:
# export the final dataset so it is easily accessible (written next to the input CSV, without the index)
final_classifier_dataset.to_csv(
    PATH_TO_DATASETS / "classifier_sample.csv",
    index=False,
    encoding="utf-8",
)