In [11]:
import altair as alt
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

# Notebook-wide settings.
RANDOM_SEED = 42              # shared seed so every sampling step is repeatable
LARGE_DATASET_SIZE = 100_000  # number of rows drawn for the large corpus

# Lift Altair's default row cap so charts can be built from the full DataFrame.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [12]:
def corpus_to_csv(df: pd.DataFrame, filename: str) -> None:
    """Write the review text and rating columns of ``df`` to a CSV file.

    Only the ``reviewText`` and ``overall`` columns are kept. They are written
    under the headers ``reviews`` and ``ratings`` respectively, without the
    DataFrame index. The input DataFrame is not modified.

    Args:
        df: Reviews; must contain ``reviewText`` and ``overall`` columns.
        filename: Destination path of the CSV file.

    Raises:
        KeyError: If ``df`` lacks either required column.
    """
    # `columns=` is required: a bare mapping passed to rename() is applied to
    # the row index, which would silently leave the column names unchanged.
    corpus = df[['reviewText', 'overall']].rename(
        columns={'reviewText': 'reviews', 'overall': 'ratings'}
    )
    corpus.to_csv(filename, index=False)

In [13]:
# Load the raw reviews; lines=True reads the file as one JSON record per line.
df = pd.read_json(
    '../data/Video_Games_5.json',
    lines=True,
    orient='records',
)

In [None]:
# Bar chart: number of reviews per "overall" rating in the full dataset.
alt.Chart(df).mark_bar().encode(
    x=alt.X('overall'),
    y=alt.Y('count()'),
).interactive()

Machine learning models generally train best on balanced datasets, i.e. datasets with an equal number of rows in each category. Otherwise the model may learn a bias: e.g. if 90% of rows are 5-star reviews, it might simply predict 5 stars for every review regardless of the actual sentiment. Some models are more resistant to this bias than others; decision trees and random forests deal with imbalanced data well. This article explains the problem - https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18

For other types of model, one solution is to create a balanced training dataset. This can be done by either adding rows or removing rows in a particular category.

* Oversampling means increasing the number of rows by including the same row more than once.
* Synthetic Minority Oversampling Technique (SMOTE) creates synthetic extra rows using a nearest-neighbours algorithm.
* Undersampling means dropping rows. This works well when the dataset is large.


# Data format

* "overall" - score. Integer [1..5]
* "verified" - Has the review been verified. Boolean
* "reviewerID" - Unique identifier for the reviewer. String.
* "asin" - Amazon Standard Identification Number, i.e. the ID of the reviewed product. String
* "reviewTime" - Date of review. Date format "MM DD, YYYY"
* "reviewerName" - Name of the reviewer. String
* "reviewText" - The review. String
* "summary" - Summary of the review. String.
* "unixReviewTime" - Timestamp for review. Unix Epoch format

# Create and save small balanced dataset

In [5]:
# Number of rows to keep for each "overall" rating.
strategy = {
    1: 1500,
    2: 500,
    3: 500,
    4: 500,
    5: 1500,
}
under_sampler = RandomUnderSampler(
    random_state=RANDOM_SEED,
    sampling_strategy=strategy,
)
# fit_resample returns a (rows, labels) pair; only the resampled rows are kept.
small_df, _ = under_sampler.fit_resample(df, df['overall'])

In [None]:
# Bar chart: number of reviews per "overall" rating after under-sampling.
alt.Chart(small_df).mark_bar().encode(
    x=alt.X('overall'),
    y=alt.Y('count()'),
).interactive()

In [7]:
# Persist the under-sampled reviews.
corpus_to_csv(
    df=small_df,
    filename='../data/small_corpus.csv',
)

# Create and save large unbalanced dataset

In [14]:
# Seed NumPy's global RNG so the same rows are drawn on every run.
np.random.seed(seed=RANDOM_SEED)
# Pick row positions WITHOUT replacement so no review appears twice in the
# corpus (np.random.randint samples with replacement and would repeat rows).
# The size is capped at len(df) because a draw without replacement cannot
# return more rows than exist.
random_indexes = np.random.choice(
    len(df),
    size=min(LARGE_DATASET_SIZE, len(df)),
    replace=False,
)
large_df = df.iloc[random_indexes]

In [None]:
# Bar chart: number of reviews per "overall" rating in the large random sample.
alt.Chart(large_df).mark_bar().encode(
    x=alt.X('overall'),
    y=alt.Y('count()'),
).interactive()

In [None]:
# Persist the large random sample.
corpus_to_csv(
    df=large_df,
    filename='../data/large_corpus.csv',
)