In [None]:
! pip install -U npc-gzip

In [None]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from npc_gzip.compressors.gzip_compressor import GZipCompressor
from npc_gzip.knn_classifier import KnnClassifier

import pandas as pd
import os

# Widen the text display so wide dataframes are shown on fewer wrapped lines
pd.options.display.width = 1000


In [None]:
# Preprocessing the data

file = './yelp_academic_dataset_review'

# Start from a clean slate: remove the CSV left over from a previous run.
# Only a missing file is ignored; any other failure (e.g. permissions) surfaces.
try:
    os.remove(f"{file}.csv")
except FileNotFoundError:
    pass

# Read the first 1 million rows
df = pd.read_json(f'{file}.json', lines=True, nrows=1_000_000)

# Build the cleaned frame as a new object (no in-place edits on a filtered
# slice, which would trigger SettingWithCopyWarning):
#   * keep only reviews longer than 100 characters to ensure high quality reviews
#   * drop unnecessary columns to save space and time
#   * map [0,5] stars to negative (0), neutral (1), positive (2)
df = (
    df[df['text'].str.len() > 100]
    .drop(columns=['review_id', 'date', 'user_id', 'business_id'])
    .assign(sentiment=lambda frame: frame['stars'].map({0 : 0, 1 : 0, 2 : 0, 3 : 1, 4 : 2, 5 : 2}))
)

# Replace newlines and carriage returns in the text with a single space, then
# collapse every run of spaces (not only exact pairs) into one space.
df['text'] = (
    df['text']
    .str.replace(r'[\r\n]+', ' ', regex=True)
    .str.replace(r' {2,}', ' ', regex=True)
)

# Duplicate rows based on the 'useful' voted reviews to bias the model towards helpful reviews.
# Each review is written `useful + 1` times; the count is clipped at 0 so a
# negative vote count can neither drop the review nor make `repeat` raise.
repeat_counts = df['useful'].clip(lower=0) + 1
useful_duplicated = df.reindex(df.index.repeat(repeat_counts)).reset_index(drop=True)
useful_duplicated.to_csv(f'{file}.csv', header=True, index=False, mode='w')
useful_duplicated.head()

In [None]:
# Sanity check of the generated CSV: total number of rows, then a 10-row preview
csv_path = f'{file}.csv'
print(len(pd.read_csv(csv_path)))
pd.read_csv(csv_path, nrows=10)

In [None]:
from typing import Sequence


def get_data(n_samples: int):
    """
    Pulls the Yelp dataset from the local file system
    and returns the training and test datasets as tuples.
    Each contains the text and labels as np arrays.

    The CSV written by the preprocessing cell contains repeated copies of a
    review (one extra copy per 'useful' vote). To keep the evaluation honest,
    the split is made on distinct review texts, so every copy of a review
    ends up on the same side and no test text also appears in the training
    data.

    Arguments:
        n_samples (int): Number of CSV rows to load.

    Returns:
        tuple: ``((train_text, train_labels), (test_text, test_labels))``,
        each element a numpy array.
    """
    df = pd.read_csv(f'{file}.csv', nrows=n_samples)
    print('number of samples loaded', len(df))

    # Hold out 20% of the *distinct* texts; a row-wise random split would put
    # identical duplicated reviews into both train and test (label leakage).
    distinct_text = df['text'].drop_duplicates()
    _, held_out_text = train_test_split(distinct_text, test_size=0.2, random_state=1)
    is_test = df['text'].isin(held_out_text)

    train_df, test_df = df[~is_test], df[is_test]

    train = train_df['text'].to_numpy(), train_df['sentiment'].to_numpy()
    test = test_df['text'].to_numpy(), test_df['sentiment'].to_numpy()

    return train, test

def fit_model(train_text: np.ndarray, train_labels: np.ndarray, distance_metric: str = "ncd") -> KnnClassifier:
    """ Builds a KnnClassifier backed by a GZipCompressor over the train data.
    Arguments:
        train_text (np.ndarray): Training dataset as a numpy array.
        train_labels (np.ndarray): Training labels as a numpy array.
        distance_metric (str): Forwarded unchanged to KnnClassifier as its
            `distance_metric` argument. Defaults to "ncd".

    Returns: KnnClassifier: Classifier holding the training data, ready to make predictions.
    """
    gzip_compressor = GZipCompressor()
    classifier = KnnClassifier(
        compressor=gzip_compressor,
        distance_metric=distance_metric,
        training_inputs=train_text,
        training_labels=train_labels,
    )
    return classifier

In [None]:
print("Fetching data...")
(train_text, train_labels), (test_text, test_labels) = get_data(n_samples=100_000)

print("Fitting model...")
model = fit_model(train_text, train_labels)

# Visit the whole test split in a shuffled order. The generator is seeded so
# the same order is drawn on every run of the notebook.
random_indicies = np.random.default_rng(1).choice(test_text.shape[0], len(test_labels), replace=False)
print(random_indicies.shape)
sample_test_text = test_text[random_indicies]
sample_test_labels = test_labels[random_indicies]

print("Generating predictions...")
# Here we use the `sampling_percentage` to save time at the expense of worse predictions.
# This `sampling_percentage` selects a random % of training data to compare `sample_test_text`
# against rather than comparing it against the entire training dataset.
# NOTE(review): that sampling happens inside `predict` and is not seeded here,
# so the accuracy may still vary between runs -- confirm against npc-gzip.
(distances, labels, similar_samples) = model.predict( sample_test_text, top_k=1, sampling_percentage=0.1 )

print("Final Accuracy = ", accuracy_score(sample_test_labels, labels.reshape(-1)))

In [None]:
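# Save the evaluation state (sampled indices, distances, predicted and true labels) to a file.
# Note: the KNN model itself is not part of the saved state.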
import pickle

# Everything needed to revisit this evaluation run later: which test rows
# were used, the prediction outputs, and the matching true labels.
program_state = dict(
    random_indicies=random_indicies,
    distances=distances,
    labels=labels,
    sample_test_labels=sample_test_labels,
)

def save_state(filename, data):
    """Pickle `data` into the file at `filename`, overwriting any existing file."""
    with open(filename, 'wb') as sink:
        pickle.dump(data, sink)

def load_state(filename):
    """Read the file at `filename` and return the object pickled in it."""
    # Unpickling can execute arbitrary code: only load files you created yourself.
    with open(filename, 'rb') as source:
        restored = pickle.load(source)
    return restored

# Persist the collected state next to the notebook
save_state(filename='program_state.pkl', data=program_state)

In [None]:
# Load the state from the file and put each saved entry back under its original name
loaded_state = load_state('program_state.pkl')

random_indicies, distances, labels, sample_test_labels = (
    loaded_state[key]
    for key in ('random_indicies', 'distances', 'labels', 'sample_test_labels')
)


In [None]:
print("Fetching data...")
(train_text, train_labels), (test_text, test_labels) = get_data(n_samples=100_000)

print("Fitting model...")
model = fit_model(train_text, train_labels)

# Reuse the previously chosen test indices instead of drawing new ones.
# They are only meaningful for a test split of the same size, so fail early
# with a clear message rather than with an IndexError further down.
print(random_indicies.shape)
assert random_indicies.size == 0 or random_indicies.max() < test_text.shape[0], \
    "restored indices do not fit the current test split"
sample_test_text = test_text[random_indicies]
sample_test_labels = test_labels[random_indicies]

print("Generating predictions...")
# Here we use the `sampling_percentage` to save time at the expense of worse predictions.
# This `sampling_percentage` selects a random % of training data to compare `sample_test_text`
# against rather than comparing it against the entire training dataset.
# Note that this overwrites `distances` and `labels` with the fresh predictions.
(distances, labels, similar_samples) = model.predict( sample_test_text, top_k=1, sampling_percentage=0.1 )

print("Final Accuracy = ", accuracy_score(sample_test_labels, labels.reshape(-1)))