In [232]:
"""
Standard Python notebook imports

"""

import os
from os.path import join

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

from modules.analyzer import features

In [221]:
"""
Load data

TODO fix ugly .to_numpy().reshape(-1)

"""

# Directory holding the parquet splits. Built with the `join` helper that the
# imports cell already provides (the same helper used for the file paths below),
# so this cell does not depend on a bare `os` name being in scope.
path = join('data', 'parquet')

# Raw inputs: features_from_raw reads a 'values' column from these frames.
x_train_raw = pd.read_parquet(join(path, 'train_values.parquet'))
# Labels are flattened to a 1-D array so they can be sliced with [:n] and passed
# straight to scikit-learn (presumably a single-column frame -- TODO confirm).
y_train = pd.read_parquet(join(path, 'train_labels.parquet')).to_numpy().reshape(-1)

x_test_raw = pd.read_parquet(join(path, 'test_values.parquet'))
y_test = pd.read_parquet(join(path, 'test_labels.parquet')).to_numpy().reshape(-1)

In [166]:
"""
Parses a raw string into an array
 
Some strings, such as descriptions, can be quite lengthy. 
To improve performance, only the first max_len characters
of the string are kept.
 
"""

def formatted_to_numpy(value, max_len = 5000):
    """Split a raw formatted string into a NumPy array of string items.

    The string is first cut to its first ``max_len`` characters. The first two
    and last two characters of the cut string are then dropped (presumably the
    ``['`` / ``']`` wrapper of a stringified list -- note that for a string
    longer than ``max_len`` the two trailing characters dropped are ordinary
    content), and the remainder is split on the ``', '`` separator.

    Args:
        value: Raw string to parse.
        max_len: Maximum number of leading characters of ``value`` to consider.

    Returns:
        np.ndarray: 1-D array holding the separated items.
    """
    separator = "', '"
    clipped = value[:max_len]
    inner = clipped[2:-2]
    items = inner.split(separator)
    return np.array(items)

In [167]:
"""
Runs a feature extractor on each value in a raw DataFrame
The values are taken from the 'values' column. 

Note that the process of running the feature extractor on
all data (400k+ columns) can take a significant amount of time. 

To facilitate testing and experimentation with the model,
a smaller sample size can be set to reduce the features extraction time.

"""

def features_from_raw(raw, sample_size = 1000):
    """Extract features for the first ``sample_size`` entries of ``raw['values']``.

    Each raw string is parsed with ``formatted_to_numpy`` and passed to the
    ``features`` extractor; a tqdm progress bar tracks the loop.

    Args:
        raw: Object indexable by ``'values'`` whose result exposes
            ``.to_numpy()`` (a pandas DataFrame in this notebook).
        sample_size: Number of leading rows to process.

    Returns:
        np.ndarray: The per-row ``features`` results stacked with ``np.array``
        (presumably one fixed-length feature vector per row -- TODO confirm).
    """
    head = raw['values'].to_numpy()[:sample_size]

    rows = []
    for raw_value in tqdm(head):
        parsed = formatted_to_numpy(raw_value)
        rows.append(features(parsed))

    return np.array(rows)

In [252]:
"""
Parse data and extract features

The extracted features can be saved to or loaded from "./cached/x_train_features.npy".
Please note that re-running the extractor will overwrite the existing save.

"""

cachedpath = './cached/x_train_features.npy'

# Recompute by default; skip only when a cache file exists AND the user
# explicitly answers 'y' to loading it.
flag = True
if os.path.isfile(cachedpath):
    user_choice = input("File with features exists, do you want to load it? (y/n)")
    if user_choice == 'y':
        x_train = np.load(cachedpath)
        flag = False

if flag:
    x_train = features_from_raw(x_train_raw, sample_size=1000)

    # Write to the very path that is checked/loaded above. Previously the array
    # was saved as 'x_train_features.npy' in the working directory, so the cache
    # under ./cached was never created or refreshed.
    os.makedirs(os.path.dirname(cachedpath), exist_ok=True)
    np.save(cachedpath, x_train)


File with features exists, do you want to load it? (y/n)y


In [253]:
"""
Train model

The model can be saved to or loaded from "./cached/trained.joblib".
Please note that retraining the model will overwrite the existing save.

TODO research optimal number of trees in a forest
TODO add plots (there are currently no graphs at all)

"""

cachedpath = './cached/trained.joblib'

# Retrain by default; skip only when a saved model exists AND the user
# explicitly answers 'y' to loading it.
flag = True
if os.path.isfile(cachedpath):
    user_choice = input("File with model weights exists, do you want to load it? (y/n)")
    if user_choice == 'y':
        # NOTE: joblib.load unpickles the file -- only load caches you created yourself.
        model = load(cachedpath)
        flag = False

if flag:
    model = RandomForestClassifier(n_estimators=200, random_state=0)

    # Features may have been extracted for a leading sample only, so the labels
    # are cut to the same number of rows.
    n = x_train.shape[0]
    model.fit(x_train, y_train[:n])

    # Persist to the very path that is checked/loaded above. Previously the model
    # was dumped to 'RandomForest.joblib', so ./cached/trained.joblib was never
    # created or refreshed.
    os.makedirs(os.path.dirname(cachedpath), exist_ok=True)
    dump(model, cachedpath)


File with model weights exists, do you want to load it? (y/n)y


In [254]:
"""
Check the train accuracy for the model.
Accuracy should be close to 1 as we are using random forest here

TODO do something with that n and ugly [:n]

"""

# Number of rows that actually have features; labels are cut to match.
n = len(x_train)

train_predictions = model.predict(x_train)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value
# is the same as with the arguments swapped.
accuracy = accuracy_score(y_train[:n], train_predictions)
print(f"Train accuracy: {accuracy}")

Train accuracy: 0.9863732775483063


In [255]:
"""
Check the test accuracy.

Please note that only the first 5000 test samples are evaluated.

"""

# Raw test strings as an array (kept as a module-level name, as before).
values_test = x_test_raw['values'].to_numpy()

# features_from_raw indexes raw['values'] itself, so it must receive the
# DataFrame. Passing the already-extracted ndarray (values_test) fails on that
# lookup when the notebook is run top to bottom.
x_test = features_from_raw(x_test_raw, sample_size=5000)
n = x_test.shape[0]

# accuracy_score takes (y_true, y_pred); labels are cut to the rows that were
# actually featurized.
accuracy = accuracy_score(y_test[:n], model.predict(x_test))
print(f"Test accuracy: {accuracy}")

  0%|          | 0/5000 [00:00<?, ?it/s]

Test accuracy: 0.8694


In [241]:
"""
This code performs manual checks on the prediction results.

This block prints the model's predictions on the first 100 test samples
and outputs data in a fancy way.

It can be helpful in identifying poorly predicted types and gaining insights 
into the model's behavior.

"""

def top_classes(preds, k=3, classes=None):
    """Return the ``k`` most probable classes for the first row of ``preds``.

    Args:
        preds: 2-D array of class probabilities (as returned by
            ``predict_proba``); only row 0 is inspected.
        k: How many of the highest-probability classes to return. Defaults to
            3, matching the previous hard-coded behaviour.
        classes: Class labels aligned with the columns of ``preds``. When
            omitted, the notebook-level ``model.classes_`` is used, which keeps
            existing ``top_classes(preds)`` calls working unchanged.

    Returns:
        dict: ``{class_label: probability}`` ordered from most to least probable.
    """
    if classes is None:
        # Backward-compatible default: labels of the globally trained model.
        classes = model.classes_

    row = preds[0]
    # argsort is ascending; reverse it and keep the first k indices.
    top_indices = np.argsort(row)[::-1][:k]

    top_types = np.asarray(classes)[top_indices]
    top_probs = row[top_indices]

    return dict(zip(top_types, top_probs))

# Show at most 100 samples, but never more than were actually featurized, so a
# smaller sample_size does not end in an IndexError.
n_shown = min(100, len(x_test))

# Loop-invariant: convert the raw column once instead of on every iteration.
raw_values = x_test_raw['values'].to_numpy()

for i in range(n_shown):
    # predict_proba expects a 2-D input, hence the single-row reshape.
    preds = model.predict_proba(x_test[i].reshape(1, -1))
    top = top_classes(preds)
    # Skip the first character of the raw string and show a short preview.
    value_sample = raw_values[i][1:60]
    
    print("Values    : ", value_sample, ". . .")
    print("Predicted : ", top)
    print("Truth     : ", y_test[i], '\n')


Values    :  'Central Missouri', 'unattached', 'unattached', 'Kansas Sta . . .
Predicted :  {'affiliation': 0.345, 'region': 0.055, 'city': 0.04}
Truth     :  affiliation 

Values    :  95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, 84, 90, 76, 93 . . .
Predicted :  {'plays': 0.2504166666666666, 'rank': 0.15125, 'weight': 0.125}
Truth     :  weight 

Values    :  'Katie Crews', 'Christian Hiraldo', 'Alex Estrada', 'Fredy  . . .
Predicted :  {'jockey': 0.97, 'name': 0.015, 'person': 0.005}
Truth     :  jockey 

Values    :  'Christian', 'Non-Christian', 'Unreported', 'Jewish', 'Athe . . .
Predicted :  {'city': 0.115, 'religion': 0.07, 'type': 0.065}
Truth     :  religion 

Values    :  'AAF-McQuay Canada Inc.', 'AAF-McQuay Canada Inc.', 'Abilit . . .
Predicted :  {'company': 0.455, 'album': 0.09, 'name': 0.07}
Truth     :  company 

Values    :  '05 - 08', '04 - 08', '06 - 09', '05 - 08', '06 - 08', '06  . . .
Predicted :  {'grades': 1.0, 'depth': 0.0, 'continent': 0.0}
Truth     :  grade

Values    :  'Hardcover', 'Ebook'] . . .
Predicted :  {'format': 1.0, 'year': 0.0, 'depth': 0.0}
Truth     :  format 

Values    :  'SCMS', 'TOC', 'SCAQ', 'CFM', 'GOLD', 'LINC', 'SWOM', 'RINC . . .
Predicted :  {'club': 0.995, 'team': 0.005, 'year': 0.0}
Truth     :  club 

Values    :  'Lichtenvoorde', 'Vancouver', 'Toronto', 'Montreal', 'Montr . . .
Predicted :  {'city': 0.345, 'category': 0.05, 'type': 0.045}
Truth     :  city 

Values    :  'Metallica', 'Metallica', 'Metallica', 'Metallica', 'Metall . . .
Predicted :  {'artist': 0.88, 'album': 0.1, 'product': 0.005}
Truth     :  artist 

Values    :  'OFF MEDOC ST', '155 MAYFLOWER ST', ''] . . .
Predicted :  {'address': 0.87, 'county': 0.015, 'result': 0.015}
Truth     :  address 

Values    :  'TO/40', 'TO/40p', 'PF/40', 'PF/40p'] . . .
Predicted :  {'category': 0.815, 'code': 0.045, 'product': 0.025}
Truth     :  category 

Values    :  'N/A', 'N/A', 'ATH', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/ . . .
Predicted :  {'position': 1.

In [256]:
"""
This is where you can test the model on your own custom samples!
Enjoy experimenting with the model :D

Please be aware that the performance of the model may be poor 
when working with a small number of samples, as it relies on 
statistics collected from a large dataset.

"""

# Each inner list is one custom sample: a collection of string values whose
# type the model should guess. dtype=object keeps the differently sized lists
# as separate entries.
your_samples = np.array(
    [
        ["Alice", "Bob", "Donald", "Duda", "Petro", "Jacek"],
        ["12/21/2004"],
        ["Ukraine", "Poland", "China"],
    ],
    dtype=object,
)

for sample in your_samples:
    # One feature vector per sample, shaped as a single-row 2-D input.
    feature_row = features(sample).reshape(1, -1)
    preds = model.predict_proba(feature_row)
    top = top_classes(preds)
    print(top)

{'person': 0.13, 'city': 0.11, 'credit': 0.095}
{'year': 0.735, 'day': 0.1, 'code': 0.065}
{'country': 0.31, 'city': 0.08, 'team': 0.08}
