In [1]:
"""
Standart Python notebook imports

"""

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from os.path import join
import os
from joblib import dump, load

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from modules.analyzer import features

In [2]:
"""
Load data

TODO fix ugly .to_numpy().reshape(-1)

"""

path = os.path.join('data','parquet')

x_train_raw = pd.read_parquet(join(path,'train_values.parquet'))
y_train = pd.read_parquet(join(path,'train_labels.parquet')).to_numpy().reshape(-1)

x_test_raw = pd.read_parquet(join(path,'test_values.parquet'))
y_test = pd.read_parquet(join(path,'test_labels.parquet')).to_numpy().reshape(-1)

In [3]:
"""
Parses a raw string into an array
 
Some strings, such as descriptions, can be quite lengthy. 
To improve performance only the first max_len elements 
of the string are taken.
 
"""

def formatted_to_numpy(value, max_len = 5000):
    value = value[:max_len]
    return np.array(value[2:-2].split("""', '"""))

In [4]:
"""
Runs a feature extractor on each value in a raw DataFrame
The values are taken from the 'values' column. 

Note that the process of running the feature extractor on
all data (400k+ columns) can take a significant amount of time. 

To facilitate testing and experimentation with the model,
a smaller sample size can be set to reduce the features extraction time.

"""

def features_from_raw(raw, sample_size = 5000):
    values = raw['values'].to_numpy()
    sample = values[:sample_size]
    return np.array([features(formatted_to_numpy(value)) for value in tqdm(sample)])

In [6]:
"""
Parse data and extract features

Data can be saved or loaded from the  "./cached/x_train_features.npy"
Please note that running the extractor will override existing save.

"""

cachedpath = './cached/x_train_features.npy'

flag = False
if os.path.isfile(cachedpath):
    user_choice = input("File with features exists, do you want to load it? (y/n)")
    if user_choice == 'y':
        x_train = np.load(cachedpath)
    else: flag = True
else: flag = True
        

if flag:
    x_train = features_from_raw(x_train_raw, sample_size=50)

    np.save(join('cached','x_train_features.npy'), x_train)


File with features exists, do you want to load it? (y/n)n


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
"""
Train model

Model can be saved or loaded from the  "./cached/RandomForest.joblib"
Please note that running the extractor will eventually override existing save.

TODO research optimal number of trees in a forest
TODO a lot of graphs wtf no graphs

"""

cachedpath = './cached/RandomForest.joblib'

flag = False
if os.path.isfile(cachedpath):
    user_choice = input("File with model weights exists, do you want to load it? (y/n)")
    if user_choice == 'y':
        model = load(cachedpath)
    else: flag = True
else: flag = True
        

if flag:
    model = RandomForestClassifier(n_estimators=200, random_state=0)

    n = x_train.shape[0]
    model.fit(x_train, y_train[:n])

    dump(model, join('cached','RandomForest.joblib'))


File with model weights exists, do you want to load it? (y/n)y


In [15]:
"""
Check the train accuracy for the model.
Accuracy should be close to 1 as we are using random forest here

TODO do something with that n and ugly [:n]

"""

n = x_train.shape[0]

accuracy = accuracy_score(model.predict(x_train), y_train[:n])
print(f"Train accuracy: {accuracy}")

Train accuracy: 1.0


In [16]:
"""
Check the test accuracy.

Please note that the 

"""

x_test = features_from_raw(x_test_raw, sample_size=1000)
n = x_test.shape[0]

accuracy = accuracy_score(model.predict(x_test), y_test[:n])
print(f"Test accuracy: {accuracy}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Test accuracy: 0.163


In [11]:
"""
This code performs manual checks on the prediction results.

This block the model's predictions on the first 100 test samples
and outputs data in a fancy way.

It can be helpful in identifying poorly predicted types and gaining insights 
into the model's behavior.

"""

def top_classes(preds):
    top_indices = np.argsort(preds[0])[::-1][:3]
    
    top_types = model.classes_[top_indices]
    top_probs = preds[0][top_indices]
    
    return dict(zip(top_types, top_probs))

for i in range(100):
    preds = model.predict_proba(x_test[i].reshape(1, -1))
    top = top_classes(preds)
    value_sample = x_test_raw['values'].to_numpy()[i][1:60]
    
    print("Values    : ", value_sample, ". . .")
    print("Predicted : ", top)
    print("Truth     : ", y_test[i], '\n')


Values    :  'Central Missouri', 'unattached', 'unattached', 'Kansas Sta . . .
Predicted :  {'area': 0.065, 'industry': 0.06, 'type': 0.06}
Truth     :  affiliation 

Values    :  95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, 84, 90, 76, 93 . . .
Predicted :  {'plays': 0.37, 'age': 0.22, 'rank': 0.11}
Truth     :  weight 

Values    :  'Katie Crews', 'Christian Hiraldo', 'Alex Estrada', 'Fredy  . . .
Predicted :  {'industry': 0.15, 'county': 0.075, 'city': 0.055}
Truth     :  jockey 

Values    :  'Christian', 'Non-Christian', 'Unreported', 'Jewish', 'Athe . . .
Predicted :  {'industry': 0.145, 'type': 0.06, 'area': 0.06}
Truth     :  religion 

Values    :  'AAF-McQuay Canada Inc.', 'AAF-McQuay Canada Inc.', 'Abilit . . .
Predicted :  {'industry': 0.13, 'county': 0.11, 'affiliation': 0.09}
Truth     :  company 

Values    :  '05 - 08', '04 - 08', '06 - 09', '05 - 08', '06 - 08', '06  . . .
Predicted :  {'plays': 0.215, 'duration': 0.14, 'age': 0.115}
Truth     :  grades 

Values    :  

Values    :  'Hardcover', 'Ebook'] . . .
Predicted :  {'industry': 0.085, 'type': 0.075, 'name': 0.06}
Truth     :  format 

Values    :  'SCMS', 'TOC', 'SCAQ', 'CFM', 'GOLD', 'LINC', 'SWOM', 'RINC . . .
Predicted :  {'club': 0.395, 'state': 0.155, 'nationality': 0.105}
Truth     :  club 

Values    :  'Lichtenvoorde', 'Vancouver', 'Toronto', 'Montreal', 'Montr . . .
Predicted :  {'industry': 0.12, 'component': 0.06, 'jockey': 0.055}
Truth     :  city 

Values    :  'Metallica', 'Metallica', 'Metallica', 'Metallica', 'Metall . . .
Predicted :  {'industry': 0.105, 'collection': 0.095, 'name': 0.085}
Truth     :  artist 

Values    :  'OFF MEDOC ST', '155 MAYFLOWER ST', ''] . . .
Predicted :  {'industry': 0.1, 'class': 0.085, 'state': 0.07}
Truth     :  address 

Values    :  'TO/40', 'TO/40p', 'PF/40', 'PF/40p'] . . .
Predicted :  {'grades': 0.105, 'age': 0.085, 'code': 0.08}
Truth     :  category 

Values    :  'N/A', 'N/A', 'ATH', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/ . . .
Predicted

In [12]:
"""
This is where you can test the model on your own custom samples!
Enjoy experimenting with the model :D

Please be aware that the performance of the model may be poor 
when working with a small number of samples, as it relies on 
statistics collected from a large datasets.

"""

your_samples = np.array([
    ["Alice", "Bob", "Donald", "Duda", "Petro", "Jacek"],
    ["12/21/2004"],
    ["Ukraine", "Poland", "China"],
    ["male", "female", "male", "m", "m", "f"],
], dtype=object)

for sample in your_samples:
    preds = model.predict_proba(features(sample).reshape(1,-1))
    top = top_classes(preds)
    print(top)

{'team': 0.075, 'order': 0.075, 'industry': 0.07}
{'plays': 0.14, 'industry': 0.135, 'rank': 0.12}
{'team': 0.085, 'gender': 0.07, 'industry': 0.055}
{'industry': 0.12, 'type': 0.1, 'age': 0.075}
