In [1]:
"""
Standart Python notebook imports

"""

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from os.path import join
import os
import joblib

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from modules.analyzer import extract_features

In [2]:
"""
Load data

"""

path = os.path.join('data','parquet')

x_train_raw = pd.read_parquet(join(path,'train_values.parquet'))
y_train = pd.read_parquet(join(path,'train_labels.parquet')).to_numpy().reshape(-1)

x_test_raw = pd.read_parquet(join(path,'test_values.parquet'))
y_test = pd.read_parquet(join(path,'test_labels.parquet')).to_numpy().reshape(-1)

In [3]:
"""
Parses a raw string into an array
 
Some strings, such as descriptions, can be quite lengthy. 
To improve performance only the first max_len elements 
of the string are taken.
 
"""

def formatted_to_numpy(value, max_len = 1000):
    value = value[:max_len]
    return np.array(value[2:-2].split("""', '"""))

In [16]:
"""
Runs a feature extractor on each value in a raw DataFrame
The values are taken from the 'values' column. 

Note that the process of running the feature extractor on
all data (400k+ columns) can take a significant amount of time. 

To facilitate testing and experimentation with the model,
a smaller sample size can be set to reduce the features extraction time.

"""

def features_from_raw(raw, sample_size = 5000):
    values = raw['values'].to_numpy()
    sample = values[:sample_size]
    return np.array([extract_features(formatted_to_numpy(value))[0] for value in tqdm(sample)])

features_keys = extract_features(formatted_to_numpy(x_train_raw['values'][0]))[1]

In [5]:
"""
Tries to load a cached data file

Args: 
    name (str): Name of the file to load.

Returns:
    A tuple (flag, data), where flag is a boolean indicating whether 
    the file was loaded or not, and data is the loaded data or None.
    
"""

def load_cached(name):
    cachedpath = join('cached', name + '.joblib')

    data = None
    loaded = True

    if os.path.isfile(cachedpath):
        user_choice = input(f"File {cachedpath} exists, do you want to load it? (y/n)")
        if user_choice == 'y':
            data = joblib.load(cachedpath)
        else:
            loaded = False
    else:
        loaded = False

    return data, loaded

In [6]:
"""
Saves file into the cached directory.

"""

def save_cached(name, data):
    cachedpath = join('cached', name + '.joblib')
    joblib.dump(data, cachedpath)

In [18]:
"""
Parse data and extract features

Data can be saved or loaded from the "./cached/x_train_features.joblib"
Please note that running the extractor will override existing save.

"""

x_train_features, loaded = load_cached('x_train_features')

if not loaded:
    x_train_features = features_from_raw(x_train_raw, sample_size=5000)
    save_cached('x_train_features', x_train_features)
    
x_test_features, loaded = load_cached('x_test_features')

if not loaded:
    x_test_features = features_from_raw(x_test_raw, sample_size=1000)
    save_cached('x_test_features', x_test_features)

File cached/x_train_features.joblib exists, do you want to load it? (y/n) n


  0%|          | 0/5000 [00:00<?, ?it/s]

File cached/x_test_features.joblib exists, do you want to load it? (y/n) n


  0%|          | 0/1000 [00:00<?, ?it/s]

In [19]:
"""
Train model

Model can be saved or loaded from the  "./cached/RandomForest.joblib"
Please note that running the extractor will eventually override existing save.

TODO research optimal number of trees in a forest
TODO a lot of graphs wtf no graphs

"""

random_forest, loaded = load_cached('random_forest')
if not loaded:
    random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
    random_forest.fit(x_train_features, y_train[:x_train_features.shape[0]])
    save_cached('random_forest', random_forest)

model = random_forest

File cached/random_forest.joblib exists, do you want to load it? (y/n) n


In [52]:
""" 
Outputs most valuable features paired with importances

sklearn.inspection.permutation_importance is not used due to the high computation time,
although might give more accurate result

"""
#from sklearn.inspection import permutation_importance
#result = permutation_importance(random_forest, x_train_features, y_train[:x_train_features.shape[0]], n_repeats=10,verbose=1)
#importances = result.importances_mean

importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
for p1, p2, in np.vstack((features_keys, importances)).T[indices][:10]:
    print(f"{p1 + (30-len(p1))*' '}: {p2[:6]}")
  

length_mean                   : 0.0122
length_max                    : 0.0121
uppercase_mean                : 0.0086
length_min                    : 0.0082
 _occurrence_mean             : 0.0081
alphabetic_mean               : 0.0078
 _occurrence_max              : 0.0075
length_std                    : 0.0075
 _occurrence_min              : 0.0067
numeric_mean                  : 0.0064


In [53]:
"""
Check the train accuracy for the model.
Accuracy should be close to 1 as we are using random forest here

"""

accuracy = accuracy_score(model.predict(x_train_features), y_train[:x_train_features.shape[0]])
print(f"Train accuracy: {accuracy}")

Train accuracy: 0.9936


In [56]:
"""
Check the test accuracy.

"""

accuracy = accuracy_score(model.predict(x_test_features), y_test[:x_test_features.shape[0]])
print(f"Test accuracy: {accuracy}")

Test accuracy: 0.635


In [58]:
"""
This code performs manual checks on the prediction results.

This block outputs the model's predictions on the first 100 test samples
and writes data in a fancy way.

It can be helpful in identifying poorly predicted types and gaining insights 
into the model's behavior.

"""

def top_classes(preds):
    top_indices = np.argsort(preds[0])[::-1][:3]
    
    top_types = model.classes_[top_indices]
    top_probs = preds[0][top_indices]
    
    return dict(zip(top_types, top_probs))

for i in range(10):
    preds = model.predict_proba(x_test_features[i].reshape(1, -1))
    top = top_classes(preds)
    value_sample = x_test_raw['values'].to_numpy()[i][1:60]
    
    print("Values    : ", value_sample, ". . .")
    print("Predicted : ", top)
    print("Truth     : ", y_test[i], '\n')


Values    :  'Central Missouri', 'unattached', 'unattached', 'Kansas Sta . . .
Predicted :  {'region': 0.17, 'name': 0.06, 'team Name': 0.06}
Truth     :  affiliation 

Values    :  95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, 84, 90, 76, 93 . . .
Predicted :  {'age': 0.39, 'plays': 0.17, 'rank': 0.08}
Truth     :  weight 

Values    :  'Katie Crews', 'Christian Hiraldo', 'Alex Estrada', 'Fredy  . . .
Predicted :  {'jockey': 0.61, 'name': 0.1, 'owner': 0.06}
Truth     :  jockey 

Values    :  'Christian', 'Non-Christian', 'Unreported', 'Jewish', 'Athe . . .
Predicted :  {'city': 0.09, 'region': 0.08, 'country': 0.08}
Truth     :  religion 

Values    :  'AAF-McQuay Canada Inc.', 'AAF-McQuay Canada Inc.', 'Abilit . . .
Predicted :  {'company': 0.17, 'industry': 0.08, 'team': 0.07}
Truth     :  company 

Values    :  '05 - 08', '04 - 08', '06 - 09', '05 - 08', '06 - 08', '06  . . .
Predicted :  {'grades': 0.99, 'age': 0.01, 'depth': 0.0}
Truth     :  grades 

Values    :  'Cerulean Cave 

In [61]:
"""
This is where you can test the model on your own custom samples!
Enjoy experimenting with the model :D

Please be aware that the performance of the model may be poor 
when working with a small number of samples, as it relies on 
statistics collected from a large datasets.

"""

your_samples = np.array([
    ["Alice", "Bob", "Donald", "Duda", "Petro", "Jacek"],
    ["12/21/2004"],
    ["Ukraine", "Poland", "China"],
    ["male", "female", "male", "m", "m", "f"],
], dtype=object)

for sample in your_samples:
    preds = model.predict_proba(extract_features(sample)[0].reshape(1,-1))
    top = top_classes(preds)
    print(top)

{'city': 0.13, 'region': 0.11, 'team': 0.08}
{'year': 0.32, 'day': 0.1, 'code': 0.1}
{'city': 0.13, 'status': 0.07, 'state': 0.07}
{'sex': 0.22, 'gender': 0.19, 'type': 0.09}
