In [1]:
# import libraries
import time
import pandas as pd
from sklearn import set_config
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import joblib
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)


ModuleNotFoundError: No module named 'scipy'

In [37]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
set_config(display="diagram")

In [38]:
df = pd.read_csv("data/whot_dataset.csv", index_col=0)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22407805 entries, 0 to 22407804
Data columns (total 8 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   Card 1          object
 1   Card 2          object
 2   Card 3          object
 3   Card 4          object
 4   Call Card       object
 5   Action          object
 6   Requested Suit  object
 7   Special State   object
dtypes: object(8)
memory usage: 1.5+ GB


In [42]:
df["Special State"].value_counts()

Special State
NONE              20851662
WHOT                649674
PICK_TWO            324603
PICK_THREE          323013
GENERAL_MARKET      258853
Name: count, dtype: int64

In [43]:
X = df.drop(columns="Action")
y = df["Action"]

In [44]:
# encode target colum
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [45]:
 # create pipeline
card_columns = [col for col in df.columns if "Card" in col]
suit_col = ["Requested Suit"]
special_state_col = ["Special State"]

preprocessing = ColumnTransformer([
    ("card_cols", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), card_columns),
    ("suit_col", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), suit_col),
    ("special_col", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), special_state_col)
], remainder="passthrough")

In [None]:
# train model
start_time = time.time()
model_pipeline = make_pipeline(preprocessing, RandomForestClassifier(n_jobs=-1, random_state=42))
model_pipeline.fit(X, y_encoded)

training_time = time.time() - start_time
print(f"Model training completed in {training_time:.2f} seconds")

In [35]:
#save model
joblib.dump(model_pipeline, 'model/whot_model.pkl', compress=3)
joblib.dump(label_encoder, 'model/action_encoder.pkl', compress=3)

['model/whot_model.pkl']

['model/action_encoder.pkl']

In [None]:
sample = pd.DataFrame([{
    'Card 1': 'cross 2',
    'Card 2': 'star 1',
    'Card 3': 'triangle 7',
    'Card 4': 'cross 10',
    'Call Card': 'square 2',
    'Requested Suit': 'NONE',
    'Special State': 'NONE'
}])
pred = model_pipeline.predict(sample)
action = label_encoder.inverse_transform(pred)[0]
print(action)


In [29]:
# test model on sample
y_train_pred = model_pipeline.predict(X[:200000])

print(classification_report(
    label_encoder.inverse_transform(y_encoded[:200000]),
    label_encoder.inverse_transform(y_train_pred)
))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

    circle 1       1.00      1.00      1.00     54599
   circle 10       1.00      1.00      1.00       285
   circle 11       1.00      1.00      1.00       285
   circle 12       1.00      1.00      1.00        95
   circle 13       1.00      1.00      1.00       168
    circle 2       0.00      0.00      0.00         0
    circle 4       1.00      1.00      1.00      2352
    circle 5       1.00      1.00      1.00      1104
    circle 7       1.00      1.00      1.00       380
    circle 8       1.00      1.00      1.00       190
     cross 1       1.00      1.00      1.00       374
    cross 10       1.00      1.00      1.00       378
    cross 11       1.00      1.00      1.00       372
    cross 13       1.00      1.00      1.00       367
    cross 14       0.00      0.00      0.00         0
     cross 5       1.00      1.00      1.00       426
     cross 7       1.00      1.00      1.00       428
   go market       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [25]:
print(label_encoder.classes_)

['circle 1' 'circle 10' 'circle 11' 'circle 12' 'circle 13' 'circle 14'
 'circle 2' 'circle 3' 'circle 4' 'circle 5' 'circle 7' 'circle 8'
 'cross 1' 'cross 10' 'cross 11' 'cross 13' 'cross 14' 'cross 2' 'cross 3'
 'cross 5' 'cross 7' 'go market' 'square 1' 'square 10' 'square 11'
 'square 13' 'square 14' 'square 2' 'square 3' 'square 5' 'square 7'
 'star 1' 'star 2' 'star 3' 'star 4' 'star 5' 'star 7' 'star 8'
 'triangle 1' 'triangle 10' 'triangle 11' 'triangle 12' 'triangle 13'
 'triangle 14' 'triangle 2' 'triangle 3' 'triangle 4' 'triangle 5'
 'triangle 7' 'triangle 8' 'whot 20']
