In [1]:
%matplotlib inline

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from functools import reduce
from pathlib import Path
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
data_pth  = Path('data')
data = pd.read_csv(data_pth / '2018_10_18_trump.csv', header=None)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,0,0,0,1,1,0,1,1,0,0,...,1,0,0,0,1,0,0,0,53248,6
1,0,0,0,0,0,0,0,0,1,1,...,0,1,0,0,0,1,0,0,4613,5
2,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,68780,6
3,0,0,0,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,24555,5
4,0,1,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,0,1,8392,4


In [3]:
cards = [
# Diamonds
'DA','DK','DQ','DJ','D10','D9','D8','D7','D6',
# Hearts
'HA','HK','HQ','HJ','H10','H9','H8','H7','H6',
# Spades
'SA','SK','SQ','SJ','S10','S9','S8','S7','S6',
# Clubs
'CA','CK','CQ','CJ','C10','C9','C8','C7','C6'
]

# Forehand (yes = 1, no = 0)
forehand = ['FH']

user  = ['user']
trump = ['trump']

data.columns = cards + forehand + user + trump
data.head()

Unnamed: 0,DA,DK,DQ,DJ,D10,D9,D8,D7,D6,HA,...,CQ,CJ,C10,C9,C8,C7,C6,FH,user,trump
0,0,0,0,1,1,0,1,1,0,0,...,1,0,0,0,1,0,0,0,53248,6
1,0,0,0,0,0,0,0,0,1,1,...,0,1,0,0,0,1,0,0,4613,5
2,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,68780,6
3,0,0,0,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,24555,5
4,0,1,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,0,1,8392,4


In [4]:
data.drop('user', axis='columns', inplace=True)
data.head()

Unnamed: 0,DA,DK,DQ,DJ,D10,D9,D8,D7,D6,HA,...,CK,CQ,CJ,C10,C9,C8,C7,C6,FH,trump
0,0,0,0,1,1,0,1,1,0,0,...,0,1,0,0,0,1,0,0,0,6
1,0,0,0,0,0,0,0,0,1,1,...,0,0,1,0,0,0,1,0,0,5
2,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,6
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,1,0,0,0,0,5
4,0,1,0,0,0,0,0,0,1,1,...,0,0,1,0,0,0,0,0,1,4


In [5]:
data.trump = data.trump.astype('category')
data[cards + forehand] = data[cards + forehand].astype(bool)
data.dtypes

DA           bool
DK           bool
DQ           bool
DJ           bool
D10          bool
D9           bool
D8           bool
D7           bool
D6           bool
HA           bool
HK           bool
HQ           bool
HJ           bool
H10          bool
H9           bool
H8           bool
H7           bool
H6           bool
SA           bool
SK           bool
SQ           bool
SJ           bool
S10          bool
S9           bool
S8           bool
S7           bool
S6           bool
CA           bool
CK           bool
CQ           bool
CJ           bool
C10          bool
C9           bool
C8           bool
C7           bool
C6           bool
FH           bool
trump    category
dtype: object

In [6]:
# Backward compatibility: Value 10 for PUSH was used in an older version by Swisslos

data.trump.cat.rename_categories({0: 'DIAMONDS', 1: 'HEARTS', 2: 'SPADES', 3:'CLUBS',
                                  4: 'OBE_ABE', 5: 'UNE_UFE', 6: 'PUSH', 10: 'PUSH'}, inplace=True)
data.head()

feature_columns = cards + forehand
X_train, X_test, y_train, y_test = train_test_split(data[feature_columns], data.trump, test_size=0.2,
                                                    stratify=data.trump, random_state=42)

  res = method(*args, **kwargs)


In [13]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Classifier names
names = [

    "Logistic Regression",
]

# Classifiers
classifiers = [
    LogisticRegression()
]

# Hyperparameter grid to search per classifier
parameters = [

    {
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
        'C': [0.5, 0.8, 1.5],
        'max_iter': [100, 200, 500],
        'multi_class': ['auto', 'multinomial']
    }

]
list(zip(names, parameters))


[('Logistic Regression',
  {'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
   'C': [0.5, 0.8, 1.5],
   'max_iter': [100, 200, 500],
   'multi_class': ['auto', 'multinomial']})]

In [14]:
from sklearn.model_selection import GridSearchCV

results = []

for name, classifier, params in zip(names, classifiers, parameters):
    print("Grid search for {}".format(name))
    gs = GridSearchCV(classifier, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    gs.fit(X_train, y_train)
    print("Best accuracy score found: {:.3f}\n".format(gs.best_score_))
    results.append([name, gs.best_score_, gs.best_estimator_])

results

Grid search for Logistic Regression


45 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\rahul\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\rahul\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 1519, in fit
    multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))
  File "C:\Users\rahul\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 483, in _check_multi_class
    raise ValueError(

Best accuracy score found: 0.643



[['Logistic Regression',
  0.6427673174459807,
  LogisticRegression(C=0.5, solver='liblinear')]]

In [26]:
def add_interaction(data, feature_cols, interaction):
    for color in "DHSC":
        new_col = f"{color}_{interaction}"
        data[new_col] = reduce(lambda a, b: a & b, [data[f"{color}{feature}"] for feature in interaction])
        feature_cols.append(new_col)

X_train_interactions = X_train.copy()
X_test_interactions = X_test.copy()
feature_columns_interactions = list(feature_columns)

for dataframe in [X_train_interactions, X_test_interactions]:
    add_interaction(dataframe, feature_columns_interactions, "J9")
    add_interaction(dataframe, feature_columns_interactions, "AKQ")

print(X_train.head())
print(X_train_interactions.head())


           DA     DK     DQ     DJ    D10     D9     D8     D7     D6     HA  \
207217  False  False  False  False   True  False  False  False   True   True   
18232   False  False  False  False  False  False   True   True  False  False   
226960  False   True  False  False  False  False  False  False  False  False   
128112  False   True  False  False  False  False  False  False  False  False   
126859   True  False  False   True  False  False   True  False  False  False   

        ...     C6     FH   D_J9  D_AKQ   H_J9  H_AKQ   S_J9  S_AKQ   C_J9  \
207217  ...   True  False  False  False   True  False  False  False  False   
18232   ...  False  False  False  False  False  False  False  False  False   
226960  ...  False  False  False  False  False  False  False  False  False   
128112  ...  False  False  False  False   True  False  False  False  False   
126859  ...  False  False  False  False  False  False  False  False  False   

        C_AKQ  
207217  False  
18232   False  
22

In [18]:
X_train_interactions, X_test_interactions, y_train, y_test = train_test_split(data[feature_columns], data.trump, test_size=0.2,
                                                    stratify=data.trump, random_state=42)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

names = [
    "Logistic Regression",
    "Stochastic Gradient Descent",
    "Random Forest",
    # "Support Vector Classification",
    "K Neighbors",
    "Gradient Boosting"
]

# Based on GridSearch results
classifier_logisticRegression = LogisticRegression(C=0.5)
classifier_sgd = SGDClassifier(alpha=5e-05)
classifier_randomForest = RandomForestClassifier(n_estimators=120)
# classifier_svc = SVC()
classifier_kNeighbors = KNeighborsClassifier(n_neighbors=10)
classifier_gradientBoosting = GradientBoostingClassifier(n_estimators=200)

linear_classifiers = [classifier_logisticRegression, classifier_sgd]
nonlinear_classifiers = [
    classifier_randomForest,
    # classifier_svc,
    classifier_kNeighbors,
    classifier_gradientBoosting
]
classifiers = linear_classifiers + nonlinear_classifiers

for name, classifier in zip(names, classifiers):
    print(f"Getting score for {name}:")
    classifier.fit(X_train_interactions if classifier in linear_classifiers else X_train, y_train)
    print(classifier.score(X_test_interactions if classifier in linear_classifiers else X_test, y_test))

Getting score for Logistic Regression:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6402278885569374
Getting score for Stochastic Gradient Descent:
0.6226498992565831
Getting score for Random Forest:
0.6400750364760648
Getting score for K Neighbors:
0.5911206836656708
Getting score for Gradient Boosting:
0.6563607309108594


In [40]:
import pickle
with open(Path('data') / "logistic_regression.pkl", "wb") as file:
    pickle.dump(classifier_logisticRegression, file)

In [41]:
with open(Path("data") / "logistic_regression.pkl", "rb") as file:
    model = pickle.load(file)

model.score(X_test, y_test)

0.6402278885569374