# Multi-class Classification - Predict the Poker Hand

Dataset:
https://archive.ics.uci.edu/ml/datasets/Poker+Hand

## Dataset observations

https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand.names

- 10 classes
- 1 million test samples
- missing values: None
- classes are not balanced (some poker hands are rare)
- the test dataset is provided separately from the training dataset

## Workflow

Data Gathering
1. read_csv for both training and test set

Data Transformation
2. transform dataframe
3. PCA to project the features down to 2-D for plotting (to see how well the classes separate)
4. shuffle training set (a train/test split is not necessary as there is a separate test set)
5. (scaling is optional because all columns have similar value ranges)

Training
6. Train a Logistic Regression model
7. Train a Logistic Regression model with SGD

Validation
8. metrics
9. learning curve
10. predictions

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import learning_curve

## Data Gathering

1. read_csv for both training and test set

In [6]:
# Load the training set. Column names are supplied explicitly via names=:
# one suit (S) / rank (C) pair per card, S1, C1, ... S5, C5, then the class.
df = pd.read_csv(
    'C:\\courses\\data\\poker-hand\\poker-hand-training-true.data',
    names=[prefix + str(card) for card in range(1, 6) for prefix in ('S', 'C')] + ['CLASS'],
)

df.head()

FileNotFoundError: File b'D:/tmp/poker-hand/poker-hand-training-true.data' does not exist

In [None]:
# Show summary statistics for every column of the training DataFrame.
df.describe()

In [None]:
# Load the separate test set with the same column layout as the training
# set: one suit (S) / rank (C) pair per card, then the class.
df_test = pd.read_csv(
    'D:/tmp/poker-hand/poker-hand-testing.data',
    names=[prefix + str(card) for card in range(1, 6) for prefix in ('S', 'C')] + ['CLASS'],
)

df_test.head()

In [None]:
# Show summary statistics for every column of the test DataFrame.
df_test.describe()

## Data Transformation (Estimated time: 30 min)
2. transform dataframe
3. PCA to project the features down to 2-D for plotting (to see how well the classes separate)
4. shuffle training set (a train/test split is not necessary as there is a separate test set)
5. (scaling is optional because all columns have similar value ranges)

```
# How to shuffle a pandas DataFrame
df_shuffled = df.sample(frac=1)
```

In [None]:
# Meaning of the CLASS codes:
# 0: Nothing in hand; not a recognized poker hand
# 1: One pair; one pair of equal ranks within five cards
# 2: Two pairs; two pairs of equal ranks within five cards
# 3: Three of a kind; three equal ranks within five cards
# 4: Straight; five cards, sequentially ranked with no gaps
# 5: Flush; five cards with the same suit
# 6: Full house; pair + different rank three of a kind
# 7: Four of a kind; four equal ranks within five cards
# 8: Straight flush; straight + flush
# 9: Royal flush; {Ace, King, Queen, Jack, Ten} + flush

# Rows are [class code, display name]; row i describes class i, so
# labels[value, 1] is the legend text for class `value`.
labels = np.array([[0, 'nothing'], [1, 'one pair'],
          [2, 'two pair'], [3, '3 of a kind'],
          [4, 'straight'], [5, 'flush'],
          [6, 'full house'], [7, '4 of a kind'],
          [8, 'straight flush'], [9, 'royal flush']])


X_train = df.loc[:, 'S1':'C5']
y_train = df.loc[:, 'CLASS']

# Project the ten card columns onto two principal components for plotting.
pca = PCA(n_components=2)
X_train_2d = pca.fit_transform(X_train)

fig, ax = plt.subplots(figsize=(15, 10))

# One scatter call per class so each class gets its own colour and legend
# entry. To change the draw order or plot only some classes, iterate over an
# explicit list of class codes instead (e.g. [9, 8, 4, 3]).
# No cmap is passed: without c= matplotlib ignores it and emits a warning.
for value in y_train.unique():
    mask = (y_train == value).values  # build the row selector once per class
    ax.scatter(X_train_2d[mask, 0],
               X_train_2d[mask, 1],
               label=labels[value, 1])
ax.legend()
plt.show()

In [None]:
# Plot a random 25% subset of the samples to reduce over-plotting.
density = .25
df_subset = df.sample(frac=density, random_state=42)

X_train_subset = df_subset.loc[:, 'S1':'C5']
y_train_subset = df_subset.loc[:, 'CLASS']

# The PCA is fitted on the subset only, not on the full training set.
pca = PCA(n_components=2)
X_train_subset_2d = pca.fit_transform(X_train_subset)

fig, ax = plt.subplots(figsize=(15, 10))
# Iterate over the classes actually present in the subset, so that a rare
# class missing from the sample does not produce an empty legend entry.
# No cmap is passed: without c= matplotlib ignores it and emits a warning.
for value in y_train_subset.unique():
    mask = (y_train_subset == value).values  # build the row selector once per class
    ax.scatter(X_train_subset_2d[mask, 0],
               X_train_subset_2d[mask, 1],
               label=labels[value, 1])
ax.legend()
plt.show()

In [None]:
# 1-D PCA on a random 25% subset: a single principal component on the
# x-axis, plotted against the class code on the y-axis.
density = .25
df_subset = df.sample(frac=density, random_state=42)

# The first ten columns are S1..C5; CLASS is the target.
X_train_subset = df_subset.iloc[:, :10]
y_train_subset = df_subset['CLASS']

pca = PCA(n_components=1)
X_train_subset_1d = pca.fit_transform(X_train_subset)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('1D PCA')
ax.scatter(X_train_subset_1d, y_train_subset)
plt.show()

In [None]:
# Shuffle the training rows: sample(frac=1) returns every row exactly once,
# in random order. random_state pins the permutation so reruns give the same
# result, consistent with the random_state=42 used elsewhere in this notebook.
df_shuffled = df.sample(frac=1, random_state=42)

X_train = df_shuffled.loc[:, 'S1':'C5']
y_train = df_shuffled.loc[:, 'CLASS']

In [None]:
# Split the test frame into the ten card columns (S1..C5) and the target.
X_test = df_test.iloc[:, :10]
y_test = df_test['CLASS']

## Training
6. logistic regression
7. SGD logistic regression

In [None]:
# Logistic regression on the raw card columns. fit() returns the estimator
# itself, so construction and training are chained.
logistic = LogisticRegression(random_state=42).fit(X_train, y_train)

pred_logistic = logistic.predict(X_test)

In [None]:
# Same model with class_weight='balanced', to counter the heavy class
# imbalance (some poker hands are rare).
logistic_bal = LogisticRegression(random_state=42, class_weight='balanced').fit(X_train, y_train)

pred_logistic_bal = logistic_bal.predict(X_test)

In [None]:
# Logistic regression trained with stochastic gradient descent.
# SGDClassifier defaults to loss='hinge', which fits a linear SVM, so the
# log loss must be requested explicitly to get a logistic-regression model.
# NOTE: scikit-learn releases before 1.1 spell this loss 'log'.
sgd = SGDClassifier(loss='log_loss', random_state=42, max_iter=1000, tol=1e-3)
sgd.fit(X_train, y_train)

pred_sgd = sgd.predict(X_test)

In [None]:
# SGD-trained logistic regression with class_weight='balanced'.
# SGDClassifier defaults to loss='hinge', which fits a linear SVM, so the
# log loss must be requested explicitly to get a logistic-regression model.
# NOTE: scikit-learn releases before 1.1 spell this loss 'log'.
sgd_bal = SGDClassifier(loss='log_loss', random_state=42, max_iter=1000, tol=1e-3,
                        class_weight='balanced')
sgd_bal.fit(X_train, y_train)

pred_sgd_bal = sgd_bal.predict(X_test)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline to compare the real models against. The default strategy is kept;
# random_state makes the baseline reproducible when that strategy is a
# randomised one, and is harmless otherwise.
baseline = DummyClassifier(random_state=42)
baseline.fit(X_train, y_train)

pred_baseline = baseline.predict(X_test)

## Validation
8. metrics
9. learning curve
10. prediction

In [None]:
# Print a classification report for every trained model, baseline last.
# Each entry pairs the printed heading with that model's test predictions.
reports = [
    ('Logistic Regression:', pred_logistic),
    ('Logistic Regression (balanced):', pred_logistic_bal),
    ('SGD:', pred_sgd),
    ('SGD (balanced):', pred_sgd_bal),
    ('Baseline:', pred_baseline),
]

for heading, predictions in reports:
    print(heading)
    print(classification_report(y_test, predictions))

In [None]:
import seaborn as sns

# Pin the class order so every matrix has one row/column per class code
# (0..9) and lines up with the tick labels used below.
class_ids = list(range(len(labels)))

cm_logistic = confusion_matrix(y_test, pred_logistic, labels=class_ids)
cm_sgd = confusion_matrix(y_test, pred_sgd, labels=class_ids)
cm_baseline = confusion_matrix(y_test, pred_baseline, labels=class_ids)

print(cm_logistic)
print(cm_sgd)
print(cm_baseline)

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 30))
ax = axes.flatten()

# The first column of `labels` holds the class codes, used as tick text.
tick_labels = labels[:, 0]

panels = [
    (cm_logistic, 'Confusion Matrix (Logistic Regression)'),
    (cm_sgd, 'Confusion Matrix (Logistic Regression using SGD)'),
    (cm_baseline, 'Confusion Matrix (Baseline)'),
]

for axis, (matrix, title) in zip(ax, panels):
    # annot=True writes the count into each cell; fmt='d' prints it as a plain
    # integer (the default '.2g' shows large counts in scientific notation).
    sns.heatmap(matrix, annot=True, fmt='d', ax=axis,
                xticklabels=tick_labels, yticklabels=tick_labels)
    axis.set(xlabel='Predicted labels', ylabel='True labels', title=title)

plt.show()

## Alternative Feature Extraction (from Kaggle)

https://www.kaggle.com/c/poker-rule-induction/discussion/11177

```
Extract these features:

- flush (1 or 0 boolean)

- kind1 1 through 5 , number of most frequent card (e.g. 4 means 4 of a kind, 3 could be 3 of a kind or full house, 2 is 2 of a kind or 2 pair, etc.,  1 means, straight or nothing)

- kind2 number of second-most frequent card, so if kind1=3 and kind2=2, full house, kind1=2, kind2=1, then one pair

- high card (1 considered high if it occurs --yeah, concession to poker and other card games)

- low card (needed to distinguish royal flush from ace 2 3 4 5)

- straight (1 or 0 boolean)
```

Reference code:
https://github.com/tdvance/kaggle_submissions/blob/master/poker/dataPrep.py

### Getting the code to run 

In [None]:
def histogram(l):
    """Count how often each distinct item occurs in ``l``.

    Returns a plain dict mapping item -> number of occurrences, with keys in
    order of first appearance. Items must be hashable.
    """
    counts = {}
    for item in l:
        counts[item] = counts.get(item, 0) + 1
    return counts

def hOfH(l, size=None):
    """Return the "histogram of the histogram" of ``l``, largest first.

    The items of ``l`` are counted, then the resulting counts are themselves
    counted. The second-level counts are returned as a list sorted in
    descending order. For example, five identical items give ``[1]``: one
    distinct count value (5), seen once.

    If ``size`` is given, the list is padded with zeros or truncated from the
    end until it has exactly ``size`` entries.
    """
    occurrences = histogram(l)
    count_of_counts = histogram(occurrences.values())
    result = sorted(count_of_counts.values(), reverse=True)

    if size is None:
        return result

    # Pad with zeros up to the requested length ...
    while len(result) < size:
        result.append(0)
    # ... or drop the smallest entries from the end if it is too long.
    while len(result) > size:
        result.pop()
    return result

def handToFeatures(hand):
    """Turn one five-card hand into six hand-crafted features.

    Parameters
    ----------
    hand : sequence of at least 10 numbers
        Card values laid out as ``[S1, C1, S2, C2, ..., S5, C5]`` (suit, rank
        pairs). Any iterable works (list, NumPy array, pandas row); only the
        first ten values are used. Rank 1 is the ace.

    Returns
    -------
    list
        ``[flush, kind1, kind2, high, low, straight]`` where

        - ``flush`` is 1 if all five suits are equal, else 0
        - ``kind1`` is the count of the most frequent rank
        - ``kind2`` is the count of the second most frequent rank
          (0 if the hand holds a single distinct rank)
        - ``high`` is the highest rank, with the ace (1) counted as high
        - ``low`` is the lowest rank; when an ace is present it is the
          second-lowest card of the sorted hand
        - ``straight`` is 1 if the five ranks are distinct and consecutive,
          with the ace allowed at either end (A-2-3-4-5 or 10-J-Q-K-A)

    Raises
    ------
    ValueError
        If ``hand`` has fewer than ten values.
    """
    # Materialise first so positional slicing works for any input type,
    # including pandas rows whose index is made of column labels.
    cards = list(hand)
    if len(cards) < 10:
        raise ValueError('hand must contain at least 10 values (5 suit/rank pairs)')

    suits = cards[0:10:2]
    ranks = sorted(cards[1:10:2])

    flush = int(len(set(suits)) == 1)

    # Occurrence count of each distinct rank, largest first.
    counts = sorted((ranks.count(r) for r in set(ranks)), reverse=True)
    kind1 = counts[0]
    kind2 = counts[1] if len(counts) > 1 else 0

    if 1 in ranks:
        # The ace counts as the high card; the low card is then taken from
        # the next position of the sorted hand.
        high = 1
        low = ranks[1] if ranks[0] == 1 else ranks[0]
    else:
        high = ranks[-1]
        low = ranks[0]

    # A straight needs five distinct ranks: a spread of 4 alone is not enough,
    # because 2-2-3-4-6 also has spread 4. The ace-high run is checked
    # separately since the ace is stored as 1.
    all_distinct = len(set(ranks)) == 5
    consecutive = ranks[-1] - ranks[0] == 4
    ace_high_run = ranks == [1, 10, 11, 12, 13]
    straight = int(all_distinct and (consecutive or ace_high_run))

    return [flush, kind1, kind2, high, low, straight]

In [None]:
# Let's test the code first, before making any changes:
# display the first training row (ten card columns plus CLASS).
df.iloc[0]

In [None]:
# Take the ten card columns of the first training row as a NumPy array,
# then print the raw hand followed by its engineered features.
sample_hand = df.iloc[0, :10].values
print(sample_hand)

print(handToFeatures(sample_hand))

### Transformation

In [None]:
# Keep only the ten card columns S1..C5 (everything except CLASS).
df_original_features = df.loc[:, 'S1':'C5']

df_original_features.head()

In [None]:
# Build the new features by running handToFeatures on every row.
# axis='columns' is the named form of axis=1: for DataFrame.apply() it means
# the function receives one row at a time.

df_new_features = df_original_features.apply(handToFeatures, axis='columns')

df_new_features.head()

In [None]:
# Problem: the result above is a Series whose elements are lists,
# but we need a DataFrame with 6 separate columns.

print(type(df_new_features))

print(df_new_features.iloc[0]) # returns a list - we want 6 columns

In [None]:
# Expand the Series of 6-item lists into a 6-column DataFrame, naming the
# columns and keeping the original row index.
df_new_features = pd.DataFrame(
    df_new_features.tolist(),
    index=df_new_features.index,
    columns=['flush', 'most_frequent', '2nd_most_frequent', 'high_card', 'low_card', 'straight'],
)

df_new_features.head()

In [None]:
# Append the target as a last column so that features and class stay
# together when the rows are shuffled later.

df_new_features = df_new_features.assign(CLASS=df['CLASS'])
df_new_features.head()

In [None]:
# Save the engineered training features (plus CLASS) to CSV, without the row index.
df_new_features.to_csv('C:\\courses\\data\\poker-hand-kaggleSolution\\kaggle_train.csv', index=False)

In [None]:
# Read the saved training features back in and preview them.
# NOTE(review): df_2 is not used by any later cell visible in this notebook.
df_2 = pd.read_csv('C:\\courses\\data\\poker-hand-kaggleSolution\\kaggle_train.csv')
df_2.head()

In [None]:
# Repeat the same procedure for the test set:
# apply handToFeatures per row, expand the lists into six columns, attach CLASS.

# apply() can take a while to run through 1 million rows
# (%time is an IPython magic that reports how long the statement took)
%time df_new_features_test = df_test.iloc[:, :10].apply(handToFeatures, axis=1)

# Expand the Series of lists into named columns, keeping the original index.
df_new_features_test = pd.DataFrame(df_new_features_test.values.tolist(),
                                    columns=['flush', 'most_frequent', '2nd_most_frequent', 'high_card', 'low_card', 'straight'],
                                    index=df_new_features_test.index)

# Attach the target column from the test set.
df_new_features_test['CLASS'] = df_test['CLASS']
df_new_features_test.head()

In [None]:
# Save the engineered test features (plus CLASS) to CSV, without the row index.
df_new_features_test.to_csv('C:\\courses\\data\\poker-hand-kaggleSolution\\kaggle_test.csv', index=False)

In [None]:
# Read the saved test features back in and preview them.
# NOTE(review): df_2_test is not used by any later cell visible in this notebook.
df_2_test = pd.read_csv('C:\\courses\\data\\poker-hand-kaggleSolution\\kaggle_test.csv')
df_2_test.head()

In [None]:
# Rows are [class code, display name]; row i describes class i, so
# labels[value, 1] is the legend text for class `value`.
labels = np.array([[0, 'nothing'], [1, 'one pair'],
          [2, 'two pair'], [3, '3 of a kind'],
          [4, 'straight'], [5, 'flush'],
          [6, 'full house'], [7, '4 of a kind'],
          [8, 'straight flush'], [9, 'royal flush']])


X_train = df_new_features.loc[:, 'flush':'straight']
y_train = df_new_features.loc[:, 'CLASS']

# Project the six engineered features onto two principal components.
pca = PCA(n_components=2)
X_train_2d = pca.fit_transform(X_train)

fig, ax = plt.subplots(figsize=(15, 10))

# One scatter call per class so each class gets its own colour and legend
# entry. No cmap is passed: without c= matplotlib ignores it and warns.
for value in y_train.unique():
    mask = (y_train == value).values  # build the row selector once per class
    ax.scatter(X_train_2d[mask, 0],
               X_train_2d[mask, 1],
               label=labels[value, 1])
ax.set_title('2D PCA (new features)')
ax.legend()
plt.show()

In [None]:
# 1-D PCA of the engineered features: a single principal component on the
# x-axis, plotted against the class code on the y-axis.
pca = PCA(n_components=1)
X_train_1d = pca.fit_transform(X_train)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('1D PCA (new features)')
ax.scatter(X_train_1d, y_train)
plt.show()

In [None]:
# Shuffle the engineered training set and split both sets into X / y.
# random_state pins the permutation so reruns give the same result,
# consistent with the random_state=42 used elsewhere in this notebook.

df_shuffled = df_new_features.sample(frac=1, random_state=42)
X_train = df_shuffled.loc[:, 'flush':'straight']
y_train = df_shuffled.loc[:, 'CLASS']

X_test = df_new_features_test.loc[:, 'flush':'straight']
y_test = df_new_features_test.loc[:, 'CLASS']

# Print the first rows of each piece as a quick check.
print(X_train.head())
print(y_train.head())
print(X_test.head())
print(y_test.head())

In [None]:
# Logistic regression on the engineered features. fit() returns the
# estimator itself, so construction and training are chained.
logistic = LogisticRegression(random_state=42).fit(X_train, y_train)
pred_logistic = logistic.predict(X_test)

# Print the classification report first, then the confusion matrix.
for summary in (classification_report, confusion_matrix):
    print(summary(y_test, pred_logistic))