In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Do not truncate columns when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Magic command that renders matplotlib figures directly inside the notebook
%matplotlib inline

# DATA TRANSFORMATION

Read the file

In [3]:
# Load the raw poker hands (about 1,000,000 rows); the file has no header row.
data = pd.read_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/poker-hand-testing.txt',
    header=None,
)


Name columns

In [4]:
# Label the ten raw feature columns A1..A10, followed by the target column.
data.columns = [*('A{}'.format(i) for i in range(1, 11)), 'Combination']
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Combination
0,1,1,1,13,2,4,2,3,1,12,0
1,3,12,3,2,3,11,4,5,2,5,1
2,1,9,4,6,1,4,3,2,3,9,1
3,1,4,3,13,2,13,2,1,3,6,1
4,3,10,2,7,1,2,2,11,4,9,0


Add new columns

In [5]:
# The rank of card k is stored in the even-numbered raw column A(2k).
for position in range(1, 6):
    data['rank_{}'.format(position)] = data['A{}'.format(2 * position)]

In [6]:
# The suit of card k is stored in the odd-numbered raw column A(2k - 1).
for position in range(1, 6):
    data['suit_{}'.format(position)] = data['A{}'.format(2 * position - 1)]

Turn the new columns into a binary format: each of the resulting numbers has exactly one "1" in its binary representation. The numbers are displayed in decimal notation (this is done for both rank and suit).

In [7]:
# Encode every suit value s as the single-bit number 2**(s - 1).
# The value is derived from the raw odd-numbered A-column (A1, A3, ...) instead
# of from suit_k itself, so re-running this cell gives the same result rather
# than exponentiating an already-encoded column a second time. This also
# matches how the rank columns are derived from the raw A-columns.
for position in range(1, 6):
    raw_suit = data['A{}'.format(2 * position - 1)]
    data['suit_{}'.format(position)] = 2 ** (raw_suit - 1)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Combination,rank_1,rank_2,rank_3,rank_4,rank_5,suit_1,suit_2,suit_3,suit_4,suit_5
0,1,1,1,13,2,4,2,3,1,12,0,1,13,4,3,12,1,1,2,2,1
1,3,12,3,2,3,11,4,5,2,5,1,12,2,11,5,5,4,4,4,8,2
2,1,9,4,6,1,4,3,2,3,9,1,9,6,4,2,9,1,8,1,4,4
3,1,4,3,13,2,13,2,1,3,6,1,4,13,13,1,6,1,4,2,2,4
4,3,10,2,7,1,2,2,11,4,9,0,10,7,2,11,9,4,2,1,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,3,1,1,12,2,9,4,9,2,6,1,1,12,9,9,6,4,1,2,8,2
999996,3,3,4,5,2,7,1,4,4,3,1,3,5,7,4,3,4,8,2,1,8
999997,1,11,4,7,3,9,1,13,2,7,1,11,7,9,13,7,1,8,4,1,2
999998,3,11,1,8,1,1,3,13,2,8,1,11,8,1,13,8,4,1,1,4,2


In [8]:
# Encode every rank value r as the single-bit number 2**r, multiplied by 16
# (i.e. shifted four bits to the left of the suit bits).
for position in range(1, 6):
    raw_rank = data['A{}'.format(2 * position)]
    data['rank_{}'.format(position)] = 16 * (2 ** raw_rank)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Combination,rank_1,rank_2,rank_3,rank_4,rank_5,suit_1,suit_2,suit_3,suit_4,suit_5
0,1,1,1,13,2,4,2,3,1,12,0,32,131072,256,128,65536,1,1,2,2,1
1,3,12,3,2,3,11,4,5,2,5,1,65536,64,32768,512,512,4,4,4,8,2
2,1,9,4,6,1,4,3,2,3,9,1,8192,1024,256,64,8192,1,8,1,4,4
3,1,4,3,13,2,13,2,1,3,6,1,256,131072,131072,32,1024,1,4,2,2,4
4,3,10,2,7,1,2,2,11,4,9,0,16384,2048,64,32768,8192,4,2,1,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,3,1,1,12,2,9,4,9,2,6,1,32,65536,8192,8192,1024,4,1,2,8,2
999996,3,3,4,5,2,7,1,4,4,3,1,128,512,2048,256,128,4,8,2,1,8
999997,1,11,4,7,3,9,1,13,2,7,1,32768,2048,8192,131072,2048,1,8,4,1,2
999998,3,11,1,8,1,1,3,13,2,8,1,32768,4096,32,131072,4096,4,1,1,4,2


Sum up the suit and rank of each card to obtain one feature instead of two: each card becomes a number with exactly two "1" bits in its binary representation.

In [9]:
# Merge the encoded suit and rank of each card into a single card code.
for position in range(1, 6):
    suit_bits = data['suit_{}'.format(position)]
    rank_bits = data['rank_{}'.format(position)]
    data['card_{}'.format(position)] = suit_bits + rank_bits

data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Combination,rank_1,rank_2,rank_3,rank_4,rank_5,suit_1,suit_2,suit_3,suit_4,suit_5,card_1,card_2,card_3,card_4,card_5
0,1,1,1,13,2,4,2,3,1,12,0,32,131072,256,128,65536,1,1,2,2,1,33,131073,258,130,65537
1,3,12,3,2,3,11,4,5,2,5,1,65536,64,32768,512,512,4,4,4,8,2,65540,68,32772,520,514
2,1,9,4,6,1,4,3,2,3,9,1,8192,1024,256,64,8192,1,8,1,4,4,8193,1032,257,68,8196
3,1,4,3,13,2,13,2,1,3,6,1,256,131072,131072,32,1024,1,4,2,2,4,257,131076,131074,34,1028
4,3,10,2,7,1,2,2,11,4,9,0,16384,2048,64,32768,8192,4,2,1,2,8,16388,2050,65,32770,8200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,3,1,1,12,2,9,4,9,2,6,1,32,65536,8192,8192,1024,4,1,2,8,2,36,65537,8194,8200,1026
999996,3,3,4,5,2,7,1,4,4,3,1,128,512,2048,256,128,4,8,2,1,8,132,520,2050,257,136
999997,1,11,4,7,3,9,1,13,2,7,1,32768,2048,8192,131072,2048,1,8,4,1,2,32769,2056,8196,131073,2050
999998,3,11,1,8,1,1,3,13,2,8,1,32768,4096,32,131072,4096,4,1,1,4,2,32772,4097,33,131076,4098


Write the result in a new txt file:

In [10]:
# Write one line per hand: Combination,card_1,...,card_5 (no header, no index).
# DataFrame.to_csv writes the whole frame in one vectorised call; the earlier
# row-by-row iterrows()/print loop over ~1,000,000 rows was needlessly slow.
# The selected columns hold integers, so the text written matches what the
# previous '%0.0f' formatting produced.
encoded_columns = ['Combination'] + ['card_{}'.format(i) for i in range(1, 6)]
data[encoded_columns].to_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/result.txt',
    header=False,
    index=False,
)

# DATA SORTING

Read the file:

In [11]:
# Reload the encoded hands from result.txt; the file has no header row.
binary_data = pd.read_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/result.txt',
    header=None,
)
binary_data

Unnamed: 0,0,1,2,3,4,5
0,0,33,131073,258,130,65537
1,1,65540,68,32772,520,514
2,1,8193,1032,257,68,8196
3,1,257,131076,131074,34,1028
4,0,16388,2050,65,32770,8200
...,...,...,...,...,...,...
999995,1,36,65537,8194,8200,1026
999996,1,132,520,2050,257,136
999997,1,32769,2056,8196,131073,2050
999998,1,32772,4097,33,131076,4098


Name each column:

In [12]:
# The target column comes first in result.txt, followed by the five card codes.
binary_data.columns = ['Combination', *('A{}'.format(i) for i in range(1, 6))]
binary_data.head()

Unnamed: 0,Combination,A1,A2,A3,A4,A5
0,0,33,131073,258,130,65537
1,1,65540,68,32772,520,514
2,1,8193,1032,257,68,8196
3,1,257,131076,131074,34,1028
4,0,16388,2050,65,32770,8200


Rewrite each "hand" so that the leftmost card is the weakest and the rightmost card is the strongest:

In [13]:
# Sort the five card codes of every hand in ascending order and write
# "A1,A2,A3,A4,A5,Combination" lines (no header, no index).
# np.sort(..., axis=1) sorts within each row in one vectorised call, replacing
# the earlier per-row iterrows() + sorted() + print loop.
hand_columns = ['A{}'.format(i) for i in range(1, 6)]
sorted_cards = np.sort(binary_data[hand_columns].to_numpy(), axis=1)
sorted_hands = pd.DataFrame(sorted_cards, columns=hand_columns)
# Row order is unchanged by the row-wise sort, so labels line up positionally.
sorted_hands['Combination'] = binary_data['Combination'].to_numpy()
sorted_hands.to_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/sorted_binary_data.txt',
    header=False,
    index=False,
)

In [14]:
# Reload the sorted hands; the five card codes come first, the target column last.
sorted_binary_data = pd.read_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/sorted_binary_data.txt',
    header=None,
)
sorted_binary_data.columns = [*('A{}'.format(i) for i in range(1, 6)), 'Combination']
sorted_binary_data.head()

Unnamed: 0,A1,A2,A3,A4,A5,Combination
0,33,130,258,65537,131073,0
1,68,514,520,32772,65540,1
2,68,257,1032,8193,8196,1
3,34,257,1028,131074,131076,1
4,65,2050,8200,16388,32770,0


Now apply the same algorithm to the second data file (`poker-hand-training-true.txt`), which this notebook loads as `test_data`:

In [15]:
# Load the second set of raw poker hands (about 25,000 rows, judging by the
# index of the displayed frames); the file has no header row.
# NOTE(review): this frame is called test_data but is read from the
# "training-true" file, while `data` is read from "poker-hand-testing.txt" —
# confirm that this swap is intentional.
test_data = pd.read_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/poker-hand-training-true.txt',
    header=None,
)


In [16]:
# Label the ten raw feature columns A1..A10, followed by the target column.
test_data.columns = [*('A{}'.format(i) for i in range(1, 11)), 'Combination']
test_data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Combination
0,1,10,1,11,1,13,1,12,1,1,9
1,2,11,2,13,2,10,2,12,2,1,9
2,3,12,3,11,3,13,3,10,3,1,9
3,4,10,4,11,4,1,4,13,4,12,9
4,4,1,4,13,4,12,4,11,4,10,9


In [17]:
# Ranks live in the even-numbered raw columns A(2k) ...
for position in range(1, 6):
    test_data['rank_{}'.format(position)] = test_data['A{}'.format(2 * position)]
# ... and suits in the odd-numbered raw columns A(2k - 1).
for position in range(1, 6):
    test_data['suit_{}'.format(position)] = test_data['A{}'.format(2 * position - 1)]

In [18]:
# Encode every suit value s as the single-bit number 2**(s - 1).
# The value is derived from the raw odd-numbered A-column (A1, A3, ...) instead
# of from suit_k itself, so re-running this cell gives the same result rather
# than exponentiating an already-encoded column a second time. This also
# matches how the rank columns are derived from the raw A-columns.
for position in range(1, 6):
    raw_suit = test_data['A{}'.format(2 * position - 1)]
    test_data['suit_{}'.format(position)] = 2 ** (raw_suit - 1)
test_data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Combination,rank_1,rank_2,rank_3,rank_4,rank_5,suit_1,suit_2,suit_3,suit_4,suit_5
0,1,10,1,11,1,13,1,12,1,1,9,10,11,13,12,1,1,1,1,1,1
1,2,11,2,13,2,10,2,12,2,1,9,11,13,10,12,1,2,2,2,2,2
2,3,12,3,11,3,13,3,10,3,1,9,12,11,13,10,1,4,4,4,4,4
3,4,10,4,11,4,1,4,13,4,12,9,10,11,1,13,12,8,8,8,8,8
4,4,1,4,13,4,12,4,11,4,10,9,1,13,12,11,10,8,8,8,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25005,3,9,2,6,4,11,4,12,2,4,0,9,6,11,12,4,4,2,8,8,2
25006,4,1,4,10,3,13,3,4,1,10,1,1,10,13,4,10,8,8,4,4,1
25007,2,1,2,10,4,4,4,1,4,13,1,1,10,4,1,13,2,2,8,8,8
25008,2,12,4,3,1,10,1,12,4,9,1,12,3,10,12,9,2,8,1,1,8


In [19]:
# Encode every rank value r as the single-bit number 2**r, multiplied by 16
# (i.e. shifted four bits to the left of the suit bits).
for position in range(1, 6):
    raw_rank = test_data['A{}'.format(2 * position)]
    test_data['rank_{}'.format(position)] = 16 * (2 ** raw_rank)
test_data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Combination,rank_1,rank_2,rank_3,rank_4,rank_5,suit_1,suit_2,suit_3,suit_4,suit_5
0,1,10,1,11,1,13,1,12,1,1,9,16384,32768,131072,65536,32,1,1,1,1,1
1,2,11,2,13,2,10,2,12,2,1,9,32768,131072,16384,65536,32,2,2,2,2,2
2,3,12,3,11,3,13,3,10,3,1,9,65536,32768,131072,16384,32,4,4,4,4,4
3,4,10,4,11,4,1,4,13,4,12,9,16384,32768,32,131072,65536,8,8,8,8,8
4,4,1,4,13,4,12,4,11,4,10,9,32,131072,65536,32768,16384,8,8,8,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25005,3,9,2,6,4,11,4,12,2,4,0,8192,1024,32768,65536,256,4,2,8,8,2
25006,4,1,4,10,3,13,3,4,1,10,1,32,16384,131072,256,16384,8,8,4,4,1
25007,2,1,2,10,4,4,4,1,4,13,1,32,16384,256,32,131072,2,2,8,8,8
25008,2,12,4,3,1,10,1,12,4,9,1,65536,128,16384,65536,8192,2,8,1,1,8


In [20]:
# Merge the encoded suit and rank of each card into a single card code.
for position in range(1, 6):
    suit_bits = test_data['suit_{}'.format(position)]
    rank_bits = test_data['rank_{}'.format(position)]
    test_data['card_{}'.format(position)] = suit_bits + rank_bits
test_data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Combination,rank_1,rank_2,rank_3,rank_4,rank_5,suit_1,suit_2,suit_3,suit_4,suit_5,card_1,card_2,card_3,card_4,card_5
0,1,10,1,11,1,13,1,12,1,1,9,16384,32768,131072,65536,32,1,1,1,1,1,16385,32769,131073,65537,33
1,2,11,2,13,2,10,2,12,2,1,9,32768,131072,16384,65536,32,2,2,2,2,2,32770,131074,16386,65538,34
2,3,12,3,11,3,13,3,10,3,1,9,65536,32768,131072,16384,32,4,4,4,4,4,65540,32772,131076,16388,36
3,4,10,4,11,4,1,4,13,4,12,9,16384,32768,32,131072,65536,8,8,8,8,8,16392,32776,40,131080,65544
4,4,1,4,13,4,12,4,11,4,10,9,32,131072,65536,32768,16384,8,8,8,8,8,40,131080,65544,32776,16392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25005,3,9,2,6,4,11,4,12,2,4,0,8192,1024,32768,65536,256,4,2,8,8,2,8196,1026,32776,65544,258
25006,4,1,4,10,3,13,3,4,1,10,1,32,16384,131072,256,16384,8,8,4,4,1,40,16392,131076,260,16385
25007,2,1,2,10,4,4,4,1,4,13,1,32,16384,256,32,131072,2,2,8,8,8,34,16386,264,40,131080
25008,2,12,4,3,1,10,1,12,4,9,1,65536,128,16384,65536,8192,2,8,1,1,8,65538,136,16385,65537,8200


In [21]:
# Write one line per hand: Combination,card_1,...,card_5 (no header, no index).
# DataFrame.to_csv writes the whole frame in one vectorised call instead of a
# row-by-row iterrows()/print loop. The selected columns hold integers, so the
# text written matches what the previous '%0.0f' formatting produced.
encoded_columns = ['Combination'] + ['card_{}'.format(i) for i in range(1, 6)]
test_data[encoded_columns].to_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/test.txt',
    header=False,
    index=False,
)

In [22]:
# Reload the encoded hands from test.txt; the file has no header row.
binary_data2 = pd.read_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/test.txt',
    header=None,
)
binary_data2

Unnamed: 0,0,1,2,3,4,5
0,9,16385,32769,131073,65537,33
1,9,32770,131074,16386,65538,34
2,9,65540,32772,131076,16388,36
3,9,16392,32776,40,131080,65544
4,9,40,131080,65544,32776,16392
...,...,...,...,...,...,...
25005,0,8196,1026,32776,65544,258
25006,1,40,16392,131076,260,16385
25007,1,34,16386,264,40,131080
25008,1,65538,136,16385,65537,8200


In [23]:
# The target column comes first in test.txt, followed by the five card codes.
binary_data2.columns = ['Combination', *('A{}'.format(i) for i in range(1, 6))]
binary_data2.head()

Unnamed: 0,Combination,A1,A2,A3,A4,A5
0,9,16385,32769,131073,65537,33
1,9,32770,131074,16386,65538,34
2,9,65540,32772,131076,16388,36
3,9,16392,32776,40,131080,65544
4,9,40,131080,65544,32776,16392


In [24]:
# Sort the five card codes of every hand in ascending order and write
# "A1,A2,A3,A4,A5,Combination" lines (no header, no index).
# np.sort(..., axis=1) sorts within each row in one vectorised call, replacing
# the earlier per-row iterrows() + sorted() + print loop.
hand_columns = ['A{}'.format(i) for i in range(1, 6)]
sorted_cards = np.sort(binary_data2[hand_columns].to_numpy(), axis=1)
sorted_hands = pd.DataFrame(sorted_cards, columns=hand_columns)
# Row order is unchanged by the row-wise sort, so labels line up positionally.
sorted_hands['Combination'] = binary_data2['Combination'].to_numpy()
sorted_hands.to_csv(
    'C:/Users/Marianna Rybnikova/Desktop/Graphs_NN/sorted_binary_data_TEST.txt',
    header=False,
    index=False,
)