# Ghouls-Goblins-Ghosts
Classification for kaggle problem

## Formatting 

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.metrics import accuracy_score
%matplotlib inline

In [135]:
train = pd.read_csv('resources/train.csv')
test = pd.read_csv('resources/test.csv')

In [136]:
train.shape, test.shape

((371, 7), (529, 6))

In [137]:
train.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,4,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,5,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [138]:
train_y = train['type']
train_x = train.drop(['id', 'type'], axis=1)

In [139]:
test.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color
0,3,0.471774,0.387937,0.706087,0.698537,black
1,6,0.427332,0.645024,0.565558,0.451462,white
2,9,0.549602,0.491931,0.660387,0.449809,black
3,10,0.638095,0.682867,0.471409,0.356924,white
4,13,0.361762,0.583997,0.377256,0.276364,black


In [140]:
test_x = test.drop(['id'], axis=1)

In [141]:
dataset = pd.concat([train_x, test_x], axis=0)

In [142]:
dataset.shape

(900, 5)

## Preprocessing

Check for missing values

In [143]:
for col in dataset.columns:
    if np.any(pd.isnull(dataset[col])):
        print(col)

Convert categorical attributes to numeric

In [144]:
set(dataset['color'])

{'black', 'blood', 'blue', 'clear', 'green', 'white'}

In [145]:
encoder = LabelEncoder()
dataset['color'] = encoder.fit_transform(dataset['color'])

In [146]:
set(train_y)

{'Ghost', 'Ghoul', 'Goblin'}

In [147]:
encoder = LabelBinarizer()
train_y = encoder.fit_transform(train_y)

In [148]:
dataset.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color
0,0.354512,0.350839,0.465761,0.781142,3
1,0.57556,0.425868,0.531401,0.439899,4
2,0.467875,0.35433,0.811616,0.791225,0
3,0.776652,0.508723,0.636766,0.884464,0
4,0.566117,0.875862,0.418594,0.636438,4


## Normalization 

In [149]:
normalize = StandardScaler()
dataset = normalize.fit_transform(dataset)

In [150]:
dataset

array([[-0.55352147, -1.00416702, -0.33005656,  1.76375272, -0.24332911],
       [ 1.08789151, -0.51531896,  0.05406109, -0.15279153,  0.3735616 ],
       [ 0.28826969, -0.98141913,  1.69383282,  1.82038426, -2.09400126],
       ..., 
       [ 0.60476305, -0.04308772,  1.08760456,  1.38752468,  0.99045231],
       [-0.72460763,  1.69970305, -1.0764836 , -1.53704571, -0.24332911],
       [-1.27917377,  1.21735271, -0.62594951, -0.6634746 ,  0.99045231]])

In [151]:
train_x = dataset[:train_x.shape[0],:]
test_x = dataset[train_x.shape[0]:,:]

In [152]:
train_x.shape, test_x.shape

((371, 5), (529, 5))

## MLP Classifier

In [213]:
in_dim = train_x.shape[1]
hid_dim_1 = 4
hid_dim_2 = 4
out_dim = 3

w0 = np.random.normal(0., 0.25, (in_dim, hid_dim_1))
w1 = np.random.normal(0., 0.25, (hid_dim_1, hid_dim_2))
w2 = np.random.normal(0., 0.25, (hid_dim_2, out_dim))

b0 = np.random.normal(0., 0.25, (1, hid_dim_1))
b1 = np.random.normal(0., 0.25, (1, hid_dim_2))
b2 = np.random.normal(0., 0.25, (1, out_dim))

In [214]:
def sigmoid(x, deriv = False):
    if deriv:
        return sigmoid(x) * (1 - sigmoid(x))
    return 1 / (1 + np.exp(-x))

In [215]:
num_itr = 70000
batch_size = 100
alpha = 0.01

idx = np.arange(train_x.shape[0])
np.random.shuffle(idx)
id_val = idx[:batch_size]
id_train = idx[batch_size:]
valid_set = train_x[id_val]
valid_labels = train_y[id_val]
X = train_x[id_train]
Y = train_y[id_train]

for i in range(num_itr):
    z1 = np.dot(X,w0) + b0
    a1 = sigmoid(z1)
    z2 = np.dot(a1,w1) + b1
    a2 = sigmoid(z2)
    z3 = np.dot(a2,w2) + b2
    a3 = np.exp(z3) / np.sum(np.exp(z3), axis=1).reshape(-1,1)
    
    loss = -np.sum((Y * np.log(a3)) + ((1 - Y) * np.log(1 - a3))) / Y.shape[0]

    dw2 = a3 - Y
    dw1 = np.dot(dw2, w2.T) * sigmoid(np.dot(a1, w1), deriv=True)
    dw0 = np.dot(dw1, w1.T) * sigmoid(np.dot(X, w0), deriv=True)

    db2 = a3 - Y
    db1 = np.dot(dw2, w2.T) * sigmoid(np.dot(a1, w1), deriv=True)
    db0 = np.dot(dw1, w1.T) * sigmoid(np.dot(X, w0), deriv=True)

    w2 -= (alpha * np.dot(a2.T, dw2)) / Y.shape[0]
    w1 -= (alpha * np.dot(a1.T, dw1)) / Y.shape[0]
    w0 -= (alpha * np.dot(X.T, dw0)) / Y.shape[0]
    
    b2 -= (alpha * np.dot(np.ones((X.shape[0], 1)).T, db2))/ Y.shape[0]
    b1 -= (alpha * np.dot(np.ones((X.shape[0], 1)).T, db1))/ Y.shape[0]
    b0 -= (alpha * np.dot(np.ones((X.shape[0], 1)).T, db0))/ Y.shape[0]
    
    if i%10000==0:
        z1 = np.dot(valid_set,w0) + b0
        a1 = sigmoid(z1)
        z2 = np.dot(a1,w1) + b1
        a2 = sigmoid(z2)
        z3 = np.dot(a2,w2) + b2
        a3 = np.exp(z3) / np.sum(np.exp(z3), axis=1).reshape(-1,1)
        pred = np.argmax(a3, axis=1).reshape(-1,1)
        target = np.argmax(valid_labels, axis=1).reshape(-1,1)
        print(accuracy_score(pred, target), loss)

0.39 1.9256026928
0.69 1.23861075972
0.71 1.12048515972
0.75 1.03956640379
0.75 1.01493392943
0.74 1.00012107152
0.75 0.987208891546


In [216]:
z1 = np.dot(test_x,w0) + b0
a1 = sigmoid(z1)
z2 = np.dot(a1,w1) + b1
a2 = sigmoid(z2)
z3 = np.dot(a2,w2) + b2
a3 = np.exp(z3) / np.sum(np.exp(z3), axis=1).reshape(-1,1)
pred = np.argmax(a3, axis=1).reshape(-1,1)

In [217]:
result = pd.DataFrame()
result['id'] = test['id']
result['type'] = pred

In [218]:
result['type'] = result['type'].replace({1:'Ghoul', 2:'Goblin', 0:'Ghost'})

In [219]:
result.head()

Unnamed: 0,id,type
0,3,Ghoul
1,6,Goblin
2,9,Ghoul
3,10,Goblin
4,13,Ghost


In [220]:
result.to_csv('submission/NN_sub_3.csv', index=False)