# Ghouls, Goblins, and Ghosts
## Multiclass Classification Task

Falconi Nicasio

April 23, 2019

In [6]:
# load data
import pandas as pd
# Both competition splits live side by side in the shared input directory.
train_path = "../input/train.csv"
test_path = "../input/test.csv"
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [7]:
from sklearn.preprocessing import LabelEncoder

# encode color features
# Fit the encoder on the training colors only and reuse that same mapping for
# the test set. Re-fitting on the test set could assign different integers to
# the same color whenever the two sets do not contain identical color values,
# silently corrupting the feature the model was trained on.
gle = LabelEncoder()
train_color_labels = gle.fit_transform(train_data['color'])
# transform() raises ValueError if the test set holds a color unseen in training
test_color_labels = gle.transform(test_data['color'])
train_data['color_labels'] = train_color_labels
test_data['color_labels'] = test_color_labels

In [8]:
# Pull the label column aside, then strip every non-feature column.
# 'color' is dropped because its integer-encoded copy ('color_labels') stays.
non_feature_columns = ['id', 'color']
Y = train_data['type']
train_data = train_data.drop(columns=['type'] + non_feature_columns)
test_data = test_data.drop(columns=non_feature_columns)

In [9]:
# standardize values
# The scaling statistics come from the training set only and are applied to
# both frames, so the test features live on the same scale the model is fitted
# on. They are captured before train_data is overwritten; computing them
# afterwards would yield mean 0 / std 1 and leave the test set unscaled.
feature_mean = train_data.mean()
feature_std = train_data.std()
train_data = (train_data - feature_mean) / feature_std
test_data = (test_data - feature_mean) / feature_std

In [10]:
# summary statistics of the standardized training features
train_data.describe()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color_labels
count,371.0,371.0,371.0,371.0,371.0
mean,4.387027e-16,-5.284781e-16,-9.288766e-16,2.503538e-15,-4.488772e-18
std,1.0,1.0,1.0,1.0,1.0
min,-2.809001,-2.809293,-2.322011,-2.623018,-2.107595
25%,-0.7088173,-0.6288435,-0.7162151,-0.7005625,-0.2503082
50%,0.005504366,-0.03619028,0.05608044,-0.02850387,-0.2503082
75%,0.6253172,0.6636377,0.695285,0.7336534,0.9878831
max,2.882125,2.908063,2.771516,2.636297,0.9878831


In [11]:
# check pairwise feature correlations for possible feature engineering
train_data.corr()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color_labels
bone_length,1.0,-0.041716,0.353881,0.381675,-0.03383
rotting_flesh,-0.041716,1.0,-0.220353,-0.132051,-0.041714
hair_length,0.353881,-0.220353,1.0,0.474835,0.009093
has_soul,0.381675,-0.132051,0.474835,1.0,-0.025546
color_labels,-0.03383,-0.041714,0.009093,-0.025546,1.0


In [12]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# initiate model: one L2-regularised logistic regression per class.
# random_state is pinned because the 'sag' solver in the grid below shuffles
# the data; without a seed its cross-validation scores can differ between runs.
model_to_set = OneVsRestClassifier(
    LogisticRegression(penalty='l2', max_iter=1000, random_state=0)
)

# create parameter options
# The "estimator__" prefix routes each setting to the LogisticRegression
# wrapped inside OneVsRestClassifier.
parameters = {
    "estimator__C": [0.1, 0.5, 0.7, 1.0, 1.2, 2, 5, 10, 20, 100],
    "estimator__solver": ['newton-cg', 'lbfgs', 'sag'],
    "estimator__multi_class": ['multinomial', 'ovr'],
}

# Cross validation: exhaustive search over the grid, scored with 2 folds
model_tunning = GridSearchCV(model_to_set, param_grid=parameters, cv=2)

model_tunning.fit(train_data, Y)

print(model_tunning.best_score_)
print(model_tunning.best_params_)

0.738544474393531
{'estimator__C': 0.1, 'estimator__multi_class': 'ovr', 'estimator__solver': 'newton-cg'}


In [13]:
# create model with the hyper-parameters selected by the grid search;
# max_iter matches the value used during tuning so the final fit runs under
# the same iteration budget as the cross-validated candidates instead of
# silently falling back to the library default
ovr = OneVsRestClassifier(
    LogisticRegression(
        penalty="l2",
        C=0.1,
        multi_class='ovr',
        solver='newton-cg',
        max_iter=1000,
    )
)
# train model on the full training set
ovr_fitted = ovr.fit(train_data, Y)
# predict test_data
res = ovr_fitted.predict(test_data)

In [14]:
# Write the submission: reuse the sample file's layout and replace its 'type'
# column with the predicted labels.
sample_data = pd.read_csv("../input/sample_submission.csv").assign(type=res)
sample_data.to_csv('prediction.csv', index=False)