用神经网络对Otto商品分类预测

In [None]:
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

从../input/train.csv和../input/test.csv分别读入训练数据和测试数据

In [None]:
# Read the training and test CSV files into DataFrames.
data = pd.read_csv('../input/train.csv', sep=',')
data_test = pd.read_csv('../input/test.csv', sep=',')

数据中去掉第一列和最后一列后，从第二列到倒数第二列是feature

In [None]:
# Feature column labels: every column except the first and the last one.
columns = data.columns[1:-1]

In [None]:
# Feature matrix used for training.
X = data[columns]

In [None]:
# Training labels as a flat 1-D NumPy array.
y = np.asarray(data['target']).ravel()

观察商品种类的分布

In [None]:
# Bar chart of how many training rows fall into each target class.
class_counts = data['target'].value_counts()
class_counts.plot(kind='bar')

显示feature 20在不同类下的分布图

In [None]:

# 3x3 grid of histograms: one panel per class (Class_1 .. Class_9), each
# showing the distribution of feat_20 within that class.
fig, axes = plt.subplots(
    3, 3,
    figsize=(10, 6),
    gridspec_kw={'hspace': 0.5, 'wspace': 0.4},
)
for class_no, ax in enumerate(axes.flat, start=1):
    label = 'Class_' + str(class_no)
    ax.hist(data[data.target == label].feat_20)
    ax.set_title('feat_20 in ' + label)

显示feature 19和feature 20的散点图

In [None]:
# Scatter plot of feat_19 (x axis) against feat_20 (y axis).
plt.scatter(x=data['feat_19'], y=data['feat_20'])

显示所有feature的相关系数矩阵

In [None]:
# Pairwise correlation between all feature columns, drawn as a heatmap
# with square cells.
feature_corr = X.corr()
sns.heatmap(feature_corr, square=True)

In [None]:
# Number of feature columns in the training matrix.
num_fea = len(X.columns)

这里用GridSearchCV来找最好的超参数。注意：根据本题的评分标准，scoring要用neg_log_loss，而不是之前常用的accuracy。

In [None]:
# Hyper-parameter search for the MLP, scored with negative log loss.
#
# The search was previously disabled by wrapping it in a string literal, which
# made the cell echo its own source as output and hid the code from syntax
# checking. It is now real code behind a flag that is off by default, so a
# plain "Run All" still skips it; set RUN_GRID_SEARCH to True to re-run it.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    from sklearn.model_selection import GridSearchCV

    # Candidate settings: every solver/activation combination, with a fixed
    # regularisation strength and a fixed two-hidden-layer architecture.
    param_grid = {
        "solver": ['lbfgs', 'sgd', 'adam'],
        "activation": ['logistic', 'relu'],
        "alpha": [1e-3],
        "hidden_layer_sizes": [(20, 30)],
    }
    gs = GridSearchCV(
        MLPClassifier(),
        scoring="neg_log_loss",
        cv=3,
        verbose=3,
        param_grid=param_grid,
    )
    gs.fit(X, y)
    print("best params", gs.best_params_, "best scores:", gs.best_score_)

根据刚才找到的最好超参数，用全数据来训练一个模型

In [None]:
# Train the final classifier on the full training set with fixed
# hyper-parameters; random_state pins the weight initialisation.
model = MLPClassifier(
    solver='lbfgs',
    activation='relu',
    alpha=1e-3,
    hidden_layer_sizes=(20, 30),
    random_state=1,
    verbose=False,
).fit(X, y)

# Display the learned bias vectors, one array per layer.
model.intercepts_


In [None]:
# Print the shape of each of the three weight matrices of the fitted model.
for layer_idx in range(3):
    print(model.coefs_[layer_idx].shape)

In [None]:
# Predicted class labels for the training rows themselves.
pred = model.predict(X)


输出训练数据上的准确度

In [None]:
# Accuracy on the training data. scikit-learn metrics take the ground truth
# first and the predictions second; accuracy happens to be symmetric, but
# keeping the documented order stays correct if the metric is later swapped
# for an asymmetric one (precision, recall, log loss, ...).
metrics.accuracy_score(y, pred)

In [None]:
# Preview the first rows of the test set, restricted to the feature columns.
data_test[columns].head()

在测试数据../input/test.csv上进行预测

In [None]:
# Per-class probabilities for every test row, using the same feature
# columns the model was trained on.
test_features = data_test[columns]
pred_test_proba = model.predict_proba(test_features)

输出为对每一个商品预测出的属于每一种类别的概率，并加入id列，输出到./my_otto_prediction.csv里

In [None]:
# Wrap the predicted probabilities in a DataFrame. The columns of
# predict_proba follow the order of model.classes_, so take the column labels
# from the fitted model instead of a hand-typed list that could silently
# disagree with that order.
output = pd.DataFrame(pred_test_proba, columns=model.classes_)

In [None]:
# Put the test-set id in front of the probability columns.
# DataFrame.insert raises ValueError if the column already exists, so guard
# the call to keep this cell safe to re-run. The ids are passed as a plain
# array so they are attached by position (the rows of pred_test_proba follow
# the row order of data_test) rather than by index alignment.
if 'id' not in output.columns:
    output.insert(0, 'id', data_test['id'].to_numpy())
output.head()

In [None]:
# Write the predictions to a CSV file, without the DataFrame index column.
output.to_csv('./my_otto_prediction.csv', index=False)