用神经网络对Otto商品分类预测

In [None]:
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import metrics
import matplotlib.pyplot as plt

从../input/train.csv读入数据

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
data = pd.read_csv('../input/train.csv')
data.head()

数据中从第二列到倒数第二列是feature（第一列id和最后一列target不作为feature）

In [None]:
# Feature column names: everything except the first and the last column.
columns = list(data.columns[1:-1])

In [None]:
# Feature matrix: all rows, restricted to the selected feature columns.
X = data.loc[:, columns]

In [None]:
# Target labels as a flat 1-D NumPy array.
y = data['target'].to_numpy().ravel()

观察商品种类的分布

In [None]:
# Share of rows belonging to each target class, expressed in percent,
# drawn as one bar per class.
df = data.groupby('target').size() / len(data) * 100.
df.plot.bar()
plt.show()

显示feature 20在不同类下的分布图

In [None]:
# One histogram of feat_20 per target class on a 3x3 grid.
# The frame is grouped once (instead of being filtered once per class with a
# hand-built 'Class_<n>' label) and every panel is titled with its class so
# the nine plots can be told apart.
hist_fig, hist_axes = plt.subplots(3, 3)
for panel, (class_label, class_rows) in zip(hist_axes.ravel(), data.groupby('target')):
    class_rows['feat_20'].hist(ax=panel)
    panel.set_title(class_label)
# Keep titles and tick labels of neighbouring panels from overlapping.
hist_fig.tight_layout()
plt.show()

显示feature 19和feature 20的散点图

In [None]:
# Scatter plot with feat_19 on the x-axis and feat_20 on the y-axis,
# one point per row of the training data.
plt.scatter(x=data['feat_19'], y=data['feat_20'])
plt.show()

显示所有feature的相关系数矩阵

In [None]:
# Draw the pairwise correlation matrix of the feature columns as an image,
# with a colour bar giving the scale.
fig, ax = plt.subplots()
cax = ax.matshow(X.corr())
fig.colorbar(cax)
plt.show()

In [None]:
# Number of feature columns in X.
num_fea = len(X.columns)
num_fea

初始化神经网络模型，两个隐藏层，整个网络为93x30x10x9

In [None]:
# Multi-layer perceptron with two hidden layers of 30 and 10 units.
# random_state is pinned so repeated runs start from the same state;
# verbose=3 turns on progress output while fitting.
model = MLPClassifier(
    hidden_layer_sizes=(30, 10),
    activation='relu',
    solver='lbfgs',
    alpha=1e-5,
    verbose=3,
    random_state=0,
)

训练需要约1分钟

In [None]:
# Train the network on all rows of X / y; nothing is held out before fitting.
model.fit(X, y)

观察模型系数和bias

In [None]:
# Learned bias vectors of the fitted network (a list with one array per
# layer after the input layer).
model.intercepts_

In [None]:
# Print the shape of every weight matrix of the fitted network, in layer
# order.  Iterating over coefs_ (rather than indexing 0..2 by hand) keeps this
# cell correct if hidden_layer_sizes is changed to more or fewer layers.
for layer_weights in model.coefs_:
    print(layer_weights.shape)

In [None]:
# Predicted class label for every row of X (the same data passed to fit).
pred = model.predict(X)
pred

输出训练数据上的准确度

In [None]:
# Mean accuracy on X / y, i.e. on the data the model was fitted on, so this
# is a training-set figure rather than an estimate of generalisation.
model.score(X, y)

In [None]:
# Manual cross-check of the accuracy: fraction of rows whose predicted label
# equals the true label.  np.mean reduces the boolean array in one vectorised
# call instead of iterating over it element by element with the builtin sum.
np.mean(pred == y)

In [None]:
# Scores from 4-fold cross-validation of the same estimator on X / y.
cross_val_score(estimator=model, X=X, y=y, cv=4)

在测试数据../input/test.csv上进行预测

In [None]:
# Load the rows to be predicted; the 'id' column of this frame is carried
# over into the prediction table built below.
test = pd.read_csv('../input/test.csv')

输出为对每一个商品预测出的属于每一种类别的概率，并加入id列，输出到./otto_predictions.csv里

In [None]:
# Class-membership probabilities for every test row.  The feature columns are
# selected by name, using the same `columns` list the model was trained on,
# instead of by position, so a reordered or extended test file cannot silently
# feed misaligned features to the network.
preds_test = model.predict_proba(test[columns])

# predict_proba returns its columns in the order of model.classes_, so the
# labels are taken from the fitted model rather than rebuilt by hand; this
# guarantees each probability column carries the right class name.
result = pd.DataFrame(preds_test, columns=model.classes_)

# Put the id first.  Values are inserted positionally (row i of the
# predictions belongs to row i of `test`).
result.insert(0, 'id', test['id'].to_numpy())

In [None]:
# Write the prediction table as CSV, leaving out the DataFrame index.
result.to_csv('./otto_predictions.csv', index=False)