用神经网络对Otto商品分类预测

In [None]:
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
import matplotlib.pyplot as plt

从../input/train.csv读入数据

In [None]:
# Load the training set into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/train.csv')

In [None]:
# Preview the first five rows of the training data.
df.head(n=5)

数据中从第二列到倒数第二列是feature（第一列是id，最后一列是target）

In [None]:
# Feature columns: everything except the first and the last column.
feature_slice = slice(1, -1)
columns = df.columns[feature_slice]
columns

In [None]:
# Feature matrix: all rows, feature columns only.
X = df.loc[:, columns]

In [None]:
# Target labels as a flat 1-D NumPy array.
y = np.asarray(df['target']).ravel()
y

观察商品种类的分布

In [None]:
# Bar chart of how many samples fall into each product class.
class_counts = df['target'].value_counts()
class_counts.plot(kind='bar')

显示feature 20在不同类下的分布图

In [None]:
# One histogram of feat_20 per class, laid out on a 3x3 grid.
# The classes are sorted so that the panel order is reproducible: iterating a
# bare set of string labels depends on hash ordering and can change between
# kernel restarts. The loop index is named `idx` to avoid shadowing the
# builtin `id`.
for idx, class_i in enumerate(sorted(set(y))):
    plt.subplot(3, 3, idx + 1)
    df[df.target == class_i].feat_20.hist()
    # Label each panel so the classes can be told apart.
    plt.title(str(class_i))
plt.tight_layout()
plt.show()

显示feature 19和feature 20的散点图

In [None]:
# Scatter plot of feat_19 (x axis) against feat_20 (y axis).
feat_19_values = np.ravel(df['feat_19'])
feat_20_values = np.ravel(df['feat_20'])
plt.scatter(feat_19_values, feat_20_values)

显示所有feature的相关系数矩阵

In [None]:
# Render the feature-to-feature correlation matrix as a colour-coded image.
fig, ax = plt.subplots()
corr_matrix = X.corr()
cax = ax.matshow(corr_matrix, interpolation='nearest')
fig.colorbar(cax)
plt.show()

In [None]:
# Number of input features (columns of X).
num_fea = len(X.columns)

初始化神经网络模型，两个隐藏层，整个网络为93x30x10x9

In [None]:
# Multi-layer perceptron with two hidden layers (30 and 10 units), trained
# with the L-BFGS solver; random_state is fixed for reproducibility.
model = MLPClassifier(
    hidden_layer_sizes=(30, 10),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
    verbose=True,
)

训练需要约1分钟

In [None]:
# Train the network on the full training set.
model.fit(X=X, y=y)

观察模型系数和bias

In [None]:
# Display the fitted network's intercepts_ attribute (the bias terms).
model.intercepts_

In [None]:
# Print the shape of each of the three weight matrices in turn.
for layer_idx in range(3):
    print(model.coefs_[layer_idx].shape)

In [None]:
# Predicted class labels for the training samples.
pred = model.predict(X=X)
pred

输出训练数据上的准确度

In [None]:
# Mean accuracy of the model on the training data.
model.score(X=X, y=y)

In [None]:
# Cross-check model.score by hand: the fraction of training samples whose
# predicted label equals the true label. np.mean is vectorised (the builtin
# sum would iterate the array element by element) and always yields a float,
# regardless of integer-division semantics.
np.mean(pred == y)

在测试数据../input/test.csv上进行预测

In [None]:
# Load the test set into a DataFrame.
pd_test = pd.read_csv(filepath_or_buffer='../input/test.csv')

In [None]:
# Test-set feature columns: everything after the first column.
features = pd_test.columns[slice(1, None)]
features

In [None]:
# The model was fit on the training columns in `columns`; fail loudly if the
# test features differ in name or order, rather than silently feeding
# misaligned inputs to the network.
assert list(features) == list(columns), "test features differ from training features"
X_test = pd_test[features]
# Preview only the first rows, consistent with the other cells.
X_test.head()

In [None]:
# Per-class predicted probabilities for every test sample
# (despite its name, y_test holds probabilities, not labels).
y_test = model.predict_proba(X=X_test)
y_test

输出为对每一个商品预测出的属于每一种类别的概率，并加入id列，输出到./otto_prediction.csv里

In [None]:
# Wrap the probability matrix in a DataFrame. The columns of predict_proba are
# ordered as in model.classes_, so the labels are taken from there instead of
# a hard-coded list; this keeps each probability column attached to the class
# it actually belongs to.
solution = pd.DataFrame(y_test, columns=model.classes_)
solution.head()

In [None]:
# Attach the test-set ids to the predictions.
solution['id'] = pd_test['id']
solution.head()

In [None]:
# Reorder the columns so that id comes first, followed by Class_1 .. Class_9.
ordered_columns = ['id']
ordered_columns.extend('Class_' + str(i) for i in range(1, 10))
solution = solution[ordered_columns]
solution.head()

In [None]:
# Write the submission file without the DataFrame index.
solution.to_csv(path_or_buf='./otto_prediction.csv', index=False)