用神经网络对Otto商品分类预测

In [None]:
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
import matplotlib.pyplot as plt

从../input/train.csv读入数据

In [None]:
# Load the training set from ../input/train.csv into a DataFrame.
data = pd.read_csv('../input/train.csv')

In [None]:
# Preview the first rows of the training data.
data.head()

In [None]:
# Show the dtype of every column in the training data.
data.dtypes

In [None]:
# Every column except the first and the last one is treated as a feature.
columns = data.columns[1:-1]
X = data.loc[:, columns]
# Flatten the 'target' column into a 1-D NumPy array of labels.
y = np.asarray(data['target']).ravel()

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the labels (an array of strings).
y

观察商品种类的分布

In [None]:
# Count the number of rows in each 'target' class and print the counts.
distribution = data.groupby('target').size()
print(distribution)

In [None]:
# Express each class's row count as a percentage of all rows.
class_counts = data.groupby('target').size()
total_rows = data.shape[0]
distribution = class_counts / total_rows * 100.0
# Draw the percentages as a bar chart.
distribution.plot.bar()
plt.show()

显示一个feature在不同类下的分布图（如果一个feature在不同类下的分布不一样，就表示这个feature挺有用的）

In [None]:
# Histogram of feat_20 for each class, Class_1 through Class_9, on a 3x3 grid.
# The loop variable is named `class_id` so that it does not shadow the
# built-in id() for the rest of the kernel session.
for class_id in range(1, 10):
    label = 'Class_' + str(class_id)
    panel = plt.subplot(3, 3, class_id)
    # Draw on the panel explicitly instead of relying on the "current axes".
    data.loc[data['target'] == label, 'feat_20'].hist(ax=panel)
    panel.set_title(label)  # lets the nine panels be told apart
plt.tight_layout()  # keep the titles from overlapping neighbouring panels
plt.show()

In [None]:
# Scatter plot of feat_19 (x axis) against feat_20 (y axis).
x_name, y_name = 'feat_19', 'feat_20'
plt.scatter(data[x_name], data[y_name])
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.show()  # original author's observation: the two look negatively correlated

显示所有feature的相关系数矩阵

In [None]:
# Correlation matrix of all features; a diagonal entry is a feature's
# correlation with itself, which is expected to be 1.
X.corr()

In [None]:
# Show the feature correlation matrix as a colour-coded image with a colour bar.
fig, ax = plt.subplots()  # one figure containing a single axes
cax = ax.matshow(X.corr(), interpolation='nearest')
fig.colorbar(cax)
plt.show()

In [None]:
# Number of feature columns in X.
num_fea = X.shape[1]
num_fea

初始化神经网络模型，两个隐藏层，整个网络为93x30x10x9(Class 1-9) 
输入层93个神经元，第一隐藏层30个神经元，第二隐藏层10个神经元，输出层9个神经元（对应Class 1-9）。
神经网络最后输出每个类别的概率，取概率最大的类别作为预测结果。

In [None]:
# Hyper-parameters of the multi-layer perceptron:
#   solver             - optimisation method; 'lbfgs' is used here
#   alpha              - L2 regularisation coefficient
#   hidden_layer_sizes - two hidden layers with 30 and 10 units
#   random_state       - fixed seed so that repeated runs give the same result
#   verbose            - print progress messages
mlp_params = dict(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(30, 10),
    random_state=1,
    verbose=True,
)
model = MLPClassifier(**mlp_params)

In [None]:
# Train on the full training set. No activation was passed to the
# constructor, so the library default ('relu') is used.
model.fit(X,y)

In [None]:
# Intercepts (bias terms), one array per layer transition.
# See slides ("ppt") page 16: 93 -> 30 gives 30 intercepts, 30 -> 10 gives 10,
# and 10 -> 9 gives 9.
model.intercepts_ 

In [None]:
# Coefficients (weights) of the first layer transition.
model.coefs_[0]

In [None]:
# Shape of the first weight matrix (expected 93 x 30 per the network description).
model.coefs_[0].shape

In [None]:
# Total parameter count: 30 + 10 + 9 intercepts plus 93*30 + 30*10 + 10*9
# weights (the numbers shown on slides page 16).
# Print the weight-matrix shape of each layer transition; intercepts are
# stored separately and are not part of these shapes.
for layer_weights in model.coefs_:
    print(layer_weights.shape)

In [None]:
# Predict a class label for every training row and display the predictions.
pred = model.predict(X)
pred

输出训练数据上的准确度

In [None]:
# Accuracy on the training data (the same X and y the model was fitted on).
model.score(X, y)

In [None]:
# The accuracy can also be computed by hand: the fraction of predictions that
# equal the true label. np.mean works on the boolean array in one vectorised
# step, whereas the built-in sum() iterates over it element by element.
np.mean(pred == y)

在测试数据../input/test.csv上进行预测

In [None]:
# Load the test set from ../input/test.csv and show its (rows, columns) shape.
test_data = pd.read_csv('../input/test.csv')
test_data.shape

In [None]:
# Preview the first rows of the test set that was just loaded
# (the original previewed the training frame `data` here by mistake).
test_data.head()

In [None]:
# Keep every column except the first one as the test feature matrix.
Xtest = test_data.iloc[:, 1:]
Xtest.head()

In [None]:
# Predicted class probabilities for every test row; show the array's shape.
test_prob = model.predict_proba(Xtest)
test_prob.shape

In [None]:
# Display the predicted probabilities.
test_prob

In [None]:
# Sanity check: the per-class probabilities of each row should add up to 1.
np.sum(test_prob, axis = 1)

输出为对每一个商品预测出的属于每一种类别的概率，并加入id列，输出到./predict.csv里

In [None]:
# Wrap the probabilities in a DataFrame. predict_proba orders its columns like
# model.classes_, so the labels are taken from the fitted model instead of
# being hard-coded; each column is then guaranteed to carry the right name.
solution = pd.DataFrame(test_prob, columns = model.classes_)

In [None]:
# Add the id column from the test data, then preview the result.
solution['id'] = test_data['id']
solution.head()

In [None]:
# Shape of the submission frame.
solution.shape

In [None]:
# Put 'id' first and keep the remaining columns in their current order.
# Selecting 'id' by name makes the cell safe to re-run; the original rotated
# whatever column was last to the front, so a second run moved 'Class_9'.
cols = ['id'] + [name for name in solution.columns if name != 'id']
solution = solution[cols]
solution

In [None]:
# Write the submission to predict.csv, omitting the DataFrame index.
solution.to_csv('predict.csv', index = False)