# 手写数字识别
https://www.kaggle.com/c/digit-recognizer

In [None]:
import pandas as pd # Dataframe
from sklearn.ensemble import RandomForestClassifier # Classification algorithm - random forest
from sklearn import metrics, grid_search
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
import numpy as np
import math
import random as rd
import pylab as pl
import matplotlib.pyplot as plt
%matplotlib inline


# 1. Load Data

In [None]:
# Read the Kaggle digit-recognizer CSV files from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Separate the target column from the feature columns and work on raw
# NumPy arrays from here on; the test file has no 'label' column to drop.
train_labels = train_df['label'].values
train_data = train_df.drop('label', axis=1).values
test_data = test_df.values

print("Loading finished.")
# Report the (rows, columns) shape of both frames.
for _caption, _frame in (("Train size:", train_df), ("Test size:", test_df)):
    print(_caption, _frame.shape)

# 2. 训练数据集含有42000条记录，每条记录代表了一个手写数字，每一个数字含有784个特征，代表了一个28*28的二维图像。训练集每条记录的第一个数字（label）代表了这个数字的真实数值，通常称为标签。测试数据集由28000个数字组成，没有标签值：这正是我们需要预测的。

In [None]:
# Preview the first five training rows (five is the default for head()).
train_df.head()

In [None]:
# Preview the first five test rows (five is the default for head()).
test_df.head()

# 3. 可视化数字：

In [None]:
# Reshape every flat 784-value row into a 28x28 image with one vectorised
# call instead of appending the rows one by one.
train_images = train_data.reshape(-1, 28, 28)

# Show at most the first ten digits (fewer if the dataset is smaller).
n_preview = min(10, len(train_images))

# Print the true labels once, space separated, in the same left-to-right
# order as the subplots. This replaces the Python 2 only `print x,` statement.
print(" ".join(str(label) for label in train_labels[:n_preview]))

plt.figure(figsize=(20, 10), dpi=600)
for i in range(n_preview):
    plt.subplot(1, 10, i + 1)
    # gray_r is the reversed grayscale colormap.
    plt.imshow(train_images[i], cmap=plt.cm.gray_r)
plt.show()

# 4.训练

In [None]:
# Baseline model: a random forest with the library's default hyper-parameters,
# fitted on the complete training set.
clf = RandomForestClassifier()
clf.fit(train_data,train_labels)

# 5.预测

In [None]:
# Predict a label for every row of the unlabeled test set.
predictions = clf.predict(test_data)
print ("Predicting finished.")

# 6.生成提交文件

In [None]:
# Build the submission table: ImageId is the 1-based row number of each test
# sample. The id range is derived from the number of predictions so that the
# frame is always consistent, instead of assuming exactly 28000 test rows.
image_ids = np.arange(1, len(predictions) + 1)
submission = pd.DataFrame({"ImageId": image_ids, "Label": predictions})
submission.to_csv('./submission.csv', index=False)
print("Submission created.")

# 7.训练集和验证集
* 现实应用中，纯粹的“训练->预测”模式过于简单，无法验证模型的效果
* 最常见的方式是把训练集拆分成两部分，一部分用于训练，另一部分用于验证

In [None]:
# Hold-out split: the first 32000 rows are used for training and every row
# after that for validation. Rows are taken in file order; nothing is shuffled.
_train_data, _val_data = train_data[:32000], train_data[32000:]
_train_labels, _val_labels = train_labels[:32000], train_labels[32000:]

# 8.训练-预测-验证
* 通过对验证集标签及预测结果的比较来判断模型的效果

In [None]:
# Refit the classifier on the training portion only, then predict the
# held-out validation rows.
clf.fit(_train_data, _train_labels)
_val_predictions = clf.predict(_val_data)

# scikit-learn metric functions take (y_true, y_pred) in that order. Passing
# them swapped exchanges precision with recall in the report and transposes
# the confusion matrix, so the true labels go first here.
print("Classification report for classifier %s:\n%s\n" % (clf, metrics.classification_report(_val_labels, _val_predictions)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(_val_labels, _val_predictions))
print("Accuracy score: %f" % metrics.accuracy_score(_val_labels, _val_predictions))

# 9.可视化预测错的数字

In [None]:
# Boolean mask selecting the validation rows whose prediction differs from the
# true label (`!=` replaces the Python 2 only `<>` operator).
_is_wrong = _val_labels != _val_predictions

# Misclassified samples as 28x28 images, with their true and predicted labels.
# These are NumPy arrays, index-aligned with each other.
wrong_images = _val_data[_is_wrong].reshape(-1, 28, 28)
wrong_labels = _val_labels[_is_wrong]
wrong_predictions = _val_predictions[_is_wrong]

# Show at most ten mistakes; there may be fewer (or none at all).
n_wrong_shown = min(10, len(wrong_images))

plt.figure(figsize=(20, 10), dpi=600)
for i in range(n_wrong_shown):
    plt.subplot(1, 10, i + 1)
    # %-formatting prints the same text on Python 2 and Python 3.
    print("Label: %s Prediction: %s" % (wrong_labels[i], wrong_predictions[i]))
    plt.imshow(wrong_images[i], cmap=plt.cm.gray_r)
plt.show()

# 10.参数调优
* 一个机器学习算法可能有很多个参数，如何对参数进行调优以获得最佳效果是机器学习最常见的挑战之一
* 最常见的方法是网格搜索（grid search）：对所有可能出现的参数值的排列组合进行验证
* 网格搜索的效率很低，更重要的是对模型的深入理解

In [None]:
# IPython-only help syntax (not valid plain Python): displays the documentation of `clf`.
clf?

In [None]:
def search_model(train_x, train_y, est, param_grid, n_jobs, cv):
    """Run an exhaustive grid search and report the best configuration.

    Args:
        train_x: Feature matrix handed to ``GridSearchCV.fit``.
        train_y: Target labels matching ``train_x``.
        est: Estimator whose hyper-parameters are searched.
        param_grid: Mapping of parameter name to the list of values to try.
        n_jobs: Number of parallel jobs passed to ``GridSearchCV``.
        cv: Cross-validation setting passed to ``GridSearchCV``.

    Returns:
        A ``(best_score, best_params)`` tuple taken from the fitted search.
    """
    # `sklearn.grid_search` was removed in scikit-learn 0.20; prefer its
    # successor and fall back to the legacy module on very old installations.
    try:
        from sklearn.model_selection import GridSearchCV
    except ImportError:
        from sklearn.grid_search import GridSearchCV

    # `iid` is intentionally not passed: True was the legacy default, and the
    # parameter no longer exists in current scikit-learn releases.
    # refit=False: only the search statistics are used, so no final model is
    # retrained on the whole input.
    model = GridSearchCV(
        estimator=est,
        param_grid=param_grid,
        verbose=10,
        n_jobs=n_jobs,
        refit=False,
        cv=cv,
    )
    model.fit(train_x, train_y)

    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set: %s" % (model.best_params_,))
    return model.best_score_, model.best_params_

# Candidate hyper-parameters: three forest sizes x two split criteria,
# i.e. six combinations, each evaluated with 3-fold cross-validation.
param_grid = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
}

# NOTE(review): the search runs on the validation slice rather than on the
# training slice -- presumably to keep the run time down; confirm this is intended.
(best_score, best_params) = search_model(
    _val_data,
    _val_labels,
    RandomForestClassifier(),
    param_grid,
    n_jobs=1,
    cv=3,
)

# %-formatting replaces the Python 2 only print statement and produces the
# same "score params" line on Python 2 and Python 3.
print("%s %s" % (best_score, best_params))

# 11. 用调优后的参数重新训练并预测

In [None]:
# Retrain on the complete training set with fixed hyper-parameters.
# NOTE(review): the values are hard-coded -- presumably the best combination
# reported by the grid search; verify against `best_params` after a run.
clf = RandomForestClassifier(n_estimators=100, criterion='gini')
clf.fit(train_data, train_labels)
predictions = clf.predict(test_data)

# ImageId is the 1-based row number of each test sample. It is derived from the
# number of predictions instead of assuming exactly 28000 test rows.
image_ids = np.arange(1, len(predictions) + 1)
submission = pd.DataFrame({"ImageId": image_ids, "Label": predictions})
submission.to_csv('submission_tuned.csv', index=False)
print("Finished.")