# Sprint-1 （Week4の続き）

___
## 問題２：学習と検証

### import section

In [17]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
from sklearn.grid_search import GridSearchCV



### read csv

In [2]:
train_dataset = pd.read_csv("../Week3/application_train.csv")
test_dataset = pd.read_csv("../Week3/application_test.csv")

# train_dataset.info()

### データの前処理

In [3]:
# Concatenate two dataset to treat as one
# train_test_df = pd.concat([train_dataset, test_dataset], axis=0, sort=False)

# Fill missings
train_dataset_filled = train_dataset.fillna(-1)
test_dataset_filled = test_dataset.fillna(-1)

# Replace categorical value to dummy value
for col in train_dataset_filled.columns:
    labels, uniques = pd.factorize(train_dataset_filled[col])
    train_dataset_filled[[col]] = labels

for col in test_dataset_filled.columns:
    labels, uniques = pd.factorize(test_dataset_filled[col])
    test_dataset_filled[[col]] = labels
                                     

### テスト用データの前処理

In [18]:
train_X = train_dataset_filled.drop(["TARGET"], axis=1)
train_Y = train_dataset_filled["TARGET"]

### トレーニング用とテスト用のデータ分割

In [19]:
# Create train data and test data from train_dataset_nomiss
x_train, x_test, y_train, y_test = train_test_split(np.array(train_X), \
                                                    np.array(train_Y), \
                                                    train_size=0.70, test_size=0.30)
                                                                                           

### ランダムフォレストのモデルによる評価（モデル作成が主目的）

In [50]:
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(x_train, y_train)
pred = clf.predict(x_test)
fpr, tpr, thresholds = roc_curve(y_test, pred, pos_label=1)
auc(fpr, tpr)

0.5054595790296003

In [51]:
accuracy_score(pred, y_test)

0.9160253213952783

### 本番のテストデータで評価

In [53]:
pred = clf.predict_proba(test_dataset_filled)

### Kaggle提出用のフォーマットによるファイル出力

In [75]:
pred_reshape = pred[:, 0:1].reshape(len(pred))

In [86]:
commit_format = pd.DataFrame({"SK_ID_CURR" : test_dataset["SK_ID_CURR"],
                             "TARGET" : pred_reshape})

In [87]:
commit_format.to_csv("test_output_1.csv", index=False)

# ☆☆☆ Sprint-1 ここから ☆☆☆

## 問題1：クロスバリデーション

In [6]:
kf = KFold(n_splits=5,random_state=42,shuffle=True)

In [13]:
train_X = train_dataset_filled.drop(["TARGET"], axis=1)
train_Y = train_dataset_filled["TARGET"]
accuracies = []

In [14]:
for train_index, test_index in kf.split(train_X):

    data_train   = train_X.iloc[train_index]
    target_train = train_Y.iloc[train_index]

    data_test    = train_X.iloc[test_index]
    target_test  = train_Y.iloc[test_index]

    # if needed, do preprocessing here

    model_rfc = RandomForestClassifier(random_state=0)
    model_rfc.fit(data_train,target_train)

    preds = model_rfc.predict(data_test)

    # accuracy for the current fold only    
    accuracy = accuracy_score(target_test,preds)

    accuracies.append(accuracy)

# this is the average accuracy over all folds
average_accuracy = np.mean(accuracies)
print(average_accuracy)

0.9159639804825745


## 問題2：グリッドサーチ

In [15]:
model = RandomForestClassifier(random_state=0)
tuned_parameters = {"n_estimators" : [120, 300, 500, 800, 1200]}

In [21]:
gscv = GridSearchCV(model, tuned_parameters)

In [22]:
gscv.fit(x_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [120, 300, 500, 800, 1200]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [27]:
gscv.score(x_test, y_test)

0.9189411841220977

In [28]:
gscv.best_params_

{'n_estimators': 300}