# 問題1：train_test_splitのスクラッチ

In [11]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import KFold
from sklearn.grid_search import GridSearchCV

In [3]:
def scratch_train_test_split(X, y, train_size=0.8,):
    """
    検証用データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      学習データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    
    X_train_index = int(len(X)*train_size)
    y_train_index = int(len(y)*train_size)
    
    X_train = X[0:X_train_index, :]
    X_test = X[X_train_index:, :]
    y_train = y[0:y_train_index]
    y_test = y[y_train_index:]

    return X_train, X_test, y_train, y_test

In [10]:
# dummy ndarray
dummy_ndarray_X = np.array([[0, 1, 2],
                            [2, 4, 6],
                            [3, 6, 9],
                            [1, 1, 2],
                            [3, 1, 4],
                            [9, 8, 7],
                            [8, 4, 2],
                            [7, 1, 9],
                            [4, 0, 1],
                            [5, 3, 9]])

dummy_ndarray_y = np.array([2, 6, 9, 2, 4, 7, 2, 9, 1, 9])

X_train, X_test, y_train, y_test = scratch_train_test_split(dummy_ndarray_X, dummy_ndarray_y)

print("X_train = {}".format(X_train))
print("X_test = {}".format(X_test))
print("y_train = {}".format(y_train))
print("y_test = {}".format(y_test))

X_train = [[0 1 2]
 [2 4 6]
 [3 6 9]
 [1 1 2]
 [3 1 4]
 [9 8 7]
 [8 4 2]
 [7 1 9]]
X_test = [[4 0 1]
 [5 3 9]]
y_train = [2 6 9 2 4 7 2 9]
y_test = [1 9]


___
## 問題3：回帰問題を解くコードの作成

In [13]:
train_dataset = pd.read_csv("../Week3/application_train.csv")
test_dataset = pd.read_csv("../Week3/application_test.csv")

# Fill missings
train_dataset_filled = train_dataset.fillna(-1)
test_dataset_filled = test_dataset.fillna(-1)

# Replace categorical value to dummy value
for col in train_dataset_filled.columns:
    labels, uniques = pd.factorize(train_dataset_filled[col])
    train_dataset_filled[[col]] = labels

for col in test_dataset_filled.columns:
    labels, uniques = pd.factorize(test_dataset_filled[col])
    test_dataset_filled[[col]] = labels
    
train_X = train_dataset_filled.drop(["TARGET"], axis=1)
train_Y = train_dataset_filled["TARGET"]

# Create train data and test data
x_train, x_test, y_train, y_test = train_test_split(np.array(train_X), \
                                                    np.array(train_Y), \
                                                    train_size=0.70, test_size=0.30)

In [12]:
lr_model = LinearRegression()

In [14]:
lr_model = lr_model.fit(x_train, y_train)

In [15]:
pred = lr_model.predict(x_test)

In [16]:
fpr, tpr, thresholds = roc_curve(y_test, pred, pos_label=1)

In [17]:
auc(fpr, tpr)

0.6588709702609495