### 【問題1】train_test_splitのスクラッチ
スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。以下の雛形をベースとして関数を完成させてください。

In [86]:
import numpy as np
import random
from math import ceil, floor

def train_test_split_handmade(X, y, train_size = 0.8):
    """Randomly split features and targets into train and test subsets.

    Rows are shuffled with the standard-library ``random`` module (so
    ``random.seed`` controls reproducibility); the first
    ``ceil(train_size * n_samples)`` shuffled rows form the train set and
    the remaining rows form the test set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; converted to a float64 array.
    y : array-like with exactly n_samples values
        Targets, either 1-D or a single column of shape (n_samples, 1);
        converted to a 1-D float64 array.
    train_size : float, default 0.8
        Fraction of the samples placed in the train set, within [0, 1].

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        NOTE: this order differs from scikit-learn's ``train_test_split``
        (which returns X_train, X_test, y_train, y_test).

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, if ``X`` and ``y`` disagree on the number of
        samples, or if ``train_size`` lies outside [0, 1].
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    if X.ndim != 2:
        raise ValueError('X must be 2-dimensional (n_samples, n_features)')

    n_samples = X.shape[0]

    # Raise instead of `return print(...)`: the old code returned None, which
    # made the caller's 4-way unpacking fail with an unrelated TypeError.
    if y.ndim == 0 or y.shape[0] != n_samples:
        raise ValueError('X, y must be same shape')

    if not 0 <= train_size <= 1:
        raise ValueError('train_size must be between 0 and 1')

    # Flatten a single-column target so every label is a true scalar; this
    # raises ValueError when y holds more than one value per sample.
    y = y.reshape(n_samples)

    train_size_num = ceil(train_size * n_samples)

    # A full-length sample without replacement is a random permutation.
    random_index = np.array(random.sample(range(n_samples), n_samples), dtype=int)
    train_index = random_index[:train_size_num]
    test_index = random_index[train_size_num:]

    # Indexing with an integer array copies the selected rows in one step.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    return X_train, y_train, X_test, y_test

### 【問題2】 分類問題を解くコードの作成
上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

### ・使うモデル
#### ロジスティック回帰
#### SVM
#### 決定木

In [87]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# The task asks for logistic regression. SGDClassifier's default loss is
# 'hinge', which fits a linear SVM, so use LogisticRegression explicitly.
lreg = linear_model.LogisticRegression(max_iter=1000)
# RBF-kernel support vector classifier.
svc = SVC(gamma='auto')
# Decision tree with a fixed seed so its tie-breaking is reproducible.
dtc = DecisionTreeClassifier(random_state=0)

### データ　３個

### iris_data　versicolorとvirginicaを使う

In [50]:
from sklearn.datasets import load_iris
# Load the iris dataset bundled with scikit-learn; the result is indexed by key below.
data = load_iris()

In [56]:
# List the fields available on the loaded iris dataset object.
data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [58]:
# Show the class names stored alongside the integer targets.
data['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [72]:
# Keep rows 50-149 only, turning iris into a two-class problem
# (presumably the versicolor and virginica samples, per the heading above).
X_iris, y_iris = data['data'][50:150], data['target'][50:150]

### シンプルデータセット

In [73]:
import numpy as np

# Fix the global NumPy seed so the generated dataset is reproducible.
np.random.seed(seed=0)
n_samples = 500

# Class means; f0/f1 are rebound below to the samples drawn around them.
f0 = [-1, 2]
f1 = [2, -1]
# Shared covariance: unit variances with 0.8 covariance between the two features.
cov = [[1.0,0.8], [0.8, 1.0]]

# Draw half of the samples from each Gaussian.
f0 = np.random.multivariate_normal(f0, cov, n_samples // 2)
f1 = np.random.multivariate_normal(f1, cov, n_samples // 2)
X = np.concatenate((f0, f1))

# Labels: +1 for the first cluster, -1 for the second.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the equivalent dtype.
y = np.concatenate((np.ones(n_samples // 2), -np.ones(n_samples // 2))).astype(int)

# Shuffle features and labels with the same permutation so pairs stay aligned.
random_index = np.random.permutation(np.arange(n_samples))
X_simp1 = X[random_index]
y_simp1 = y[random_index]

### シンプルデータセット２

In [74]:
# Second toy dataset, written out literally: 40 samples with 2 features each.
X_simp2 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
# Binary labels aligned with the rows above: a run of 0s followed by a run of 1s.
y_simp2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [88]:
# Datasets to evaluate: feature arrays, matching targets and display names,
# kept in the same order across the three lists.
print_data_list = ['Iris_data', 'Simple1_data', 'Simple2_data']
data_list = [X_iris, X_simp1, X_simp2]
target_list = [y_iris, y_simp1, y_simp2]

# Classifiers to compare, plus the label printed in front of each accuracy.
print_model_list = ['Liner_classfier = ', 'SVC = ' , 'Decition_tree = ' ]
model_list = [lreg, svc, dtc]

In [89]:
# Fit and score every classifier on every dataset, drawing a fresh random
# 70/30 split for each (dataset, model) pair.
for X_data, y_data, print_data in zip(data_list, target_list, print_data_list):
    print(print_data)
    # Pair each model with its label from print_model_list; the previous
    # name `print_list` is never defined and raises NameError on a fresh kernel.
    for model, printer in zip(model_list, print_model_list):
        X_train, y_train, X_test, y_test = train_test_split_handmade(X_data, y_data, train_size = 0.7)
        # The fourth value is the test-target shape, so label it y_test_shape.
        print('X_train_shape  ' + str(X_train.shape), 'y_train_shape  ' + str(y_train.shape),
               '\n X_test_shape  ' + str(X_test.shape), 'y_test_shape  ' + str(y_test.shape))
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print('ACCURACY    ' + printer + str(accuracy_score(y_test, y_pred)))
        print()
    print()
    print()

Iris_data
X_train_shape  (70, 4) y_train_shape  (70,) 
 X_test_shape  (30, 4) y_train_shape  (30,)
ACCURACY    Liner_classfier = 0.9333333333333333

X_train_shape  (70, 4) y_train_shape  (70,) 
 X_test_shape  (30, 4) y_train_shape  (30,)
ACCURACY    SVC = 1.0

X_train_shape  (70, 4) y_train_shape  (70,) 
 X_test_shape  (30, 4) y_train_shape  (30,)
ACCURACY    Decition_tree = 0.9333333333333333



Simple1_data
X_train_shape  (350, 2) y_train_shape  (350,) 
 X_test_shape  (150, 2) y_train_shape  (150,)
ACCURACY    Liner_classfier = 1.0

X_train_shape  (350, 2) y_train_shape  (350,) 
 X_test_shape  (150, 2) y_train_shape  (150,)
ACCURACY    SVC = 1.0

X_train_shape  (350, 2) y_train_shape  (350,) 
 X_test_shape  (150, 2) y_train_shape  (150,)
ACCURACY    Decition_tree = 1.0



Simple2_data
X_train_shape  (28, 2) y_train_shape  (28,) 
 X_test_shape  (12, 2) y_train_shape  (12,)
ACCURACY    Liner_classfier = 0.5833333333333334

X_train_shape  (28, 2) y_train_shape  (28,) 
 X_test_shape  (12

### 【問題3】 回帰問題を解くコードの作成
線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [90]:
import pandas as pd

In [91]:
# Load the House Prices training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the file exists at exactly this location.
df = pd.read_csv('/Users/user/Downloads/house-prices-advanced-regression-techniques/train.csv')

In [93]:
# Inspect the available column names before choosing features.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [98]:
# Select the two explanatory columns as a DataFrame. Plain column selection
# replaces pd.concat(..., 1), whose positional `axis` argument newer pandas rejects.
X_df = df[['YearBuilt', 'GrLivArea']]

In [105]:
# Preview the first rows of the selected feature columns.
X_df.head()

Unnamed: 0,YearBuilt,GrLivArea
0,2003,1710
1,1976,1262
2,2001,1786
3,1915,1717
4,2000,2198


In [102]:
# Target as a single-column DataFrame (the list selector keeps it 2-D).
y_df = df.loc[:, ['SalePrice']]

In [104]:
# Preview the first rows of the target column.
y_df.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


In [110]:
# Convert the feature and target frames to plain NumPy arrays.
X_np, y_np = (frame.to_numpy() for frame in (X_df, y_df))


In [112]:
from sklearn.metrics import mean_squared_error

In [114]:
# Log-transform the features and the target before fitting.
X_log = np.log(X_np)
# y_np comes from a single-column DataFrame, so it is 2-D with shape (n, 1).
# Flatten it to 1-D so every target is a scalar: the split helper stores
# targets element by element, and assigning a length-1 array to a scalar
# slot is deprecated in recent NumPy.
y_log = np.log(y_np).ravel()

X_train, y_train, X_test, y_test = train_test_split_handmade(X_log, y_log, train_size = 0.7)

# Linear regression trained by stochastic gradient descent.
# Note: this rebinds `lreg`, which earlier referred to the linear classifier.
lreg = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
lreg.fit(X_train, y_train)
y_pred = lreg.predict(X_test)
# The error is measured on the log-transformed target, not in original price units.
print('MSE=  '  + str(mean_squared_error(y_test, y_pred)))

MSE=  0.07464920994406987
