## 코드

# Import

In [84]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, RidgeCV, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

def seed_everything(seed):
    """Seed every random source this notebook relies on.

    Seeds Python's built-in ``random`` module and NumPy's global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    so setting it here presumably only affects child processes -- confirm.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)


seed_everything(42)  # fix the seed so runs are repeatable

# Data Load

In [85]:
# Load the training table (path is relative to the working directory).
train_df = pd.read_csv('train.csv')

# Split the columns by label: columns whose name matches the regex 'X' are
# the model inputs (X features), columns matching 'Y' are the targets
# (Y features).
train_x, train_y = (train_df.filter(regex=pattern) for pattern in ('X', 'Y'))

# Regression & Inference

In [86]:
# Candidate regularisation strengths handed to the ridge search below.
alphas = [0, 0.001, 0.01, 0.1, 1]

# Features to predict for the submission: test.csv without its 'ID' column.
test_x = pd.read_csv('./test.csv')
test_x = test_x.drop(columns=['ID'])

# Keep 20% of the labelled rows aside for local evaluation; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [87]:
# RidgeCV receives the candidate alpha values as a list and searches for the
# best one internally.
# cv: cross-validation -> the data is split into k parts (k=5 here) and each
# candidate is validated on every part; the model with the highest validation
# score is adopted.
ridgecv = RidgeCV(alphas=alphas, cv=5).fit(X_train, y_train)

# y_pred: predictions for the held-out split (used by the evaluation cell).
# preds:  predictions for test.csv (used to fill the submission).
y_pred, preds = ridgecv.predict(X_test), ridgecv.predict(test_x)
preds  # bare expression so the notebook displays the prediction array

array([[  1.45568687,   1.18434903,   1.13426587, ..., -26.07647958,
        -26.06846779, -26.0917598 ],
       [  1.49821564,   1.21317316,   1.14751628, ..., -26.06053514,
        -26.0558639 , -26.07456222],
       [  1.39973159,   1.06836143,   1.03095293, ..., -26.00904263,
        -25.99950124, -26.00474151],
       ...,
       [  1.32047814,   0.99507484,   1.00112669, ..., -26.45976906,
        -26.45055743, -26.45705415],
       [  1.33256853,   1.00487716,   1.02319434, ..., -26.44381131,
        -26.44201944, -26.45184721],
       [  1.38467239,   1.05871297,   1.06304636, ..., -26.42904244,
        -26.42544328, -26.42798179]])

# Evaluation

In [88]:
# Score the held-out predictions against the held-out targets.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# MSE is a unitless error, not a currency amount, and here it is of order 1,
# so print it with four decimals (same precision as R2) instead of a
# '$'-prefixed value rounded to an integer.
print(f'Test MSE: {mse:,.4f}')
print(f'R2 Score: {r2:,.4f}\n')

print(f'alpha: {ridgecv.alpha_}')  # alpha finally selected by RidgeCV
# Score obtained with the selected alpha during cross-validation
# (R^2 of the model's predictions with respect to y).
print(f'cv best score: {ridgecv.best_score_}')

Test MSE: $1
R2 Score: 0.0338

alpha: 0.1
cv best score: 0.0366188672785799


# Submit

In [89]:
# Load the sample submission; it is used as the template whose non-'ID'
# columns are overwritten with the model predictions.
submit = pd.read_csv('./sample_submission.csv')

In [90]:
# Copy the predictions into the submission frame, one target column at a time.
# Target columns are selected by name (everything except 'ID'), so the pairing
# with the columns of `preds` does not depend on 'ID' being the first column;
# a positional `idx - 1` offset would silently misalign the columns otherwise.
# NOTE(review): assumes the submission's target columns are in the same order
# as the columns of `preds` -- confirm against sample_submission.csv.
target_cols = [col for col in submit.columns if col != 'ID']
if len(target_cols) != preds.shape[1]:
    # Fail loudly instead of leaving columns unfilled or dropping predictions.
    raise ValueError(
        f'submission has {len(target_cols)} target columns '
        f'but preds has {preds.shape[1]}'
    )
for col, col_preds in zip(target_cols, preds.T):
    submit[col] = col_preds
print('Done.')

Done.


In [91]:
# Write the filled submission to disk; index=False leaves the DataFrame
# index out of the CSV.
submit.to_csv('./submit_Ridge.csv', index=False)