### Overfit Problem


In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from math import sqrt

import numpy as np
import pandas as pd



In [2]:
# Relative location of the admissions CSV.
ADMISSION_FILE_PATH = '../resources/admission_data.csv'
# Read the file and discard the 'Serial No.' column before any modelling.
admission_df = pd.read_csv(ADMISSION_FILE_PATH).drop(columns=['Serial No.'])

In [32]:
# Preview the first rows of the loaded admissions frame.
admission_df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [4]:
# Summarise the columns: names, non-null counts and dtypes.
admission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          500 non-null    int64  
 1   TOEFL Score        500 non-null    int64  
 2   University Rating  500 non-null    int64  
 3   SOP                500 non-null    float64
 4   LOR                500 non-null    float64
 5   CGPA               500 non-null    float64
 6   Research           500 non-null    int64  
 7   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 31.4 KB


In [27]:
# Separate the predictors from the target column. The trailing space in
# 'Chance of Admit ' is part of the column name used throughout this notebook.
X = admission_df.drop(['Chance of Admit '], axis=1)

# Expand the predictors into polynomial terms up to degree 6.
polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(X.values)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed the older
# get_feature_names(); prefer the new accessor and fall back on old installs so
# the cell runs on either.
if hasattr(polynomial_transformer, 'get_feature_names_out'):
    features = polynomial_transformer.get_feature_names_out(X.columns)
else:
    features = polynomial_transformer.get_feature_names(X.columns)

In [47]:
# Display the transformed feature matrix (numpy abbreviates the printout).
polynomial_features

array([[  1.    , 337.    , 118.    , ...,  93.1225,   9.65  ,   1.    ],
       [  1.    , 324.    , 107.    , ...,  78.6769,   8.87  ,   1.    ],
       [  1.    , 316.    , 104.    , ...,  64.    ,   8.    ,   1.    ],
       ...,
       [  1.    , 330.    , 120.    , ...,  91.3936,   9.56  ,   1.    ],
       [  1.    , 312.    , 103.    , ...,   0.    ,   0.    ,   0.    ],
       [  1.    , 327.    , 113.    , ...,   0.    ,   0.    ,   0.    ]])

In [29]:
# Wrap the expanded matrix in a DataFrame labelled with the generated term names.
X = pd.DataFrame(data=polynomial_features, columns=features)

In [36]:
# Keep the target as a single-column DataFrame, then preview it.
y = admission_df.loc[:, ['Chance of Admit ']]
y.head()

Unnamed: 0,Chance of Admit
0,0.92
1,0.76
2,0.72
3,0.8
4,0.65


In [43]:
# Hold out 30% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

In [44]:
# Plain linear regression fitted on the polynomial training features.
# The fit call stays as the last expression so the estimator repr is displayed.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [45]:
# Score both partitions with the fitted model.
y_train_predict, y_test_predict = (
    model.predict(partition) for partition in (X_train, X_test)
)

In [46]:
# Report the root-mean-squared error on the training split, then on the test split.
for title, actual, predicted in (
    ("Training Set에서의 성능", y_train, y_train_predict),
    ("Test Set에서의 성능", y_test, y_test_predict),
):
    mse = mean_squared_error(actual, predicted)

    print(title)
    print("-------------------")
    print(sqrt(mse))

# The two scores come out almost the same; this differs from the lecture's result.

Training Set에서의 성능
-------------------
0.05980162567151887
Test Set에서의 성능
-------------------
0.05966452891517713


### scikit-learn으로 과적합 문제 해결해 보기(Lasso, Ridge Model)

In [56]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from math import sqrt

import numpy as np
import pandas as pd

In [49]:
# Relative location of the admissions CSV.
ADMISSION_FILE_PATH = '../resources/admission_data.csv'
# Reload the file and discard the 'Serial No.' column.
admission_df = pd.read_csv(ADMISSION_FILE_PATH).drop(columns=['Serial No.'])

In [50]:
# Preview the first rows of the reloaded admissions frame.
admission_df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [51]:
# Separate the predictors from the target column. The trailing space in
# 'Chance of Admit ' is part of the column name used throughout this notebook.
X = admission_df.drop(['Chance of Admit '], axis=1)

# Expand the predictors into polynomial terms up to degree 6.
polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(X.values)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed the older
# get_feature_names(); prefer the new accessor and fall back on old installs so
# the cell runs on either.
if hasattr(polynomial_transformer, 'get_feature_names_out'):
    features = polynomial_transformer.get_feature_names_out(X.columns)
else:
    features = polynomial_transformer.get_feature_names(X.columns)

In [53]:
# Labelled frame of the expanded features, plus the target as a one-column frame.
X = pd.DataFrame(data=polynomial_features, columns=features)
y = admission_df.loc[:, ['Chance of Admit ']]

In [54]:
# Same 70/30 split and seed as the unregularised experiment above.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

In [59]:
# Parameters: alpha is the regularization strength (lambda); max_iter caps the
# number of optimization iterations (the original note calls this gradient
# descent); normalize=True asks the estimator to rescale the features itself.
# NOTE(review): the original note described that rescaling as "to values between
# 0 and 1" -- confirm against the scikit-learn documentation for `normalize`.
# NOTE(review): `normalize` appears to have been deprecated in scikit-learn 1.0
# and removed in 1.2; on newer versions scale the features in a separate
# preprocessing step instead -- confirm against the installed version.
# To use Ridge instead, swap only the class name here; the parameters are the same.
model = Lasso(alpha=0.001, max_iter = 1000, normalize=True) 
model.fit(X_train, y_train)

Lasso(alpha=0.001, normalize=True)

In [60]:
# Score both partitions with the fitted Lasso model.
y_train_predict, y_test_predict = (
    model.predict(partition) for partition in (X_train, X_test)
)

이렇게 복잡한 6차 다항 특성을 써도, 세타(θ) 값이 커지는 것을 막아 주기 때문에 과적합이 상당히 줄어든다.

In [61]:
# Report the root-mean-squared error on the training split, then on the test split.
for title, actual, predicted in (
    ("Training Set에서의 성능", y_train, y_train_predict),
    ("Test Set에서의 성능", y_test, y_test_predict),
):
    mse = mean_squared_error(actual, predicted)

    print(title)
    print("-------------------")
    print(sqrt(mse))

# The two scores come out almost the same; this differs from the lecture's result.

Training Set에서의 성능
-------------------
0.06336620966147144
Test Set에서의 성능
-------------------
0.06007719092689257


### K-Fold Validation

In [65]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import numpy as np
import pandas as pd

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [66]:
# Load the iris dataset object bundled with sklearn.datasets.
iris_data = datasets.load_iris()

In [68]:
# Features as a labelled frame; class labels as a one-column frame named 'Class'.
X = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
y = pd.DataFrame({'Class': iris_data.target})

In [69]:
# Logistic-regression classifier with max_iter set to 2000.
logistic_model = LogisticRegression(max_iter = 2000)

In [71]:
# Normally a train_test_split would happen at this point, but k-fold
# cross-validation is used here instead, so it is not needed.
# y is flattened to a 1-D label array before scoring.
cross_val_score(estimator=logistic_model, X=X, y=y.to_numpy().ravel(), cv=5)  # 5-fold cross-validation

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [72]:
# Averaging the per-fold scores gives the overall score of the logistic model.
fold_scores = cross_val_score(logistic_model, X, y.values.ravel(), cv=5)
np.average(fold_scores)

0.9733333333333334

### Grid Search

In [77]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

from math import sqrt

import numpy as np
import pandas as pd

In [91]:
# Data preparation
ADMISSION_FILE_PATH = '../resources/admission_data.csv'
# Drop 'Serial No.' (presumably just a row identifier) exactly as the earlier
# sections of this notebook do; otherwise it is expanded into polynomial terms
# and handed to the model as if it were a predictor.
admission_df = pd.read_csv(ADMISSION_FILE_PATH).drop('Serial No.', axis=1)

# Predictors only; the trailing space in 'Chance of Admit ' is part of the name.
X = admission_df.drop(['Chance of Admit '], axis=1)
polynomial_transformer = PolynomialFeatures(2)
polynomial_features = polynomial_transformer.fit_transform(X.values)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed the older
# get_feature_names(); prefer the new accessor and fall back on old installs so
# the cell runs on either.
if hasattr(polynomial_transformer, 'get_feature_names_out'):
    features = polynomial_transformer.get_feature_names_out(X.columns)
else:
    features = polynomial_transformer.get_feature_names(X.columns)

X = pd.DataFrame(polynomial_features, columns=features)
y = admission_df[['Chance of Admit ']]

In [92]:
# Candidate values to search for each hyperparameter of the Lasso estimator.
hyper_parameter = dict(
    alpha=[0.01, 0.1, 1, 10],
    max_iter=[100, 500, 1000, 1500, 2000],
)

In [93]:
# Base Lasso estimator constructed with its default arguments.
lasso_model = Lasso()

In [95]:
# cv sets how many folds the k-fold validation uses when scoring each
# hyperparameter combination.
hyper_parameter_tuner = GridSearchCV(
    estimator=lasso_model,
    param_grid=hyper_parameter,
    cv=5,
)
hyper_parameter_tuner.fit(X, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [0.01, 0.1, 1, 10],
                         'max_iter': [100, 500, 1000, 1500, 2000]})

In [96]:
# Hyperparameter combination that scored best in the grid search.
hyper_parameter_tuner.best_params_

{'alpha': 1, 'max_iter': 100}