In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
import numpy as np
from matplotlib import font_manager, rc
import matplotlib as mpl
import matplotlib.pyplot as plt
import platform

# Work around broken Korean glyphs and broken minus signs in matplotlib figures.
if platform.system() != 'Windows':
    # Every non-Windows platform is pointed at the AppleGothic family.
    rc('font', family='AppleGothic')
else:
    # Look up the family name stored in the Malgun Gothic font file.
    font_name = font_manager.FontProperties(
        fname="c:/Windows/Fonts/malgun.ttf"
    ).get_name()
    rc('font', family=font_name)

# Disable the Unicode minus glyph for axis tick labels.
mpl.rcParams['axes.unicode_minus'] = False


In [None]:
# 데이터 준비

In [None]:
# Read the 'train' and 'test' sheets of the same workbook into two frames.
train_df, test_df = (
    pd.read_excel('examples/carprice.xlsx', sheet_name=sheet)
    for sheet in ('train', 'test')
)


In [None]:
# Preview the first rows of the training sheet.
train_df.head()

In [None]:
# Summary statistics of the training sheet's numeric columns.
train_df.describe()

In [None]:
# Separate the target column '가격' from the remaining feature columns.
y_train, y_test = train_df['가격'], test_df['가격']
x_train = train_df.drop(columns=['가격'])
x_test = test_df.drop(columns=['가격'])


In [None]:
# Count how many training rows carry each distinct '변속기' value.
x_train['변속기'].value_counts()

In [None]:
from sklearn.compose import ColumnTransformer

# One-hot encode the three categorical columns; every other column is passed
# through unchanged and appended to the right of the encoded columns.
c_t = ColumnTransformer([('cat', OneHotEncoder(), ['종류', '연료', '변속기']),],
                        remainder='passthrough')

# Fit the encoder on the training split only, then reuse that fitted encoder
# for the test split. Re-fitting on the test split would let its own category
# set decide the output columns, so train and test columns could disagree.
x_train_c = c_t.fit_transform(x_train)
x_test_c = c_t.transform(x_test)

In [None]:
# Build the column labels from the fitted transformer instead of a hand-typed
# list. OneHotEncoder emits one column per category in sorted order, so a
# hand-typed order can label the wrong column and breaks if a category is
# missing or added.
categorical_cols = ['종류', '연료', '변속기']
onehot_cols = [
    str(category)
    for categories in c_t.named_transformers_['cat'].categories_
    for category in categories
]
# remainder='passthrough' columns follow the encoded ones in their original order.
passthrough_cols = [col for col in x_train.columns if col not in categorical_cols]
columns = onehot_cols + passthrough_cols

x_train_new = pd.DataFrame(x_train_c, columns=columns)
x_test_new = pd.DataFrame(x_test_c, columns=columns)

In [None]:
from sklearn import metrics
def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R^2 for a set of regression predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions, aligned element-wise with ``true``.

    Returns:
        None. The four metrics are written to stdout, one per line.
    """
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    # Derive RMSE from the MSE already computed instead of scoring a second time.
    rmse = np.sqrt(mse)
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
   

In [None]:
# LinearRegression, Ridge, Lasso, ElasticNet, LinearRegression-Poly, SGDRegressor

In [None]:
# IPython help lookup for LinearRegression (development aid only).
LinearRegression?

In [None]:
# Baseline: ordinary least squares on the encoded features, evaluated on the
# test split.
model = LinearRegression().fit(x_train_new, y_train)
test_pred = model.predict(x_test_new)
print_evaluate(y_test, test_pred)


In [None]:
# Lasso에 의한 훈련
Lasso?

In [None]:
from sklearn.linear_model import Lasso

# Lasso with the library-default alpha, evaluated on the test split.
model = Lasso(random_state=42).fit(x_train_new, y_train)
test_pred = model.predict(x_test_new)
print_evaluate(y_test, test_pred)


In [None]:
# Ridge에 의한 훈련

In [None]:
from sklearn.linear_model import Ridge

# Ridge with the library-default alpha, evaluated on the test split.
model = Ridge().fit(x_train_new, y_train)
test_pred = model.predict(x_test_new)
print_evaluate(y_test, test_pred)


In [None]:
# ElasticNet에 의한 훈련

In [None]:
from sklearn.linear_model import ElasticNet

# ElasticNet with alpha=0.1 (default l1_ratio), evaluated on the test split.
model = ElasticNet(alpha=0.1).fit(x_train_new, y_train)
test_pred = model.predict(x_test_new)
print_evaluate(y_test, test_pred)


In [None]:
# Polynomial Regression에 의한 훈련

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Expand the encoded features into polynomial terms up to degree 3.
# The expansion is fitted on the training split and reused for the test split.
poly_reg = PolynomialFeatures(degree=3)
x_train_2_d = poly_reg.fit_transform(x_train_new)
x_test_2_d = poly_reg.transform(x_test_new)

# `LinearRegression(normalize=True)` was removed in scikit-learn 1.2 and now
# raises TypeError. Scaling each column to unit variance inside a pipeline is
# the replacement the scikit-learn deprecation notice gives for reproducing
# the old behaviour.
model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
model.fit(x_train_2_d, y_train)
test_pred = model.predict(x_test_2_d)
print_evaluate(y_test, test_pred)

In [None]:
# SGDRegressor에 의한 훈련

In [None]:
from sklearn.linear_model import SGDRegressor

# Seeded SGD regressor on the encoded features, evaluated on the test split.
# NOTE(review): the features are not scaled at this point; presumably this is
# deliberate, to motivate the scaling section that follows. Confirm.
model = SGDRegressor(random_state=42).fit(x_train_new, y_train)
test_pred = model.predict(x_test_new)
print_evaluate(y_test, test_pred)


### 데이터 정규화 (StandardScaler를 이용한 표준화)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Single-step pipeline that standardises every column.
pipeline = Pipeline([
      ('std_scalar', StandardScaler())
])

# Learn the scaling statistics from the training split only and reuse them for
# the test split. Re-fitting on the test split would scale it with its own
# statistics, putting the two splits on different scales.
x_train_r = pipeline.fit_transform(x_train_c)
x_test_r = pipeline.transform(x_test_c)


In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Degree-3 polynomial expansion. As in the original cell, this is built from
# the unscaled encoded frames rather than from x_train_r / x_test_r.
poly_reg = PolynomialFeatures(degree=3)
x_train_2_d = poly_reg.fit_transform(x_train_new)
x_test_2_d = poly_reg.transform(x_test_new)

# `normalize=True` was removed from the linear models in scikit-learn 1.2.
# The deprecation notice gives these replacements for the old behaviour:
#   * put StandardScaler(with_mean=False) in front of the estimator, and
#   * for Ridge, multiply the original alpha by the number of training samples.
n_samples = x_train_r.shape[0]

# (label to print, estimator, training features, test features)
experiments = [
    ("\nLinearRegression ", LinearRegression(), x_train_r, x_test_r),
    ("\nLasso ", Lasso(alpha=0.1), x_train_r, x_test_r),
    ("\nRidge ",
     make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=0.01 * n_samples)),
     x_train_r, x_test_r),
    ("\nElasticNet ", ElasticNet(alpha=0.1, l1_ratio=0.9), x_train_r, x_test_r),
    ("\nPolynomialRegression ",
     make_pipeline(StandardScaler(with_mean=False), LinearRegression()),
     x_train_2_d, x_test_2_d),
    # Seeded so the stochastic optimiser gives the same numbers on every run.
    ("\nSGDRegressor ", SGDRegressor(random_state=42), x_train_r, x_test_r),
]

# Fit each estimator on the training split and report its test-split metrics.
for label, model, features_train, features_test in experiments:
    model.fit(features_train, y_train)
    test_pred = model.predict(features_test)
    print(label)
    print_evaluate(y_test, test_pred)

In [None]:
# IPython help lookup for ElasticNet (development aid only).
ElasticNet?

### 로그 변환

In [None]:
import seaborn as sns

# Distribution of the test-split target. `sns.distplot` is deprecated; the
# seaborn migration guidance maps it to a density histogram with a KDE overlay.
sns.histplot(y_test, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# log(1 + y) transform of the target for both splits.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

In [None]:
# Fit LinearRegression, Lasso and Ridge against the log-transformed target and
# print each estimator's `score` on the test split, in that order.
for model in (LinearRegression(), Lasso(alpha=1.0), Ridge(alpha=1.0)):
    model.fit(x_train_new, y_train_log)
    print(model.score(x_test_new, y_test_log))

In [None]:
# Log-transform the '하이브리드' feature.
# Apply the same transform to both splits. Transforming only the training
# frame would make the fitted coefficient meaningless on the untransformed
# test column.
# NOTE: this overwrites the column in place, so re-running the cell applies
# log1p a second time.
x_train_new['하이브리드'] = np.log1p(x_train_new['하이브리드'])
x_test_new['하이브리드'] = np.log1p(x_test_new['하이브리드'])

In [None]:
# Same three estimators against the log-transformed target, now with
# alpha=0.01 for Lasso and alpha=10 for Ridge. Prints each test-split `score`.
for model in (LinearRegression(), Lasso(alpha=0.01), Ridge(alpha=10)):
    model.fit(x_train_new, y_train_log)
    print(model.score(x_test_new, y_test_log))

### 특성 선택

- 선형회귀의 가정(선형성, 독립성, 등분산성, 정규성) 중 선형성 특성 분석

In [None]:
# Pairwise scatter plots of the target ('가격') against the numeric training columns.
sns.pairplot(train_df[['가격', '년식', '연비', '마력', '토크', '하이브리드', '배기량', '중량']])

In [None]:
# Start from the full feature set. These are aliases of the encoded frames,
# not copies.
x_train_s, x_test_s = x_train_new, x_test_new

In [None]:
# Fit LinearRegression, Lasso and Ridge on the selected features against the
# untransformed target and print each test-split `score`.
for model in (LinearRegression(), Lasso(alpha=1.0), Ridge(alpha=1.0)):
    model.fit(x_train_s, y_train)
    print(model.score(x_test_s, y_test))

- 선형 특성에 위배되는 하이브리드 속성 제거

In [None]:
# Feature selection: exclude the '하이브리드' column from both splits. `drop`
# returns new frames, so the encoded frames are left untouched.
x_train_s = x_train_new.drop(columns='하이브리드')
x_test_s = x_test_new.drop(columns='하이브리드')

In [None]:
# Fit LinearRegression, Lasso (alpha=0.01) and Ridge (alpha=10) on the reduced
# feature set against the log-transformed target and print each test-split `score`.
for model in (LinearRegression(), Lasso(alpha=0.01), Ridge(alpha=10)):
    model.fit(x_train_s, y_train_log)
    print(model.score(x_test_s, y_test_log))

### 하이퍼파라미터 튜닝

In [None]:
# Candidate regularisation strengths, from strongest to weakest.
alphas = [1000,100, 10, 1, 0.1, 0.01, 0.001, 0.0001]

In [None]:
# Sweep Ridge over the candidate alphas and print the test-split `score` of each.
# NOTE(review): alpha is being chosen on the test split; a validation split or
# cross-validation would keep the test score unbiased.
for alpha in alphas:
    model = Ridge(alpha=alpha, random_state=42).fit(x_train_new, y_train_log)
    print(alpha, " : ", model.score(x_test_new, y_test_log))

In [None]:
# Sweep Lasso over the candidate alphas and print the test-split `score` of each.
# NOTE(review): alpha is being chosen on the test split; a validation split or
# cross-validation would keep the test score unbiased.
for alpha in alphas:
    model = Lasso(alpha=alpha, random_state=42).fit(x_train_new, y_train_log)
    print(alpha, " : ", model.score(x_test_new, y_test_log))

In [None]:
# Grid for ElasticNet: candidate alphas and candidate l1_ratio values.
# This rebinds `alphas`, replacing the longer list used by the earlier sweeps.
alphas = [1, 0.1, 0.01, 0.001, 0.0001]
ratios = [0.2, 0.5, 0.8]

In [None]:
# Try every (alpha, l1_ratio) pair with ElasticNet, alpha varying slowest, and
# print the test-split `score` of each fit.
# NOTE(review): the hyperparameters are being chosen on the test split; a
# validation split or cross-validation would keep the test score unbiased.
for alpha, ratio in ((a, r) for a in alphas for r in ratios):
    model = ElasticNet(alpha=alpha, l1_ratio=ratio, random_state=42)
    model.fit(x_train_new, y_train_log)
    print(alpha, ",", ratio, ": ", model.score(x_test_new, y_test_log))