# 전체데이터 linear regression

In [130]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [131]:
#데이터 로드
train_data = pd.read_csv(r"C:\Users\sinha\data mining\dataset\data1_train.csv", encoding='cp949')
test_data = pd.read_csv(r"C:\Users\sinha\data mining\dataset\data1_test.csv", encoding='UTF-8')

In [132]:
#데이터 전처리
train_data['모기 개체수'] = train_data['모기 개체수'].str.replace(',', '').astype(int)
test_data['모기 개체수'] = test_data['모기 개체수'].str.replace(',', '').astype(int)

In [133]:
features = [
    '평균기온', '최저기온', '최고기온', '일강수량(mm)', '최대 풍속(m/s)', '평균 풍속(m/s)',
    '최소 상대습도(%)', '평균 상대습도(%)', '평균 현지기압(hPa)', '평균 해면기압(hPa)',
    '합계 일조시간(hr)', '합계 일사량(MJ/m2)', '평균 전운량(1/10)', '평균 지면온도','1일전 모기 개체수','2일전 모기 개체수','3일전 모기 개체수']

target = '모기 개체수'

In [134]:
#데이터 split
X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

In [135]:
#데이터 scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [136]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [137]:
y_train_hat = model.predict(X_train_scaled)
y_test_hat = model.predict(X_test_scaled)

In [138]:
print('Performance for TRAIN--------')
print('Train MSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)))
print('Train RMSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)**0.5))
print('Train R^2 : {:.4f}'.format(r2_score(y_train, y_train_hat)))

print('Performance for TEST--------')
print('Test MSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)))
print('Test RMSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)**0.5))
print('Test R^2 : {:.4f}'.format(r2_score(y_test, y_test_hat)))

Performance for TRAIN--------
Train MSE : 1611716.2491
Train RMSE : 1269.5339
Train R^2 : 0.6766
Performance for TEST--------
Test MSE : 130036.4291
Test RMSE : 360.6056
Test R^2 : 0.6464


# 전체데이터 Ridge

In [139]:
from sklearn.linear_model import Ridge

In [140]:
#hyperparameter를 정하기 위한 validation set 분리
X_split_train, X_split_val, y_split_train, y_split_val = train_test_split(X_train_scaled, y_train, random_state=42)  

In [141]:
alphas = [0.01, 0.1, 1, 10, 100, 1000]
train_r2_scores = []
val_r2_scores = []

for alpha in alphas:
    # Ridge 회귀 모델 생성 및 학습
    model = Ridge(alpha=alpha)
    model.fit(X_split_train, y_split_train)

    # 훈련 세트와 검증 세트에서의 예측 수행
    y_train_hat = model.predict(X_split_train)
    y_val_hat = model.predict(X_split_val)

    # R^2 값 계산
    train_r2 = r2_score(y_split_train, y_train_hat)
    val_r2 = r2_score(y_split_val, y_val_hat)
    
    # 결과 저장
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)
    
    # 결과 출력
    print(f"Alpha: {alpha}")
    print(f"Train R^2: {train_r2:.4f}")
    print(f"Validation R^2: {val_r2:.4f}")
    print('---------------------')

Alpha: 0.01
Train R^2: 0.6802
Validation R^2: 0.6363
---------------------
Alpha: 0.1
Train R^2: 0.6800
Validation R^2: 0.6413
---------------------
Alpha: 1
Train R^2: 0.6736
Validation R^2: 0.6657
---------------------
Alpha: 10
Train R^2: 0.5924
Validation R^2: 0.6162
---------------------
Alpha: 100
Train R^2: 0.3038
Validation R^2: 0.3173
---------------------
Alpha: 1000
Train R^2: 0.0721
Validation R^2: 0.0755
---------------------


In [142]:
model = Ridge(alpha=1) #alpha = 1 채택
model.fit(X_train_scaled, y_train)

In [143]:
y_train_hat = model.predict(X_train_scaled)
y_test_hat = model.predict(X_test_scaled)

In [144]:
print('Performance for TRAIN--------')
print('Train MSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)))
print('Train RMSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)**0.5))
print('Train R^2 : {:.4f}'.format(r2_score(y_train, y_train_hat)))

print('Performance for TEST--------')
print('Test MSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)))
print('Test RMSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)**0.5))
print('Test R^2 : {:.4f}'.format(r2_score(y_test, y_test_hat)))

Performance for TRAIN--------
Train MSE : 1626552.3392
Train RMSE : 1275.3636
Train R^2 : 0.6736
Performance for TEST--------
Test MSE : 157946.5159
Test RMSE : 397.4249
Test R^2 : 0.5705


# 전체데이터 Lasso

In [145]:
from sklearn.linear_model import Lasso

In [146]:
alphas = [0.01, 0.1, 1, 10, 100]

train_r2_scores = []
val_r2_scores = []

for alpha in alphas:
    model = Lasso(alpha=alpha)  
    model.fit(X_split_train, y_split_train)

    y_train_hat = model.predict(X_split_train)
    y_val_hat = model.predict(X_split_val)

    train_r2 = r2_score(y_split_train, y_train_hat)
    val_r2 = r2_score(y_split_val, y_val_hat)
    
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)
    
    print(f"Alpha: {alpha}")
    print(f"Train R^2: {train_r2:.4f}")
    print(f"validation R^2: {val_r2:.4f}")
    print('---------------------')

Alpha: 0.01
Train R^2: 0.6802
validation R^2: 0.6357
---------------------
Alpha: 0.1
Train R^2: 0.6801
validation R^2: 0.6356
---------------------
Alpha: 1
Train R^2: 0.6798
validation R^2: 0.6357
---------------------
Alpha: 10
Train R^2: 0.6727
validation R^2: 0.6355
---------------------
Alpha: 100
Train R^2: 0.3765
validation R^2: 0.3607
---------------------


  model = cd_fast.enet_coordinate_descent(


In [147]:
model = Lasso(alpha=1) #1채택
model.fit(X_train_scaled, y_train)

In [148]:
y_train_hat = model.predict(X_train_scaled)
y_test_hat = model.predict(X_test_scaled)

In [149]:
print('Performance for TRAIN--------')
print('Train MSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)))
print('Train RMSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)**0.5))
print('Train R^2 : {:.4f}'.format(r2_score(y_train, y_train_hat)))

print('Performance for TEST--------')
print('Test MSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)))
print('Test RMSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)**0.5))
print('Test R^2 : {:.4f}'.format(r2_score(y_test, y_test_hat)))

Performance for TRAIN--------
Train MSE : 1617113.8318
Train RMSE : 1271.6579
Train R^2 : 0.6755
Performance for TEST--------
Test MSE : 133804.6741
Test RMSE : 365.7932
Test R^2 : 0.6362
