# 비수기 linear regression

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [24]:
data = pd.read_csv(r"data45910.csv", encoding='cp949')

In [25]:
data['모기 개체수'] = data['모기 개체수'].str.replace(',', '').astype(int)

In [26]:
features = [
    '평균기온', '최저기온','일강수량(mm)', '평균 풍속(m/s)',
     '최소 상대습도(%)','평균 상대습도(%)', '평균 현지기압(hPa)', '평균 해면기압(hPa)',
    '합계 일조시간(hr)', '평균 전운량(1/10)', '1일전 모기 개체수','2일전 모기 개체수','3일전 모기 개체수']

target = '모기 개체수'

In [27]:
X = data[features]
y = data[target]

In [28]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=0)

In [29]:
model = LinearRegression()
model.fit(X_train, y_train)

In [30]:
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

In [31]:
print('Performance for TRAIN--------')
print('Train MSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)))
print('Train RMSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)**0.5))
print('Train R^2 : {:.4f}'.format(r2_score(y_train, y_train_hat)))

print('Performance for TEST--------')
print('Test MSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)))
print('Test RMSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)**0.5))
print('Test R^2 : {:.4f}'.format(r2_score(y_test, y_test_hat)))

Performance for TRAIN--------
Train MSE : 1100709.5041
Train RMSE : 1049.1470
Train R^2 : 0.6474
Performance for TEST--------
Test MSE : 2140568.5378
Test RMSE : 1463.0682
Test R^2 : 0.5591


# 성수기 Ridge

In [32]:
from sklearn.linear_model import Ridge

In [33]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

In [34]:
alphas = [0.01, 0.1, 1, 10, 100, 1000]
train_r2_scores = []
val_r2_scores = []

for alpha in alphas:
    # Ridge 회귀 모델 생성 및 학습
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)

    # 훈련 세트와 검증 세트에서의 예측 수행
    y_train_hat = model.predict(X_train)
    y_val_hat = model.predict(X_val)

    # R^2 값 계산
    train_r2 = r2_score(y_train, y_train_hat)
    val_r2 = r2_score(y_val, y_val_hat)
    
    # 결과 저장
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)
    
    # 결과 출력
    print(f"Alpha: {alpha}")
    print(f"Train R^2: {train_r2:.4f}")
    print(f"Validation R^2: {val_r2:.4f}")
    print('---------------------')

Alpha: 0.01
Train R^2: 0.7361
Validation R^2: 0.7032
---------------------
Alpha: 0.1
Train R^2: 0.7358
Validation R^2: 0.7009
---------------------
Alpha: 1
Train R^2: 0.7269
Validation R^2: 0.6764
---------------------
Alpha: 10
Train R^2: 0.6034
Validation R^2: 0.5047
---------------------
Alpha: 100
Train R^2: 0.2474
Validation R^2: 0.1546
---------------------
Alpha: 1000
Train R^2: 0.0386
Validation R^2: -0.0104
---------------------


In [35]:
model = Ridge(alpha=0.01) #0.01 채택
model.fit(X_train, y_train)

In [36]:
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

In [37]:
print('Performance for TRAIN--------')
print('Train MSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)))
print('Train RMSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)**0.5))
print('Train R^2 : {:.4f}'.format(r2_score(y_train, y_train_hat)))

print('Performance for TEST--------')
print('Test MSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)))
print('Test RMSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)**0.5))
print('Test R^2 : {:.4f}'.format(r2_score(y_test, y_test_hat)))

Performance for TRAIN--------
Train MSE : 617991.8768
Train RMSE : 786.1246
Train R^2 : 0.7361
Performance for TEST--------
Test MSE : 3853036.0227
Test RMSE : 1962.9152
Test R^2 : 0.3513


# 성수기 Lasso

In [38]:
from sklearn.linear_model import Lasso

In [39]:
alphas = [0.01, 0.1, 1, 10, 100]

train_r2_scores = []
val_r2_scores = []

for alpha in alphas:
    model = Lasso(alpha=alpha)  
    model.fit(X_train, y_train)

    y_train_hat = model.predict(X_train)
    y_val_hat = model.predict(X_val)

    train_r2 = r2_score(y_train, y_train_hat)
    val_r2 = r2_score(y_val, y_val_hat)
    
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)
    
    print(f"Alpha: {alpha}")
    print(f"Train R^2: {train_r2:.4f}")
    print(f"validation R^2: {val_r2:.4f}")
    print('---------------------')

Alpha: 0.01
Train R^2: 0.7361
validation R^2: 0.7035
---------------------
Alpha: 0.1
Train R^2: 0.7359
validation R^2: 0.7033
---------------------
Alpha: 1
Train R^2: 0.7354
validation R^2: 0.7003
---------------------
Alpha: 10
Train R^2: 0.7244
validation R^2: 0.6642
---------------------
Alpha: 100
Train R^2: 0.2931
validation R^2: 0.1730
---------------------


  model = cd_fast.enet_coordinate_descent(


In [40]:
model = Lasso(alpha=1) #1채택
model.fit(X_train, y_train)

In [41]:
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

In [42]:
print('Performance for TRAIN--------')
print('Train MSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)))
print('Train RMSE : {:.4f}'.format(mean_squared_error(y_train, y_train_hat)**0.5))
print('Train R^2 : {:.4f}'.format(r2_score(y_train, y_train_hat)))

print('Performance for TEST--------')
print('Test MSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)))
print('Test RMSE : {:.4f}'.format(mean_squared_error(y_test, y_test_hat)**0.5))
print('Test R^2 : {:.4f}'.format(r2_score(y_test, y_test_hat)))

Performance for TRAIN--------
Train MSE : 619473.5779
Train RMSE : 787.0664
Train R^2 : 0.7354
Performance for TEST--------
Test MSE : 3862710.9251
Test RMSE : 1965.3781
Test R^2 : 0.3497
