# 성수기 linear regression

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import statsmodels.api as sm

In [89]:
# Load the raw dataset; the CSV is stored in the cp949 (Korean Windows) encoding.
data = pd.read_csv("data678.csv", encoding="cp949")

In [90]:
# Strip thousands separators from the mosquito-count column and store it as int.
# The column is cast to str first so the cell stays idempotent: after one run
# the column is already int, and calling the .str accessor on it directly
# would raise AttributeError on a re-run.
# regex=False makes the comma a literal match regardless of pandas version.
data['모기 개체수'] = (
    data['모기 개체수']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

In [91]:
# Predictor columns used by every model in this notebook.
features = [
    # Weather measurements
    '평균기온',
    '최저기온',
    '일강수량(mm)',
    '평균 풍속(m/s)',
    '최소 상대습도(%)',
    '평균 상대습도(%)',
    '평균 현지기압(hPa)',
    '평균 해면기압(hPa)',
    '합계 일조시간(hr)',
    '평균 전운량(1/10)',
    # Lagged mosquito counts (1, 2 and 3 days earlier)
    '1일전 모기 개체수',
    '2일전 모기 개체수',
    '3일전 모기 개체수',
]

# Column to predict.
target = '모기 개체수'

In [92]:
# Separate predictors and target, then make a seeded random train/test split
# (test_size is left at the library default).
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [93]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training data only, then apply the
# same fitted scaling to both the train and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [94]:
# Add the intercept (constant) column that statsmodels OLS needs to both
# scaled matrices. This overwrites the scaled arrays in place.
X_train_scaled, X_test_scaled = (
    sm.add_constant(X_train_scaled),
    sm.add_constant(X_test_scaled),
)

In [95]:
# Ordinary least squares on the scaled training matrix (constant included).
model = sm.OLS(endog=y_train, exog=X_train_scaled)
results = model.fit()

In [96]:
# Print the statsmodels OLS summary (fit statistics and the coefficient table).
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 모기 개체수   R-squared:                       0.675
Model:                            OLS   Adj. R-squared:                  0.668
Method:                 Least Squares   F-statistic:                     93.86
Date:                Tue, 28 May 2024   Prob (F-statistic):          7.40e-134
Time:                        21:18:52   Log-Likelihood:                -5160.3
No. Observations:                 601   AIC:                         1.035e+04
Df Residuals:                     587   BIC:                         1.041e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1615.0483    669.938      2.411      0.0

In [97]:
# Fit a scikit-learn LinearRegression on the scaled training matrix.
# This rebinds `model`, which previously referred to the statsmodels OLS model.
# NOTE(review): X_train_scaled was overwritten by sm.add_constant in an earlier
# cell, so it presumably still carries the constant column here, while
# LinearRegression fits its own intercept by default. That would make the
# column redundant; confirm this is intended.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [98]:
# Predictions of the fitted model on the train and test matrices.
y_train_hat, y_test_hat = (
    model.predict(design) for design in (X_train_scaled, X_test_scaled)
)

In [99]:
# Report MSE, RMSE and R^2 for the train split, then for the test split.
for split, actual, predicted in (
    ('Train', y_train, y_train_hat),
    ('Test', y_test, y_test_hat),
):
    # Compute the MSE once and derive the RMSE from it.
    split_mse = mean_squared_error(actual, predicted)
    print('Performance for {}--------'.format(split.upper()))
    print('{} MSE : {:.4f}'.format(split, split_mse))
    print('{} RMSE : {:.4f}'.format(split, split_mse**0.5))
    print('{} R^2 : {:.4f}'.format(split, r2_score(actual, predicted)))

Performance for TRAIN--------
Train MSE : 1680151.5197
Train RMSE : 1296.2066
Train R^2 : 0.6752
Performance for TEST--------
Test MSE : 884992.0604
Test RMSE : 940.7402
Test R^2 : 0.7607


# 성수기 Ridge

In [100]:
from sklearn.linear_model import Ridge

In [101]:
# Two-stage seeded split into train / validation / test.
# Both calls use a 25% hold-out (the library default, spelled out here), so
# the resulting shares of the full data are:
#   test = 25%, validation = 0.25 x 0.75 = 18.75%, train = 56.25%.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

In [102]:
# Refit the min-max scaler on the new training split only, then apply the
# same fitted scaling to the train, validation and test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [103]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Sweep the Ridge regularisation strength and record R^2 on the training and
# validation splits for every candidate alpha.
alphas = [0.01, 0.1, 1, 10, 100, 1000]
train_r2_scores, val_r2_scores = [], []

for alpha in alphas:
    # Build and fit the Ridge regression model for this alpha.
    model = Ridge(alpha=alpha).fit(X_train_scaled, y_train)

    # Predict on the training and validation sets.
    y_train_hat, y_val_hat = model.predict(X_train_scaled), model.predict(X_val_scaled)

    # Compute R^2 on each split.
    train_r2, val_r2 = r2_score(y_train, y_train_hat), r2_score(y_val, y_val_hat)

    # Store the scores.
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)

    # Print the scores for this alpha.
    print(
        f"Alpha: {alpha}",
        f"Train R^2: {train_r2:.4f}",
        f"Validation R^2: {val_r2:.4f}",
        '---------------------',
        sep='\n',
    )


Alpha: 0.01
Train R^2: 0.6689
Validation R^2: 0.7060
---------------------
Alpha: 0.1
Train R^2: 0.6683
Validation R^2: 0.7109
---------------------
Alpha: 1
Train R^2: 0.6528
Validation R^2: 0.7171
---------------------
Alpha: 10
Train R^2: 0.5131
Validation R^2: 0.5705
---------------------
Alpha: 100
Train R^2: 0.1501
Validation R^2: 0.1500
---------------------
Alpha: 1000
Train R^2: 0.0201
Validation R^2: 0.0032
---------------------


In [104]:
# alpha=1 adopted: it has the highest validation R^2 in the recorded sweep output.
# The model is fit on the scaled training split only.
model = Ridge(alpha=1)
model.fit(X_train_scaled, y_train)

In [105]:
# Ridge predictions on the train and test matrices.
y_train_hat, y_test_hat = (
    model.predict(design) for design in (X_train_scaled, X_test_scaled)
)

In [106]:
# Tabulate each feature name next to its fitted Ridge coefficient.
# Coefficients are on the min-max-scaled feature scale.
coefficients = pd.DataFrame({'Feature': features, 'Ridge Coefficient': model.coef_})
print(coefficients)

         Feature  Ridge Coefficient
0           평균기온       -1388.054287
1           최저기온        1477.976006
2       일강수량(mm)         228.662041
3     평균 풍속(m/s)         320.053887
4     최소 상대습도(%)       -1262.198464
5     평균 상대습도(%)        -759.555622
6   평균 현지기압(hPa)         410.511887
7   평균 해면기압(hPa)         414.385613
8    합계 일조시간(hr)        -328.002001
9   평균 전운량(1/10)          34.326922
10    1일전 모기 개체수        8644.969272
11    2일전 모기 개체수        3054.781250
12    3일전 모기 개체수        5369.662776


In [107]:
# Report MSE, RMSE and R^2 for the train split, then for the test split.
for split, actual, predicted in (
    ('Train', y_train, y_train_hat),
    ('Test', y_test, y_test_hat),
):
    # Compute the MSE once and derive the RMSE from it.
    split_mse = mean_squared_error(actual, predicted)
    print('Performance for {}--------'.format(split.upper()))
    print('{} MSE : {:.4f}'.format(split, split_mse))
    print('{} RMSE : {:.4f}'.format(split, split_mse**0.5))
    print('{} R^2 : {:.4f}'.format(split, r2_score(actual, predicted)))

Performance for TRAIN--------
Train MSE : 1764936.9619
Train RMSE : 1328.5093
Train R^2 : 0.6528
Performance for TEST--------
Test MSE : 1403290.6920
Test RMSE : 1184.6057
Test R^2 : 0.6488


# 성수기 Lasso

In [108]:
from sklearn.linear_model import Lasso

In [109]:
# Sweep the Lasso regularisation strength and record R^2 on the training and
# validation splits for every candidate alpha.
alphas = [0.01, 0.1, 1, 10, 100]
train_r2_scores = []
val_r2_scores = []

for alpha in alphas:
    # Build and fit the Lasso regression model for this alpha.
    # max_iter is raised well above the library default: the recorded output
    # of this cell contains a warning from the coordinate-descent solver,
    # which indicates that some fits stopped at the iteration cap before
    # converging. The solver still stops early once its tolerance is met, so
    # fits that already converged are unaffected.
    model = Lasso(alpha=alpha, max_iter=100000)
    model.fit(X_train_scaled, y_train)

    # Predict on the training and validation sets.
    y_train_hat = model.predict(X_train_scaled)
    y_val_hat = model.predict(X_val_scaled)

    # Compute R^2 on each split.
    train_r2 = r2_score(y_train, y_train_hat)
    val_r2 = r2_score(y_val, y_val_hat)

    # Store the scores.
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)

    # Print the scores for this alpha.
    print(f"Alpha: {alpha}")
    print(f"Train R^2: {train_r2:.4f}")
    print(f"Validation R^2: {val_r2:.4f}")
    print('---------------------')

Alpha: 0.01
Train R^2: 0.6690
Validation R^2: 0.7054
---------------------
Alpha: 0.1
Train R^2: 0.6689
Validation R^2: 0.7054
---------------------
Alpha: 1
Train R^2: 0.6683
Validation R^2: 0.7075
---------------------
Alpha: 10
Train R^2: 0.6567
Validation R^2: 0.7065
---------------------
Alpha: 100
Train R^2: 0.3976
Validation R^2: 0.4509
---------------------


  model = cd_fast.enet_coordinate_descent(


In [110]:
# alpha=1 adopted: it has the highest validation R^2 in the recorded sweep output.
# max_iter is raised above the library default because the Lasso sweep's
# recorded output contains a coordinate-descent warning; the larger cap keeps
# the final model from stopping before convergence. Fits that converge
# earlier are unaffected.
model = Lasso(alpha=1, max_iter=100000)
model.fit(X_train_scaled, y_train)

In [111]:
# Lasso predictions on the train and test matrices.
y_train_hat, y_test_hat = (
    model.predict(design) for design in (X_train_scaled, X_test_scaled)
)

In [112]:
# Tabulate each feature name next to its fitted Lasso coefficient.
# Coefficients are on the min-max-scaled feature scale.
coefficients = pd.DataFrame({'Feature': features, 'Lasso Coefficient': model.coef_})
print(coefficients)

         Feature  Lasso Coefficient
0           평균기온       -3012.728133
1           최저기온        2922.060565
2       일강수량(mm)         234.888720
3     평균 풍속(m/s)         117.695774
4     최소 상대습도(%)       -1164.261225
5     평균 상대습도(%)        -862.389106
6   평균 현지기압(hPa)         740.862941
7   평균 해면기압(hPa)           0.000000
8    합계 일조시간(hr)         -54.623455
9   평균 전운량(1/10)         -19.728649
10    1일전 모기 개체수       12933.132639
11    2일전 모기 개체수         706.930098
12    3일전 모기 개체수        5246.503019


In [113]:
# Report MSE, RMSE and R^2 for the train split, then for the test split.
for split, actual, predicted in (
    ('Train', y_train, y_train_hat),
    ('Test', y_test, y_test_hat),
):
    # Compute the MSE once and derive the RMSE from it.
    split_mse = mean_squared_error(actual, predicted)
    print('Performance for {}--------'.format(split.upper()))
    print('{} MSE : {:.4f}'.format(split, split_mse))
    print('{} RMSE : {:.4f}'.format(split, split_mse**0.5))
    print('{} R^2 : {:.4f}'.format(split, r2_score(actual, predicted)))

Performance for TRAIN--------
Train MSE : 1686294.3563
Train RMSE : 1298.5740
Train R^2 : 0.6683
Performance for TEST--------
Test MSE : 1345976.2104
Test RMSE : 1160.1621
Test R^2 : 0.6632
