In [184]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import statsmodels.api as sm

In [185]:
# Load data2015_2017.csv into a DataFrame, decoding the file as cp949.
data = pd.read_csv(
    'data2015_2017.csv',
    encoding='cp949',
)

In [186]:
# Convert the target column from comma-containing strings (presumably
# thousands separators - confirm against the CSV) into integers.
# Casting to str first keeps this cell idempotent: once the column is int it
# has no .str accessor, so re-running the original expression raised
# AttributeError. regex=False makes the comma a literal match.
data['모기 개체수'] = (
    data['모기 개체수']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

In [187]:
# Name of the column the models predict.
target = '모기 개체수'

# Predictor columns, in the order the models receive them: ten weather
# columns followed by the target lagged by 1, 2 and 3 days.
features = [
    '평균기온',
    '최저기온',
    '일강수량(mm)',
    '평균 풍속(m/s)',
    '최소 상대습도(%)',
    '평균 상대습도(%)',
    '평균 현지기압(hPa)',
    '평균 해면기압(hPa)',
    '합계 일조시간(hr)',
    '평균 전운량(1/10)',
    *(f'{lag}일전 {target}' for lag in (1, 2, 3)),
]

In [188]:
# Separate predictors from the target, then make a seeded random split.
# No test_size is passed, so scikit-learn's default proportion applies.
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [189]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training rows only, then apply
# that same mapping to both splits so the test rows do not influence it.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [190]:
# Prepend an explicit intercept column of ones for the statsmodels OLS fit.
# has_constant='add' forces the column even if a feature happens to be
# constant within a split; the default ('skip') would silently return the
# matrix without an intercept column, so train and test could end up with
# different widths.
X_train_scaled_ = sm.add_constant(X_train_scaled, has_constant='add')
X_test_scaled_ = sm.add_constant(X_test_scaled, has_constant='add')

In [191]:
# Ordinary least squares on the scaled training matrix (with the constant
# column added); `results` holds the fitted estimates used for the summary.
model = sm.OLS(endog=y_train, exog=X_train_scaled_)
results = model.fit()

In [192]:
# Print the text report of the fitted OLS results.
ols_summary = results.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:                 모기 개체수   R-squared:                       0.561
Model:                            OLS   Adj. R-squared:                  0.547
Method:                 Least Squares   F-statistic:                     39.39
Date:                Tue, 28 May 2024   Prob (F-statistic):           1.65e-63
Time:                        23:26:16   Log-Likelihood:                -3741.1
No. Observations:                 415   AIC:                             7510.
Df Residuals:                     401   BIC:                             7567.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1217.4962   1736.282      0.701      0.4

In [193]:
# Refit the same linear model with scikit-learn so it can be scored with
# the sklearn metrics below. Note: `model` is rebound here and no longer
# refers to the statsmodels OLS object.
model = LinearRegression()
model.fit(X=X_train_scaled, y=y_train)

In [194]:
# Predictions on both splits, in (train, test) order.
y_train_hat, y_test_hat = (
    model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

In [195]:
# Report MSE, RMSE (square root of the MSE) and R^2 for each split.
for split_name, y_true, y_hat in (
    ('Train', y_train, y_train_hat),
    ('Test', y_test, y_test_hat),
):
    mse = mean_squared_error(y_true, y_hat)
    print(f'Performance for {split_name.upper()}--------')
    print(f'{split_name} MSE : {mse:.4f}')
    print(f'{split_name} RMSE : {mse ** 0.5:.4f}')
    print(f'{split_name} R^2 : {r2_score(y_true, y_hat):.4f}')

Performance for TRAIN--------
Train MSE : 3959182.5978
Train RMSE : 1989.7695
Train R^2 : 0.5608
Performance for TEST--------
Test MSE : 3416076.7932
Test RMSE : 1848.2632
Test R^2 : 0.5516


In [196]:
# Compute the mean of each selected feature over the whole dataset.
# The result is kept as a one-row DataFrame rather than a bare ndarray:
# the scaler was fitted on a labelled DataFrame, so passing the column
# labels along lets scikit-learn check feature names and order instead of
# transforming unlabelled values.
feature_means = data[features].mean().to_frame().T

# Scale the mean row with the scaler fitted on the training split.
feature_means_scaled = scaler.transform(feature_means)

# Predict the count for this "average day" with the current model.
y_pred = model.predict(feature_means_scaled)

print(f"Predicted Mosquito Count(2015-2017): {y_pred[0]:.2f}")
print("-----------------------------")

Predicted Mosquito Count(2015-2017): 4406.35
-----------------------------




# 2015-2017 MinMax Scale, Ridge

In [197]:
from sklearn.linear_model import Ridge

In [198]:
# Two-stage seeded split: first hold out the test set, then carve a
# validation set out of what remains. This rebinds X_train / X_test /
# y_train / y_test from the earlier split, which used a different seed.
split_seed = 42
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=split_seed
)

In [199]:
# Refit the scaler on the new (smaller) training split and reuse that
# mapping for the validation and test rows.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [200]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Candidate regularisation strengths and the R^2 recorded for each one.
alphas = [0.01, 0.1, 1, 10, 100, 1000]
train_r2_scores, val_r2_scores = [], []

for alpha in alphas:
    # Fit a Ridge model at this strength on the training split.
    model = Ridge(alpha=alpha)
    model.fit(X_train_scaled, y_train)

    # Score it on the training rows and on the held-out validation rows.
    y_train_hat = model.predict(X_train_scaled)
    y_val_hat = model.predict(X_val_scaled)
    train_r2 = r2_score(y_train, y_train_hat)
    val_r2 = r2_score(y_val, y_val_hat)

    # Keep the scores, in the same order as `alphas`.
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)

    # Report this candidate.
    print(
        f"Alpha: {alpha}",
        f"Train R^2: {train_r2:.4f}",
        f"Validation R^2: {val_r2:.4f}",
        '---------------------',
        sep='\n',
    )


Alpha: 0.01
Train R^2: 0.6112
Validation R^2: 0.4275
---------------------
Alpha: 0.1
Train R^2: 0.6110
Validation R^2: 0.4312
---------------------
Alpha: 1
Train R^2: 0.6004
Validation R^2: 0.4435
---------------------
Alpha: 10
Train R^2: 0.4799
Validation R^2: 0.3929
---------------------
Alpha: 100
Train R^2: 0.1829
Validation R^2: 0.1632
---------------------
Alpha: 1000
Train R^2: 0.0272
Validation R^2: 0.0194
---------------------


In [201]:
# alpha=1 adopted: it has the highest validation R^2 in the recorded sweep.
# The model is fitted on the training split only (validation rows excluded).
model = Ridge(alpha=1)
model.fit(X=X_train_scaled, y=y_train)

In [202]:
# Predictions of the selected Ridge model, in (train, test) order.
y_train_hat, y_test_hat = (
    model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

In [203]:
# Pair every feature name with its fitted Ridge coefficient. Coefficients
# are on the min-max-scaled feature scale, since the model was fitted on
# scaled inputs.
coefficients = pd.DataFrame({'Feature': features}).assign(
    **{'Ridge Coefficient': model.coef_}
)
print(coefficients)

         Feature  Ridge Coefficient
0           평균기온        1663.998841
1           최저기온        1764.652345
2       일강수량(mm)        -144.525771
3     평균 풍속(m/s)         429.028012
4     최소 상대습도(%)        -913.120350
5     평균 상대습도(%)         239.833343
6   평균 현지기압(hPa)         109.742854
7   평균 해면기압(hPa)          32.309277
8    합계 일조시간(hr)        -170.647935
9   평균 전운량(1/10)         -26.818540
10    1일전 모기 개체수        7474.515428
11    2일전 모기 개체수        3156.100539
12    3일전 모기 개체수        3713.681276


In [204]:
# Compute both splits' errors up front, then print the report.
train_mse = mean_squared_error(y_train, y_train_hat)
test_mse = mean_squared_error(y_test, y_test_hat)

print('Performance for TRAIN--------')
print(f'Train MSE : {train_mse:.4f}')
print(f'Train RMSE : {train_mse ** 0.5:.4f}')  # RMSE = sqrt(MSE)
print(f'Train R^2 : {r2_score(y_train, y_train_hat):.4f}')

print('Performance for TEST--------')
print(f'Test MSE : {test_mse:.4f}')
print(f'Test RMSE : {test_mse ** 0.5:.4f}')
print(f'Test R^2 : {r2_score(y_test, y_test_hat):.4f}')

Performance for TRAIN--------
Train MSE : 2479618.8764
Train RMSE : 1574.6806
Train R^2 : 0.6004
Performance for TEST--------
Test MSE : 6543519.6512
Test RMSE : 2558.0304
Test R^2 : 0.5223


# 2015-2017 MinMax Scale, Lasso

In [205]:
from sklearn.linear_model import Lasso

In [206]:
# Candidate regularisation strengths and the R^2 recorded for each one.
alphas = [0.01, 0.1, 1, 10, 100]
train_r2_scores = []
val_r2_scores = []

for alpha in alphas:
    # Create and fit the Lasso regression model.
    # max_iter is raised well above the library default because the recorded
    # output of this cell ends with a coordinate-descent warning line, i.e.
    # at least one fit stopped before converging.
    model = Lasso(alpha=alpha, max_iter=100_000)
    model.fit(X_train_scaled, y_train)

    # Predict on the training and validation sets.
    y_train_hat = model.predict(X_train_scaled)
    y_val_hat = model.predict(X_val_scaled)

    # Compute R^2 on both sets.
    train_r2 = r2_score(y_train, y_train_hat)
    val_r2 = r2_score(y_val, y_val_hat)

    # Store the scores, in the same order as `alphas`.
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)

    # Report this candidate.
    print(f"Alpha: {alpha}")
    print(f"Train R^2: {train_r2:.4f}")
    print(f"Validation R^2: {val_r2:.4f}")
    print('---------------------')

Alpha: 0.01
Train R^2: 0.6113
Validation R^2: 0.4271
---------------------
Alpha: 0.1
Train R^2: 0.6112
Validation R^2: 0.4271
---------------------
Alpha: 1
Train R^2: 0.6110
Validation R^2: 0.4278
---------------------
Alpha: 10
Train R^2: 0.6063
Validation R^2: 0.4282
---------------------
Alpha: 100
Train R^2: 0.4330
Validation R^2: 0.3357
---------------------


  model = cd_fast.enet_coordinate_descent(


In [207]:
# alpha=1 adopted.
# NOTE(review): in the recorded sweep alpha=10 scored a marginally higher
# validation R^2 (0.4282 vs 0.4278) - confirm that alpha=1 is the intended
# choice.
model = Lasso(alpha=1)
model.fit(X=X_train_scaled, y=y_train)

In [208]:
# Predictions of the selected Lasso model, in (train, test) order.
y_train_hat, y_test_hat = (
    model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

In [209]:
# Pair every feature name with its fitted Lasso coefficient. A value of
# exactly zero means the L1 penalty dropped that feature from the model.
coefficients = pd.DataFrame({'Feature': features}).assign(
    **{'Lasso Coefficient': model.coef_}
)
print(coefficients)

         Feature  Lasso Coefficient
0           평균기온        1543.460310
1           최저기온        1640.759965
2       일강수량(mm)          -0.000000
3     평균 풍속(m/s)         514.326734
4     최소 상대습도(%)       -1269.753393
5     평균 상대습도(%)         437.162301
6   평균 현지기압(hPa)         319.746259
7   평균 해면기압(hPa)           0.000000
8    합계 일조시간(hr)         -95.362409
9   평균 전운량(1/10)          11.727855
10    1일전 모기 개체수       10397.656421
11    2일전 모기 개체수        1827.417305
12    3일전 모기 개체수        4226.007809


In [210]:
# Report MSE, RMSE (square root of the MSE) and R^2 for each split of the
# selected Lasso model.
splits = {
    'Train': (y_train, y_train_hat),
    'Test': (y_test, y_test_hat),
}
for split_name, (y_true, y_hat) in splits.items():
    mse = mean_squared_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    print('Performance for {}--------'.format(split_name.upper()))
    print('{} MSE : {:.4f}'.format(split_name, mse))
    print('{} RMSE : {:.4f}'.format(split_name, mse ** 0.5))
    print('{} R^2 : {:.4f}'.format(split_name, r2))

Performance for TRAIN--------
Train MSE : 2414198.7142
Train RMSE : 1553.7692
Train R^2 : 0.6110
Performance for TEST--------
Test MSE : 6090841.6972
Test RMSE : 2467.9631
Test R^2 : 0.5553
