# 전체 데이터 Linear Regression

In [112]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import statsmodels.api as sm

In [113]:
# Load the full dataset from the working directory; the file is decoded as cp949.
data = pd.read_csv("data.csv", encoding="cp949")

In [114]:
# Predictor columns: daily weather measurements followed by the mosquito
# counts of the three preceding days. The order matters because the
# coefficient tables later pair this list positionally with model.coef_.
features = [
    '평균기온',            # mean temperature
    '최저기온',            # minimum temperature
    '일강수량(mm)',        # daily precipitation
    '평균 풍속(m/s)',      # mean wind speed
    '최소 상대습도(%)',    # minimum relative humidity
    '평균 상대습도(%)',    # mean relative humidity
    '평균 현지기압(hPa)',  # mean station pressure
    '평균 해면기압(hPa)',  # mean sea-level pressure
    '합계 일조시간(hr)',   # total sunshine duration
    '평균 전운량(1/10)',   # mean total cloud cover
    '1일전 모기 개체수',   # mosquito count 1 day earlier
    '2일전 모기 개체수',   # mosquito count 2 days earlier
    '3일전 모기 개체수',   # mosquito count 3 days earlier
]

# Response column: the mosquito count.
target = '모기 개체수'

In [115]:
# Separate the predictor columns from the response column.
X, y = data[features], data[target]

# Hold-out split. No test_size is passed, so scikit-learn's default applies
# (25% of rows go to the test set); random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [116]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max on the training split only, then apply that
# same mapping to both splits so the test rows never influence the scaling.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [117]:
# Put an intercept column of ones in front of the scaled features for the
# statsmodels OLS fit below. The keyword values are the library defaults,
# written out: has_constant='skip' returns the input unchanged when it already
# contains a constant non-zero column, which also keeps this cell from
# stacking a second column of ones if it is run again.
X_train_scaled = sm.add_constant(X_train_scaled, prepend=True, has_constant='skip')
X_test_scaled = sm.add_constant(X_test_scaled, prepend=True, has_constant='skip')

In [118]:
# Ordinary least squares of the target on the intercept-augmented training matrix.
model = sm.OLS(endog=y_train, exog=X_train_scaled)
results = model.fit()

In [119]:
# Show the fitted OLS regression table as plain text.
ols_summary = results.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:                 모기 개체수   R-squared:                       0.674
Model:                            OLS   Adj. R-squared:                  0.671
Method:                 Least Squares   F-statistic:                     194.5
Date:                Tue, 28 May 2024   Prob (F-statistic):          2.36e-286
Time:                        21:18:57   Log-Likelihood:                -10547.
No. Observations:                1236   AIC:                         2.112e+04
Df Residuals:                    1222   BIC:                         2.119e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        445.2961    643.649      0.692      0.4

In [120]:
# scikit-learn counterpart of the OLS fit above; rebinds `model`.
# NOTE(review): X_train_scaled still contains the constant column added by
# sm.add_constant, while LinearRegression fits its own intercept by default,
# so that column is redundant here. It is kept because the prediction cell
# below feeds the same matrices to this model.
model = LinearRegression()
model.fit(X=X_train_scaled, y=y_train)

In [121]:
# In-sample and hold-out predictions from the fitted linear model.
y_train_hat, y_test_hat = (
    model.predict(X_train_scaled),
    model.predict(X_test_scaled),
)

In [122]:
# Report MSE, RMSE and R^2 for the training split, then for the test split.
# The MSE is computed once per split and reused for the RMSE line.
for banner, prefix, y_true, y_pred in (
    ('Performance for TRAIN--------', 'Train', y_train, y_train_hat),
    ('Performance for TEST--------', 'Test', y_test, y_test_hat),
):
    mse = mean_squared_error(y_true, y_pred)
    print(banner)
    print(f'{prefix} MSE : {mse:.4f}')
    print(f'{prefix} RMSE : {mse ** 0.5:.4f}')
    print(f'{prefix} R^2 : {r2_score(y_true, y_pred):.4f}')

Performance for TRAIN--------
Train MSE : 1512177.6301
Train RMSE : 1229.7063
Train R^2 : 0.6742
Performance for TEST--------
Test MSE : 1258855.2636
Test RMSE : 1121.9872
Test R^2 : 0.7269


# 전체 데이터 Ridge

In [123]:
from sklearn.linear_model import Ridge

In [124]:
# Two-stage shuffled split into train / validation / test sets.
#
# Stage 1 holds out 25% of all rows as the test set. Stage 2 holds out 25% of
# the remaining 75% as the validation set, so the final proportions are about
# 56.25% train / 18.75% validation / 25% test.
#
# test_size=0.25 is scikit-learn's default and was previously left implicit;
# it is written out so the proportions can be read from the code. Results are
# unchanged. Both stages use random_state=42 for reproducibility.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)  # 0.25 x 0.75 = 0.1875 of all rows

In [125]:
# Refit the min-max scaler on the new training split and reuse that mapping
# for the validation and test splits. This rebinds scaler, X_train_scaled and
# X_test_scaled from the linear-regression section.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

In [126]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Candidate regularisation strengths for the sweep.
alphas = [0.01, 0.1, 1, 10, 100, 1000]
# R^2 per alpha, in the same order as `alphas`.
train_r2_scores, val_r2_scores = [], []

for alpha in alphas:
    # Fit a Ridge model with this alpha on the scaled training split.
    model = Ridge(alpha=alpha).fit(X_train_scaled, y_train)

    # Predict on the training and validation splits.
    y_train_hat = model.predict(X_train_scaled)
    y_val_hat = model.predict(X_val_scaled)

    # Score both splits and record the results.
    train_r2 = r2_score(y_train, y_train_hat)
    val_r2 = r2_score(y_val, y_val_hat)
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)

    # Print one report block per alpha.
    print(
        f"Alpha: {alpha}\n"
        f"Train R^2: {train_r2:.4f}\n"
        f"Validation R^2: {val_r2:.4f}\n"
        "---------------------"
    )


Alpha: 0.01
Train R^2: 0.7335
Validation R^2: 0.5475
---------------------
Alpha: 0.1
Train R^2: 0.7332
Validation R^2: 0.5498
---------------------
Alpha: 1
Train R^2: 0.7229
Validation R^2: 0.5552
---------------------
Alpha: 10
Train R^2: 0.6329
Validation R^2: 0.5131
---------------------
Alpha: 100
Train R^2: 0.2985
Validation R^2: 0.2498
---------------------
Alpha: 1000
Train R^2: 0.0542
Validation R^2: 0.0452
---------------------


In [127]:
# Final Ridge model: alpha=1 is adopted because it had the highest validation
# R^2 in the recorded sweep above.
model = Ridge(alpha=1)
model.fit(X=X_train_scaled, y=y_train)

In [128]:
# Training and test predictions from the selected Ridge model.
y_train_hat, y_test_hat = (
    model.predict(X_train_scaled),
    model.predict(X_test_scaled),
)

In [129]:
# Pair every feature name with its fitted Ridge coefficient. The pairing is
# positional, so it relies on `features` being in the column order used for fitting.
coefficient_columns = {
    'Feature': features,
    'Ridge Coefficient': model.coef_,
}
coefficients = pd.DataFrame(coefficient_columns)
print(coefficients)

         Feature  Ridge Coefficient
0           평균기온         700.430228
1           최저기온         764.584267
2       일강수량(mm)          19.945391
3     평균 풍속(m/s)         381.486991
4     최소 상대습도(%)       -1342.287146
5     평균 상대습도(%)         254.641259
6   평균 현지기압(hPa)         -50.436098
7   평균 해면기압(hPa)         -88.882696
8    합계 일조시간(hr)         -51.611176
9   평균 전운량(1/10)         213.615653
10    1일전 모기 개체수       11005.826915
11    2일전 모기 개체수        2560.612783
12    3일전 모기 개체수        4716.953933


In [130]:
# Report MSE, RMSE and R^2 of the Ridge model for the training split, then
# for the test split. The MSE is computed once per split and reused.
for banner, prefix, y_true, y_pred in (
    ('Performance for TRAIN--------', 'Train', y_train, y_train_hat),
    ('Performance for TEST--------', 'Test', y_test, y_test_hat),
):
    mse = mean_squared_error(y_true, y_pred)
    print(banner)
    print(f'{prefix} MSE : {mse:.4f}')
    print(f'{prefix} RMSE : {mse ** 0.5:.4f}')
    print(f'{prefix} R^2 : {r2_score(y_true, y_pred):.4f}')

Performance for TRAIN--------
Train MSE : 1207197.8476
Train RMSE : 1098.7256
Train R^2 : 0.7229
Performance for TEST--------
Test MSE : 1183232.3424
Test RMSE : 1087.7648
Test R^2 : 0.7227


# 전체 데이터 Lasso

In [131]:
from sklearn.linear_model import Lasso

In [132]:
# Candidate L1 penalty strengths for the Lasso sweep.
alphas = [0.01, 0.1, 1, 10, 100]
# R^2 per alpha, in the same order as `alphas`.
train_r2_scores = []
val_r2_scores = []

# The recorded run of this sweep ended with a warning raised from the
# coordinate-descent solver under the default cap of 1000 iterations, so the
# cap is raised to give the weakly penalised fits room to converge.
LASSO_MAX_ITER = 10000

for alpha in alphas:
    # Create and fit a Lasso regression model for this alpha.
    model = Lasso(alpha=alpha, max_iter=LASSO_MAX_ITER)
    model.fit(X_train_scaled, y_train)

    # Predict on the training and validation splits.
    y_train_hat = model.predict(X_train_scaled)
    y_val_hat = model.predict(X_val_scaled)

    # Compute R^2 on both splits.
    train_r2 = r2_score(y_train, y_train_hat)
    val_r2 = r2_score(y_val, y_val_hat)

    # Record the scores.
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)

    # Print the report for this alpha.
    print(f"Alpha: {alpha}")
    print(f"Train R^2: {train_r2:.4f}")
    print(f"Validation R^2: {val_r2:.4f}")
    print('---------------------')

Alpha: 0.01
Train R^2: 0.7335
Validation R^2: 0.5472
---------------------
Alpha: 0.1
Train R^2: 0.7335
Validation R^2: 0.5471
---------------------
Alpha: 1
Train R^2: 0.7332
Validation R^2: 0.5473
---------------------
Alpha: 10
Train R^2: 0.7270
Validation R^2: 0.5420
---------------------
Alpha: 100
Train R^2: 0.3840
Validation R^2: 0.3318
---------------------


  model = cd_fast.enet_coordinate_descent(


In [133]:
# Final Lasso model: alpha=1 is adopted; in the recorded sweep above it had
# the (marginally) highest validation R^2.
model = Lasso(alpha=1)
model.fit(X=X_train_scaled, y=y_train)

In [134]:
# Training and test predictions from the selected Lasso model.
y_train_hat, y_test_hat = (
    model.predict(X_train_scaled),
    model.predict(X_test_scaled),
)

In [135]:
# Pair every feature name with its fitted Lasso coefficient. The pairing is
# positional, so it relies on `features` being in the column order used for fitting.
coefficient_columns = {
    'Feature': features,
    'Lasso Coefficient': model.coef_,
}
coefficients = pd.DataFrame(coefficient_columns)
print(coefficients)

         Feature  Lasso Coefficient
0           평균기온         658.912013
1           최저기온         562.440512
2       일강수량(mm)           0.000000
3     평균 풍속(m/s)         280.860669
4     최소 상대습도(%)       -1092.198011
5     평균 상대습도(%)          72.116573
6   평균 현지기압(hPa)         -29.861797
7   평균 해면기압(hPa)         -95.583201
8    합계 일조시간(hr)          -0.000000
9   평균 전운량(1/10)         249.146019
10    1일전 모기 개체수       15371.271332
11    2일전 모기 개체수         147.523511
12    3일전 모기 개체수        4611.876017


In [136]:
# Report MSE, RMSE and R^2 of the Lasso model for the training split, then
# for the test split. The MSE is computed once per split and reused.
for banner, prefix, y_true, y_pred in (
    ('Performance for TRAIN--------', 'Train', y_train, y_train_hat),
    ('Performance for TEST--------', 'Test', y_test, y_test_hat),
):
    mse = mean_squared_error(y_true, y_pred)
    print(banner)
    print(f'{prefix} MSE : {mse:.4f}')
    print(f'{prefix} RMSE : {mse ** 0.5:.4f}')
    print(f'{prefix} R^2 : {r2_score(y_true, y_pred):.4f}')

Performance for TRAIN--------
Train MSE : 1162133.0362
Train RMSE : 1078.0227
Train R^2 : 0.7332
Performance for TEST--------
Test MSE : 1302716.2471
Test RMSE : 1141.3660
Test R^2 : 0.6947
