In [None]:
# --- Imports -----------------------------------------------------------------
# Standard library
import warnings

# Third-party
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Colab-only helper used to reach the CSV files stored on Google Drive.
from google.colab import drive

# --- Environment setup -------------------------------------------------------
# NOTE(review): a blanket 'ignore' also hides scikit-learn deprecation and
# fit-failure warnings; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Mount Google Drive so the notebook can read and write files under /content/drive.
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive/

In [None]:
from sklearn.preprocessing import StandardScaler

# Load the training data.
data = pd.read_csv('PRSA_TRAIN.csv')

# Report, then drop, the rows whose target (pm2.5) is missing.
print(f'Initial missing pm2.5 values: {data["pm2.5"].isnull().sum()}')
data = data.dropna(subset=['pm2.5'])
print(f'Missing pm2.5 values after drop: {data["pm2.5"].isnull().sum()}')

# One-hot encode the categorical 'cbwd' column.
data_encoded = pd.get_dummies(data, columns=['cbwd'])

# Separate the features from the target.
feature_columns = data_encoded.drop(columns=['pm2.5']).columns
X = data_encoded[feature_columns]
y = data_encoded['pm2.5']

# Split BEFORE scaling so that no statistics of the hold-out rows leak into
# the scaler that the model is trained with.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits.
# The results are wrapped back into DataFrames so the model keeps feature names.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_columns, index=X_test.index)

# Random forest regressor and the hyper-parameter grid to search.
rf_regressor = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
    # regressor it meant "use all features", which is spelled 1.0.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Exhaustive 3-fold search over the grid (648 combinations -> 1944 fits).
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# With the default refit=True, best_estimator_ has already been refitted on
# the whole of X_train, so it is used as-is rather than being trained again.
best_rf_regressor = grid_search.best_estimator_

# Evaluate on the hold-out split.
y_pred = best_rf_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared (R²) Score: {r2}')

# Persist the tuned model.
joblib.dump(best_rf_regressor, 'rf_regressor_model.pkl')

# Load the test data and encode it the same way as the training data.
test_data = pd.read_csv('PRSA_TEST.csv')
test_data_encoded = pd.get_dummies(test_data, columns=['cbwd'])

# Add, filled with 0, any training feature column absent from the test data
# (e.g. a 'cbwd' category that never occurs in the test file).
for col in feature_columns.difference(test_data_encoded.columns):
    test_data_encoded[col] = 0

# Scale a separate copy of the features, in training column order, so the
# saved CSV keeps the original (unscaled) feature values.
test_features_scaled = pd.DataFrame(
    scaler.transform(test_data_encoded[feature_columns]),
    columns=feature_columns,
    index=test_data_encoded.index,
)

# Predict.
test_predictions = best_rf_regressor.predict(test_features_scaled)

# Append the predictions to the test data.
test_data_encoded['Predicted_PRSA_TEST'] = test_predictions

# Save the results.
test_data_encoded.to_csv('TEST_with_predictions.csv', index=False)

In [None]:
# Baseline: an untuned random forest trained on the unscaled features.
# NOTE(review): this cell writes 'rf_regressor_model.pkl' and
# 'TEST_with_predictions.csv', the same paths the grid-search cell above uses,
# so on a top-to-bottom run the baseline artifacts replace the tuned ones.
data = pd.read_csv('PRSA_TRAIN.csv')

# Report, then drop, the rows whose target (pm2.5) is missing.
print(f'Initial missing pm2.5 values: {data["pm2.5"].isnull().sum()}')
data = data.dropna(subset=['pm2.5'])
print(f'Missing pm2.5 values after drop: {data["pm2.5"].isnull().sum()}')

# Perform one-hot encoding on the 'cbwd' column
data = pd.get_dummies(data, columns=['cbwd'])

# Prepare the data for modeling
X = data.drop(columns=['pm2.5'])
y = data['pm2.5']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Make predictions
y_pred = rf_regressor.predict(X_test)

# Calculate the Mean Squared Error, RMSE and R^2 score
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared (R²) Score: {r2}')

# Save the trained model to a file
joblib.dump(rf_regressor, 'rf_regressor_model.pkl')

# Load the test data and encode it the same way as the training data
test_data = pd.get_dummies(pd.read_csv('PRSA_TEST.csv'), columns=['cbwd'])

# Align the test features with the training features: same columns, same
# order, with 0 for any one-hot column that does not occur in the test file.
# Without this, predict() fails whenever the two files encode differently.
test_features = test_data.reindex(columns=X.columns, fill_value=0)

# Make predictions on the test data
test_predictions = rf_regressor.predict(test_features)

# Append the predictions to the test data
test_data['Predicted_PRSA_TEST'] = test_predictions

# Save the results to a new CSV file
test_data.to_csv('TEST_with_predictions.csv', index=False)