In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# -----------------------------
# 1) 데이터 준비
# -----------------------------

df = pd.read_csv("/content/drive/MyDrive/weather.csv").dropna()
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,11/1/2007,Canberra,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,...,29,1019.7,1015.0,7,7,14.4,23.6,No,3.6,Yes
1,11/2/2007,Canberra,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,...,36,1012.4,1008.4,5,3,17.5,25.7,Yes,3.6,Yes
2,11/3/2007,Canberra,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,...,69,1009.5,1007.2,8,7,15.4,20.2,Yes,39.8,Yes
3,11/4/2007,Canberra,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,...,56,1005.5,1007.0,2,7,13.5,14.1,Yes,2.8,Yes
4,11/5/2007,Canberra,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,...,49,1018.3,1018.5,7,7,11.1,15.4,Yes,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,10/27/2008,Canberra,9.0,30.7,0.0,7.6,12.1,NNW,76.0,SSE,...,15,1016.1,1010.8,1,3,20.4,30.0,No,0.0,No
362,10/28/2008,Canberra,7.1,28.4,0.0,11.6,12.7,N,48.0,NNW,...,22,1020.0,1016.9,0,1,17.2,28.2,No,0.0,No
363,10/29/2008,Canberra,12.5,19.9,0.0,8.4,5.3,ESE,43.0,ENE,...,47,1024.0,1022.8,3,2,14.5,18.3,No,0.0,No
364,10/30/2008,Canberra,12.5,26.9,0.0,5.0,7.1,NW,46.0,SSW,...,39,1021.0,1016.2,6,7,15.8,25.9,No,0.0,No


In [38]:
# 결측치 확인
df.isnull().sum()

Unnamed: 0,0
Date,0
Location,0
MinTemp,0
MaxTemp,0
Rainfall,0
Evaporation,0
Sunshine,0
WindGustDir,0
WindGustSpeed,0
WindDir9am,0


In [39]:
# 결측치 제거
df = df.dropna()
df.isnull().sum()

Unnamed: 0,0
Date,0
Location,0
MinTemp,0
MaxTemp,0
Rainfall,0
Evaporation,0
Sunshine,0
WindGustDir,0
WindGustSpeed,0
WindDir9am,0


In [40]:
df.info() # 컬럼 정보 및 데이터 타입 확인

<class 'pandas.core.frame.DataFrame'>
Index: 328 entries, 0 to 365
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           328 non-null    object 
 1   Location       328 non-null    object 
 2   MinTemp        328 non-null    float64
 3   MaxTemp        328 non-null    float64
 4   Rainfall       328 non-null    float64
 5   Evaporation    328 non-null    float64
 6   Sunshine       328 non-null    float64
 7   WindGustDir    328 non-null    object 
 8   WindGustSpeed  328 non-null    float64
 9   WindDir9am     328 non-null    object 
 10  WindDir3pm     328 non-null    object 
 11  WindSpeed9am   328 non-null    float64
 12  WindSpeed3pm   328 non-null    int64  
 13  Humidity9am    328 non-null    int64  
 14  Humidity3pm    328 non-null    int64  
 15  Pressure9am    328 non-null    float64
 16  Pressure3pm    328 non-null    float64
 17  Cloud9am       328 non-null    int64  
 18  Cloud3pm       

In [41]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
label_encoders = {}
for column in categorical_cols:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [42]:
df # 모든 컬럼 값이 숫자로 변경되었는지 확인

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,11/1/2007,0,8.0,24.3,0.0,3.4,6.3,7,30.0,12,...,29,1019.7,1015.0,7,7,14.4,23.6,0,3.6,1
1,11/2/2007,0,14.0,26.9,3.6,4.4,9.7,1,39.0,0,...,36,1012.4,1008.4,5,3,17.5,25.7,1,3.6,1
2,11/3/2007,0,13.7,23.4,3.6,5.8,3.3,7,85.0,3,...,69,1009.5,1007.2,8,7,15.4,20.2,1,39.8,1
3,11/4/2007,0,13.3,15.5,39.8,7.2,9.1,7,54.0,14,...,56,1005.5,1007.0,2,7,13.5,14.1,1,2.8,1
4,11/5/2007,0,7.6,16.1,2.8,5.6,10.6,10,50.0,10,...,49,1018.3,1018.5,7,7,11.1,15.4,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,10/27/2008,0,9.0,30.7,0.0,7.6,12.1,6,76.0,10,...,15,1016.1,1010.8,1,3,20.4,30.0,0,0.0,0
362,10/28/2008,0,7.1,28.4,0.0,11.6,12.7,3,48.0,6,...,22,1020.0,1016.9,0,1,17.2,28.2,0,0.0,0
363,10/29/2008,0,12.5,19.9,0.0,8.4,5.3,2,43.0,1,...,47,1024.0,1022.8,3,2,14.5,18.3,0,0.0,0
364,10/30/2008,0,12.5,26.9,0.0,5.0,7.1,7,46.0,11,...,39,1021.0,1016.2,6,7,15.8,25.9,0,0.0,0


In [43]:
X = df.drop(columns=["Rainfall", "Date"])
y = df["Rainfall"]

In [44]:
display(X.head())

Unnamed: 0,Location,MinTemp,MaxTemp,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,0,8.0,24.3,3.4,6.3,7,30.0,12,7,6.0,...,29,1019.7,1015.0,7,7,14.4,23.6,0,3.6,1
1,0,14.0,26.9,4.4,9.7,1,39.0,0,13,4.0,...,36,1012.4,1008.4,5,3,17.5,25.7,1,3.6,1
2,0,13.7,23.4,5.8,3.3,7,85.0,3,5,6.0,...,69,1009.5,1007.2,8,7,15.4,20.2,1,39.8,1
3,0,13.3,15.5,7.2,9.1,7,54.0,14,13,30.0,...,56,1005.5,1007.0,2,7,13.5,14.1,1,2.8,1
4,0,7.6,16.1,5.6,10.6,10,50.0,10,2,20.0,...,49,1018.3,1018.5,7,7,11.1,15.4,1,0.0,0


In [45]:
display(y.head())

Unnamed: 0,Rainfall
0,0.0
1,3.6
2,3.6
3,39.8
4,2.8


In [46]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape  # 각 데이터의 shape 확인 (전체데이터 수, 컬럼숫자)

((262, 22), (66, 22), (262,), (66,))

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(262, 22) (66, 22) (262,) (66,)


In [48]:
# 회귀 모델 훈련 및 평가 (예: Linear Regression)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression 모델 초기화
model = LinearRegression()

# 모델 훈련
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 모델 평가
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 23.756092227208004
R-squared: 0.33804267641132046


In [49]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [50]:
model_scaled = LinearRegression()
model_scaled.fit(X_train_scaled, y_train)
y_pred_scaled = model_scaled.predict(X_test_scaled)
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)

print(f"Mean Squared Error (Scaled): {mse_scaled}")
print(f"R-squared (Scaled): {r2_scaled}")

Mean Squared Error (Scaled): 23.75609222720798
R-squared (Scaled): 0.33804267641132113


In [51]:
print("Metrics before scaling:")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

print("\nMetrics after scaling:")
print(f"Mean Squared Error (Scaled): {mse_scaled}")
print(f"R-squared (Scaled): {r2_scaled}")

Metrics before scaling:
Mean Squared Error: 23.756092227208004
R-squared: 0.33804267641132046

Metrics after scaling:
Mean Squared Error (Scaled): 23.75609222720798
R-squared (Scaled): 0.33804267641132113


In [52]:
print("\nAnalysis of performance change:")
if abs(mse - mse_scaled) < 1e-6 and abs(r2 - r2_scaled) < 1e-6:
    print("스케일링 전과 후 mse와 r-squared가 매우 비슷")
    print("선형회귀에서, 특성 스케일링은 성능에 크게 영향 안미침. ")
else:
    print("차이남.")


Analysis of performance change:
스케일링 전과 후 mse와 r-squared가 매우 비슷
선형회귀에서, 특성 스케일링은 성능에 크게 영향 안미침. 
