In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


In [59]:
df= pd.read_csv(r"C:\Users\Dell\Downloads\weatherAUS.csv")
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8420,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
8421,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
8422,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
8423,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [60]:
df.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

In [61]:
column_names = df.columns
for column in column_names:
    print(column + ' - ' + str(df[column].isnull().sum()))


Date - 0
Location - 0
MinTemp - 75
MaxTemp - 60
Rainfall - 240
Evaporation - 3512
Sunshine - 3994
WindGustDir - 991
WindGustSpeed - 991
WindDir9am - 829
WindDir3pm - 308
WindSpeed9am - 76
WindSpeed3pm - 107
Humidity9am - 59
Humidity3pm - 102
Pressure9am - 1309
Pressure3pm - 1312
Cloud9am - 2421
Cloud3pm - 2455
Temp9am - 56
Temp3pm - 96
RainToday - 240
RainTomorrow - 239


In [62]:
label_encoder = LabelEncoder()
df['Raintoday'] = label_encoder.fit_transform(df['RainToday'])
df['WindGustDir'] = label_encoder.fit_transform(df['WindGustDir'])
df['WindGustSpeed'] = label_encoder.fit_transform(df['WindGustSpeed'])
df['Location'] = label_encoder.fit_transform(df['Location'])
df['WindDir3pm'] = label_encoder.fit_transform(df['WindDir3pm'])
df['WindDir9am'] = label_encoder.fit_transform(df['WindDir9am'])

In [63]:
columns_to_clean = ['Location','MinTemp','MaxTemp','Rainfall','Evaporation','Sunshine','WindGustDir','WindGustSpeed','WindDir9am','WindDir3pm','WindSpeed9am','WindSpeed3pm','Humidity9am','Humidity3pm','Pressure9am','Pressure3pm','Cloud9am','Cloud3pm','Temp9am','Temp3pm','RainToday','RainTomorrow',]
df = df.dropna(subset=columns_to_clean)

In [64]:
df.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
Raintoday        0
dtype: int64

In [71]:
X = df.drop(['Location', 'RainToday', 'RainTomorrow', 'Date'], axis=1)  # Features
y_class = df['RainTomorrow']  # Target variable for classification
y_reg = df['Rainfall'] 

In [72]:
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.2, random_state=42)


In [73]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)

In [74]:
clf = RandomForestClassifier(random_state=42)

In [75]:
X_train_class

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,Raintoday
1509,14.3,22.1,2.6,1.8,12.4,3,9,11,0,11.0,15.0,52.0,42.0,1013.0,1011.9,1.0,1.0,19.6,21.8,1
7290,13.7,30.3,0.0,8.0,13.7,11,19,11,12,11.0,22.0,50.0,35.0,1013.1,1012.0,0.0,0.0,24.9,29.1,0
5240,10.7,17.6,7.6,1.6,2.1,10,22,9,8,30.0,31.0,70.0,57.0,1016.1,1017.1,6.0,7.0,13.5,16.4,1
8155,24.1,32.2,2.8,5.4,11.2,2,15,9,2,17.0,13.0,67.0,39.0,1010.8,1007.4,4.0,2.0,26.8,31.8,1
907,19.8,27.1,24.4,8.6,9.0,2,15,12,2,17.0,22.0,83.0,60.0,1018.9,1017.9,6.0,6.0,22.0,26.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4916,8.8,22.7,0.0,2.0,12.6,8,17,12,8,9.0,17.0,55.0,30.0,1025.1,1021.0,1.0,2.0,13.8,22.6,0
5233,7.5,25.1,0.0,4.4,12.3,3,32,3,3,15.0,33.0,66.0,22.0,1031.2,1027.7,0.0,0.0,12.9,24.6,0
4620,17.4,23.0,0.2,7.6,5.7,8,22,8,10,15.0,30.0,52.0,39.0,1015.6,1016.9,7.0,6.0,18.3,21.5,0
7973,12.8,26.8,0.0,9.0,12.6,12,20,8,12,15.0,30.0,51.0,36.0,1009.1,1010.0,4.0,1.0,22.3,24.5,0


In [76]:
clf.fit(X_train_class, y_train_class)

In [77]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_class, y_train_class)
y_pred_class = clf.predict(X_test_class)


In [78]:
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train_reg, y_train_reg)
y_pred_reg = reg.predict(X_test_reg)

In [79]:
accuracy = accuracy_score(y_test_class, y_pred_class)
classification_report_result = classification_report(y_test_class, y_pred_class)

# Evaluate the regression model
mse = mean_squared_error(y_test_reg, y_pred_reg)

print("Classification Model:")
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_result)

print("\nRegression Model:")
print(f"Mean Squared Error: {mse}")

Classification Model:
Accuracy: 0.8954081632653061
Classification Report:
               precision    recall  f1-score   support

          No       0.89      0.98      0.93       589
         Yes       0.90      0.65      0.76       195

    accuracy                           0.90       784
   macro avg       0.90      0.81      0.84       784
weighted avg       0.90      0.90      0.89       784


Regression Model:
Mean Squared Error: 0.171548645408163
