In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Load the dataset
df = pd.read_csv('weatherAUS.csv')

# Explore the data
print(df.head())

# Data Preprocessing

# A row without a label can be neither trained on nor scored against, so drop
# it instead of inventing a target value through imputation.
df = df.dropna(subset=['RainTomorrow'])

# --- Convert 'Date' column to numerical features ---
# Parse the raw date strings *before* any encoding happens: label-encoding the
# column first and then calling pd.to_datetime on the integer codes would
# interpret them as nanoseconds since the epoch (every row -> 1970-01-01).
dates = pd.to_datetime(df['Date'])

# Define the target variable and features
X = df.drop(columns=['RainTomorrow', 'Date'])
# NOTE(review): in the public weatherAUS dataset RISK_MM is documented as the
# next-day rainfall amount from which RainTomorrow is derived, i.e. a direct
# target leak. It is excluded here; confirm this matches the file in use.
# errors='ignore' keeps the script working on dataset versions without it.
X = X.drop(columns=['RISK_MM'], errors='ignore')
X = X.assign(Year=dates.dt.year, Month=dates.dt.month, Day=dates.dt.day)

# LabelEncoder is meant for targets: the class labels become integer codes and
# label_encoder.inverse_transform can map predictions back afterwards.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(df['RainTomorrow'].astype(str)),
    index=df.index,
    name='RainTomorrow',
)

# Split the data into training and testing sets
# stratify=y keeps the class ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handling missing values and categorical columns
# Numeric and categorical columns are treated separately so numeric features
# keep their real values. (Running one SimpleImputer over the mixed frame
# yields an all-object result, after which every column would be stringified
# and label-encoded.)
numeric_cols = X.select_dtypes(include='number').columns.tolist()
label_cols = X.select_dtypes(exclude='number').columns.tolist()

preprocessor = ColumnTransformer(transformers=[
    # Median is robust to skewed measurements.
    ('num', SimpleImputer(strategy='median'), numeric_cols),
    ('cat', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # One-hot avoids imposing an arbitrary order on nominal categories;
        # categories unseen during fit are encoded as all zeros.
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]), label_cols),
])

# Build the model
# Wrapping preprocessing and classifier in one Pipeline means the imputers and
# the encoder are fitted on the training split only, so no test-set statistics
# leak into training. The fitted forest itself is available as
# model.named_steps['classifier'].
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity3pm  Pressure9am  \
0           W           44.0          W  ...        22.0       1007.7   
1         WNW           44.0        NNW  ...        25.0       1010.6   
2         WSW           46.0          W  ...        30.0       1007.6   
3          NE           24.0         SE  ...        16.0       1017.6   
4           W           41.0        ENE  ...        33.0       1010.8   

   Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  RISK_MM  \
0       1007.1       8