In [1]:
# 1. Import Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [2]:
# 2. Load the Dataset
# Ensure the filename matches exactly
data = pd.read_csv('Weather.csv - Dataset.csv')

In [3]:
# 3. Initial Analysis
print("Data Shape:", data.shape)
print(data.info())

Data Shape: (145460, 24)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Clou

In [4]:
# 4. Handling Missing Values (Based on your project logic)
# Drop columns with too many missing values as shown in your images
data.drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], axis=1, inplace=True)

In [5]:
# Separate numeric and categorical for cleaning
numeric_cols = data.select_dtypes(include=[np.number]).columns
categorical_cols = data.select_dtypes(include=['object']).columns

In [7]:
# Fill numeric missing values with Mean - The New Way
for col in numeric_cols:
    data[col] = data[col].fillna(data[col].mean())

In [8]:
# Fill categorical missing values with Most Frequent
imp_mode = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imp_mode.fit_transform(data[categorical_cols])

In [9]:
print("Missing values after cleaning:")
print(data.isnull().sum())

Missing values after cleaning:
Date                  0
Location              0
MinTemp               0
MaxTemp               0
Rainfall              0
WindGustDir           0
WindGustSpeed         0
WindDir9am            0
WindDir3pm            0
WindSpeed9am          0
WindSpeed3pm          0
Humidity9am           0
Humidity3pm           0
Pressure9am           0
Pressure3pm           0
Temp9am               0
Temp3pm               0
RainToday             0
RainTomorrow          0
@dropdown        145460
dtype: int64


In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 1. Feature Engineering: Convert 'Date' to Year, Month, and Day
# This is crucial because your Flask app expects these 3 separate inputs
data['Date'] = pd.to_datetime(data['Date'])
data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.day

# Now we drop the original Date column and the '@dropdown' column if it exists
data.drop(['Date'], axis=1, inplace=True)
if '@dropdown' in data.columns:
    data.drop(['@dropdown'], axis=1, inplace=True)

# 2. Label Encoding: Convert words to numbers (e.g., 'Yes' -> 1, 'No' -> 0)
le = LabelEncoder()

# List of columns that contain text/categories
categorical_cols = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']

for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# 3. Define the features (X) and target (y)
# We must arrange columns in the EXACT order the Flask app expects
expected_columns = [
    'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday',
    'WindGustDir', 'WindDir9am', 'WindDir3pm', 'year', 'month', 'day'
]

X = data[expected_columns]
y = data['RainTomorrow']

print("Encoding Complete!")
print("Input Features (X) Shape:", X.shape)
print(X.head()) # Look at the numbers!

Encoding Complete!
Input Features (X) Shape: (145460, 20)
   Location  MinTemp  MaxTemp  Rainfall  WindGustSpeed  WindSpeed9am  \
0        14     13.4     22.9       0.6           44.0          20.0   
1        14      7.4     25.1       0.0           44.0           4.0   
2        14     12.9     25.7       0.0           46.0          19.0   
3        14      9.2     28.0       0.0           24.0          11.0   
4        14     17.5     32.3       1.0           41.0           7.0   

   WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  Temp9am  \
0          24.0         71.0         22.0       1007.7       1007.1     16.9   
1          22.0         44.0         25.0       1010.6       1007.8     17.2   
2          26.0         38.0         30.0       1007.6       1008.7     21.0   
3           9.0         45.0         16.0       1017.6       1012.8     18.1   
4          20.0         82.0         33.0       1010.8       1006.0     17.8   

   Temp3pm  RainToday  WindG

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

# 1. Feature Scaling
# This normalizes the data so the model treats all features fairly
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

# 2. Splitting the dataset into Train and Test sets
x_train, x_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 3. Training the Model (Random Forest)
print("Training the model... Please wait, this might take a minute.")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

# 4. Checking Accuracy
y_pred = rf_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# 5. SAVE THE FILES (The most important part!)
# These files will appear in your folder. Your app.py needs them.
pickle.dump(rf_model, open('Rainfall.pkl', 'wb'))
pickle.dump(sc, open('scale.pkl', 'wb'))

print("\n--- DONE! ---")
print("Check your folder. You should now see 'Rainfall.pkl' and 'scale.pkl'.")

Training the model... Please wait, this might take a minute.
Model Accuracy: 85.16%

--- DONE! ---
Check your folder. You should now see 'Rainfall.pkl' and 'scale.pkl'.
