## Weather Prediction System 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.model_selection import train_test_split
%matplotlib inline
pd.set_option("display.max_columns", None)



In [3]:
raw_df = pd.read_csv("/Users/prashantsoni/Documents/2025/Machine Learning/logestic-regression/data/weatherAUS.csv")

### A lot of features must be missing from the dataset if the target values are missing, So ideally remove those rows 
When there are any missing values in the target drop those because they wont be making any changes for the model. however if there are columns which directly impact the target, We can remove those too, In our case wheather it rained today will have a direct relation of if it will rain tomorrow?

In [4]:
raw_df.dropna(subset=["RainToday", "RainTomorrow"], inplace=True)

### EDA (Exploratry Data Analysis)

In [5]:
px.histogram(raw_df, x="Location", color="RainToday", title="Rain Tomorrow by Location")


In [6]:
px.scatter(raw_df.sample(2000), x="MinTemp", y="MaxTemp",color="RainToday")

Try and run the code on fraction of data first

In [7]:
use_sample = False
sample_size = 0.1
if use_sample:
    raw_df = raw_df.sample(frac=sample_size).copy()

### Test, Training and Validation set
General rule of thumb is that 60% data is for training, and 20-20 percent is for validation and test

In [8]:
training_validation_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=23)
# 80% of training_validation_df will be used for training and 20% will be used for validation

In [9]:
validation_df, train_df = train_test_split(training_validation_df, test_size=0.25, random_state=23)
# 75% of training_validation_df will be used for training and 25% will be used for validation

In [10]:
# Because we are working with time series data, we need to split the data based on year
year = pd.to_datetime(raw_df["Date"]).dt.year
train_df = raw_df[year < 2015]
validation_df = raw_df[year == 2015]
test_df = raw_df[year > 2015]

### Identifying Input and Target Columns

In [11]:
input_cols = [col for col in raw_df.columns if col not in ["RainTomorrow", "Date"]]
target_col = "RainTomorrow"

In [12]:
train_input = train_df[input_cols].copy()
train_target = train_df[target_col].copy()

validation_input = validation_df[input_cols].copy()
validation_target = validation_df[target_col].copy()

In [13]:
numerical_cols = train_input.select_dtypes(include=["number"]).columns.to_list()
categorical_cols = train_input.select_dtypes(include=["object"]).columns.to_list()

print(f"Numerical Columns: {numerical_cols}")
print(f"Categorical Columns: {categorical_cols}")

Numerical Columns: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
Categorical Columns: ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']


In [14]:
train_input[numerical_cols].describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,97674.0,97801.0,97988.0,61657.0,57942.0,91160.0,97114.0,96919.0,96936.0,96872.0,88876.0,88857.0,63000.0,61966.0,97414.0,97392.0
mean,12.007831,23.022202,2.372935,5.289991,7.609004,40.215873,14.092263,18.764608,68.628745,51.469547,1017.513734,1015.132352,4.302952,4.410677,16.835126,21.540138
std,6.347175,6.984397,8.518819,3.95201,3.788813,13.697967,8.984203,8.872398,19.003097,20.756113,7.07251,6.997072,2.866634,2.69337,6.404586,6.831612
min,-8.5,-4.1,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,979.0,0.0,0.0,-5.9,-5.1
25%,7.5,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1012.8,1010.4,1.0,2.0,12.2,16.6
50%,11.8,22.4,0.0,4.6,8.5,39.0,13.0,19.0,70.0,52.0,1017.5,1015.1,5.0,5.0,16.6,20.9
75%,16.6,27.9,0.8,7.2,10.6,48.0,19.0,24.0,83.0,66.0,1022.3,1019.9,7.0,7.0,21.4,26.2
max,33.9,48.1,371.0,82.4,14.3,135.0,87.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.1


### Imputing missing values

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
imputer.fit(raw_df[numerical_cols])
train_input[numerical_cols] = imputer.transform(train_input[numerical_cols])
validation_input[numerical_cols] = imputer.transform(validation_input[numerical_cols])
test_df[numerical_cols] = imputer.transform(test_df[numerical_cols])


### Scaling values
While dealing with multiple numerical features, Magnitude of one feature might overpower the magnitude of another so we need to ensure all the features follow a similar scalen

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_input[numerical_cols])
train_input[numerical_cols] = scaler.transform(train_input[numerical_cols])
validation_input[numerical_cols] = scaler.transform(validation_input[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])


### One Hot Encoding for categorical columns

In [None]:
from sklearn.preprocessing import OneHotEncoder
raw_df_without_na = raw_df[categorical_cols].fillna("Unknown")
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoder.fit(raw_df_without_na)

train_input[encoded_cols] = encoder.transform(train_input[categorical_cols].fillna("Unknown"))
validation_input[encoded_cols] = encoder.transform(validation_input[categorical_cols].fillna("Unknown"))
test_df[encoded_cols] = encoder.transform(test_df[categorical_cols].fillna("Unknown"))


In [None]:
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

### Exporting inputs and targets

In [39]:
# Fix for ArrowKeyError: Use CSV instead of parquet
pd.DataFrame(train_target).to_csv("train_target.csv", index=False)
pd.DataFrame(validation_target).to_csv("validation_target.csv", index=False)
pd.DataFrame(test_df).to_csv("test_df.csv", index=False)

print("Target data saved successfully as CSV files!")
    

Target data saved successfully as CSV files!


In [None]:
train_input.to_csv("train_inputs.csv", index=False)
test_df.to_csv("test_dfs.csv", index=False)
validation_input.to_csv("validation_inputs.csv", index=False)

print("Data saved successfully as CSV files!")




Data saved successfully as CSV files!


### Train the model

In [55]:
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    model = LogisticRegression(solver="liblinear", tol=0.00001, max_iter=1000)

In [56]:
input_data = train_input[numerical_cols + encoded_cols]
target_data = train_target
model = model.fit(input_data, target_data)

In [57]:
y_pred = model.predict(validation_input[numerical_cols + encoded_cols])

In [58]:
print(classification_report(validation_target, y_pred))

              precision    recall  f1-score   support

          No       0.87      0.96      0.91     13511
         Yes       0.75      0.46      0.57      3578

    accuracy                           0.85     17089
   macro avg       0.81      0.71      0.74     17089
weighted avg       0.84      0.85      0.84     17089



In [None]:
train_preds = model.predict(train_input[numerical_cols + encoded_cols])
validation_preds = model.predict(validation_input[numerical_cols + encoded_cols])
test_preds = model.predict(test_df[numerical_cols + encoded_cols])


print("Train Accuracy: ", accuracy_score(train_target, train_preds))
print("Validation Accuracy: ", accuracy_score(validation_target, validation_preds))
print("Test Accuracy: ", accuracy_score(test_df[target_col], test_preds))


Train Accuracy:  0.8520022859942034
Test Accuracy:  0.8418514196810579
Validation Accuracy:  0.8539996488969512


### Testing model on single input

In [72]:
new_input = {
    "Date": "2021-06-19",
    "Location": "Katherine",
    "MinTemp": 23.2,
    "MaxTemp": 33.2,
    "Rainfall": 10.2,
    "Evaporation": 4.2,
    "Sunshine": np.nan,
    "WindGustDir": "NNW",
    "WindGustSpeed": 52.0,
    "WindDir9am": "NW",
    "WindDir3pm": "NNE",
    "WindSpeed9am": 13.0,
    "WindSpeed3pm": 20.0,
    "Humidity9am": 89.0,
    "Humidity3pm": 58.0,
    "Pressure9am": 1004.8,
    "Pressure3pm": 1001.5,
    "Cloud9am": 8.0,
    "Cloud3pm": 5.0,
    "Temp9am": 25.7,
    "Temp3pm": 13.0,
    "RainToday": "Yes",
}
new_input_df = pd.DataFrame([new_input])

In [78]:
def predict(input_val):
    input_val[numerical_cols] = imputer.transform(input_val[numerical_cols])
    input_val[numerical_cols] = scaler.transform(input_val[numerical_cols])
    input_val[encoded_cols] = encoder.transform(input_val[categorical_cols])
    X_input = input_val[numerical_cols + encoded_cols]
    pred = model.predict(X_input)[0]
    prob = model.predict_proba(X_input)[0][list(model.classes_).index(pred)]
    return pred, prob


In [79]:
print(predict(new_input_df))

('Yes', np.float64(1.0))
