In [171]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [172]:
class LogisticRegressionfromScratch:
    def __init__(self, learning_rate=0.01,iteration=1000, batch_size = 32):
        self.bias = 0
        self.weight = 0
        self.batch_size = batch_size
        self.learning_rate=learning_rate
        self.iteration=iteration

    def sigmoid(self,z):
        return 1/(1+np.exp(-z))

    def fit(self,X,y):
        y = np.asarray(y)
        m,n = X.shape
        self.weight = np.zeros(n)
        for _ in range(self.iteration):
            idx = np.random.randint(0, X.shape[0], self.batch_size)
            xb = X[idx]
            yb = y[idx]
            z = np.dot(xb,self.weight) + self.bias
            h = self.sigmoid(z)

            dw = (1/self.batch_size)*(np.dot(xb.T,h-yb)) 
            db = (1/self.batch_size)*(np.sum(h-yb)) 

            self.weight -=  dw * self.learning_rate
            self.bias -=  db * self.learning_rate


    def predict(self,X):
        return (self.sigmoid(np.dot(X,self.weight) + self.bias)>=0.5).astype(int)



In [173]:
df = pd.read_csv("D:/ML Projects/Logistic_Regression_From_Scratch/data/raw/weatherAUS.csv")

df.shape



(142193, 24)

In [174]:
df['MinTemp']

Q1 = df['MinTemp'].quantile(0.25)
Q3 = df['MinTemp'].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df['MinTemp'] < lower) | (df['MinTemp'] > upper)]
print(outliers.empty) 



False


In [175]:
##since Min temp has outliers we'll use median to fill empty values

df['MinTemp'] = df['MinTemp'].fillna(df['MinTemp'].median())
df['MinTemp'].isna().sum()

np.int64(0)

Do the same for max temp and rain today

In [176]:
col = ['MaxTemp']

Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df[col] < lower) | (df[col] > upper)]
print(outliers.empty) 


False


In [177]:
##since max temp has outliers we'll use median to fill empty values

df[col] = df[col].fillna(df[col].median())
df[col].isna().sum()

MaxTemp    0
dtype: int64

In [178]:
mode_val = df['RainToday'].mode(dropna=True)[0]
df['RainToday'] = df['RainToday'].fillna(mode_val)
df.isna().sum()

Date                 0
Location             0
MinTemp              0
MaxTemp              0
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustDir       9330
WindGustSpeed     9270
WindDir9am       10013
WindDir3pm        3778
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53657
Cloud3pm         57094
Temp9am            904
Temp3pm           2726
RainToday            0
RISK_MM              0
RainTomorrow         0
dtype: int64

Do for Humidity3pm, Pressure3pm, Temp3pm, Cloud3pm

In [179]:
col = ['Humidity3pm']

Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df[col] < lower) | (df[col] > upper)]
print(outliers.empty) 

False


In [180]:
##since Humidity3pm has outliers we'll use median to fill empty values

df[col] = df[col].fillna(df[col].median())
df[col].isna().sum()

Humidity3pm    0
dtype: int64

Pressure3pm

In [181]:
col = ['Pressure3pm']

Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df[col] < lower) | (df[col] > upper)]
print(outliers.empty) 

False


In [182]:
##since Pressure3pm has outliers we'll use median to fill empty values

df[col] = df[col].fillna(df[col].median())
df[col].isna().sum()

Pressure3pm    0
dtype: int64

Temp3pm

In [183]:
col = ['Temp3pm']

Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df[col] < lower) | (df[col] > upper)]
print(outliers.empty) 

False


In [184]:
##since Temp3pm has outliers we'll use median to fill empty values

df[col] = df[col].fillna(df[col].median())
df[col].isna().sum()

Temp3pm    0
dtype: int64

Cloud3pm

In [185]:
col = ['Cloud3pm']

Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df[col] < lower) | (df[col] > upper)]
print(outliers.empty) 

False


In [186]:
##since Cloud3pm has outliers we'll use median to fill empty values

df[col] = df[col].fillna(df[col].median())
df[col].isna().sum()

Cloud3pm    0
dtype: int64

In [187]:
df.isna().sum()

Date                 0
Location             0
MinTemp              0
MaxTemp              0
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustDir       9330
WindGustSpeed     9270
WindDir9am       10013
WindDir3pm        3778
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm          0
Pressure9am      14014
Pressure3pm          0
Cloud9am         53657
Cloud3pm             0
Temp9am            904
Temp3pm              0
RainToday            0
RISK_MM              0
RainTomorrow         0
dtype: int64

Use MinTemp, MaxTemp, Humidity3pm, Pressure3pm, Cloud3pm, Temp3pm, RainToday

Convert to numeric

In [188]:
col =['MinTemp', 'MaxTemp', 'Humidity3pm', 'Pressure3pm', 'Cloud3pm', 'Temp3pm', 'RainToday']

df[col] = df[col].apply(pd.to_numeric, errors='coerce')

In [189]:
df['RainToday'] = df['RainToday'].map({'No': 0, 'Yes': 1})
df['RainTomorrow'] = df['RainTomorrow'].map({'No': 0, 'Yes': 1})

In [190]:
col =['MinTemp', 'MaxTemp', 'Humidity3pm', 'Pressure3pm', 'Cloud3pm', 'Temp3pm', 'RainToday']
X=df[col]
y = df['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [191]:
model = LogisticRegressionfromScratch(learning_rate=0.01,iteration=1000)
model.fit(X_train,y_train)

predictions = model.predict(X_test)
accuracy = np.mean(predictions == y_test.to_numpy())
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.78
