# Importing Libraries


In [13]:
!pip install -q pgmpy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import warnings
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.exceptions import ConvergenceWarning
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination


warnings.filterwarnings("ignore")

# Data Preprocessing

**Importing the dataset**

In [14]:
df = pd.read_csv('weatherAUS.csv')
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [15]:
df.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

**Printing Missing Values**

In [16]:
missing=df.isna().sum()
print(missing)

Date                 0
Location             0
MinTemp           1317
MaxTemp           1124
Rainfall          2929
Evaporation      51935
Sunshine         57088
WindGustDir       9862
WindGustSpeed     9836
WindDir9am        9155
WindDir3pm        3927
WindSpeed9am      1618
WindSpeed3pm      2909
Humidity9am       2430
Humidity3pm       3533
Pressure9am      10867
Pressure3pm      10832
Cloud9am         44795
Cloud3pm         47632
Temp9am           1640
Temp3pm           2803
RainToday         2930
RainTomorrow      2932
dtype: int64


Dropping 'Date' as it does not contribute much for classification

In [17]:
df=df.drop(columns='Date',axis=1)

**Splitting into Numerical and Categorical Features**

In [18]:
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
numerical_features = df.select_dtypes(exclude=['object']).columns.tolist()
print("Numerical: ",numerical_features)
print("Catgorical: ",categorical_features)

Numerical:  ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
Catgorical:  ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


Dropping the rows having null values in RainTomorrow column since we can't fill the null values for a column which we are going to predict


In [19]:
df.dropna(subset=['RainTomorrow'], inplace=True)

**Filling the null values of Numercal Columns with median**

In [20]:
for column in numerical_features:
  df[column].fillna(df[column].median(), inplace=True)

**Filling the null values of Categorical Columns with most frequent value**

In [21]:
for column in categorical_features:
  df[column].fillna(df[column].mode()[0],inplace=True)

In [22]:
df.isna().sum()

Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

**Encoding Categorical Columns**

Using label encoder for RainToday and RainTomorrow, and using one hot encoding for other categorical columns

In [23]:
label_encoder = LabelEncoder()
for i in categorical_features:
  df[i] = label_encoder.fit_transform(df[i])
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2,13.4,22.9,0.6,4.4,8.2,13,44.0,13,14,...,71.0,22.0,1007.7,1007.1,8.0,5.0,16.9,21.8,0,0
1,2,7.4,25.1,0.0,4.4,8.2,14,44.0,6,15,...,44.0,25.0,1010.6,1007.8,5.0,5.0,17.2,24.3,0,0
2,2,12.9,25.7,0.0,4.4,8.2,15,46.0,13,15,...,38.0,30.0,1007.6,1008.7,5.0,2.0,21.0,23.2,0,0
3,2,9.2,28.0,0.0,4.4,8.2,4,24.0,9,0,...,45.0,16.0,1017.6,1012.8,5.0,5.0,18.1,26.5,0,0
4,2,17.5,32.3,1.0,4.4,8.2,13,41.0,1,7,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0


# Splitting into Train and Test

In [24]:
X = df.drop(columns = ['RainTomorrow','Location'])
y = df['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 8)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the training set, transform the testing set
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Using standard scaled data for some of the models

# Linear Models

**Linear Models**

Linear models assume a linear relationship between input features and the target variable. Examples include Linear Regression and Logistic Regression. They are interpretable, computationally efficient, and work well when relationships are approximately linear.

**Logistic Regression vs Naive Bayes**

Logistic Regression is a linear model that predicts probabilities based on input features, suitable for linear relationships. Naive Bayes is a probabilistic model assuming feature independence, often used for text classification. Logistic Regression offers interpretability, while Naive Bayes provides computational efficiency, particularly for high-dimensional data like text.

## Logistic Regression

**Logistic Regression before and after tuning hyperparameters**

In [25]:
# Logistic Regression before tuning hyperparameters
lr_before = LogisticRegression(max_iter = 1500, solver = 'liblinear')
lr_before.fit(X_train, y_train)
y_pred_before = lr_before.predict(X_test)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", accuracy_before)

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Logistic Regression after tuning hyperparameters
lr_after = GridSearchCV(LogisticRegression(max_iter = 2000, solver = 'liblinear'), param_grid, cv=5)
lr_after.fit(X_train, y_train)
y_pred_after = lr_after.predict(X_test)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", accuracy_after)
print("Best hyperparameters:", lr_after.best_params_)

Accuracy before hyperparameter tuning: 0.8364508558807289
Accuracy after hyperparameter tuning: 0.8370375483158476
Best hyperparameters: {'C': 0.1}


## Naive Bayes

**Naive Bayes before and after tuning hyperparameters**

In [26]:
# Naive Bayes before tuning hyperparameters
gnb_before = GaussianNB()
gnb_before.fit(X_train, y_train)
y_pred_before = gnb_before.predict(X_test)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", accuracy_before)

param_grid = {'var_smoothing': np.logspace(0,-9, num=100)}

# Naive Bayes after tuning hyperparameters
gnb_after = GridSearchCV(GaussianNB(), param_grid, cv=5)
gnb_after.fit(X_train, y_train)
y_pred_after = gnb_after.predict(X_test)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", accuracy_after)
print("Best hyperparameters:", gnb_after.best_params_)

Accuracy before hyperparameter tuning: 0.8001449475427941
Accuracy after hyperparameter tuning: 0.8246134732192159
Best hyperparameters: {'var_smoothing': 0.12328467394420659}


## Inference

**Inference**

Hyperparameter tuning improved accuracy slightly for Logistic Regression (from 83.64% to 83.70%) with C=0.1. For Naive Bayes, tuning notably enhanced accuracy (from 80.01% to 82.46%) with var_smoothing=0.123. Logistic Regression performs slightly over Naive Bayes

# Non-Linear Models

**Non-Linear Models**


Non-linear models capture complex relationships between variables, unlike linear models. Examples include Decision Trees, Neural Networks, and Support Vector Machines with non-linear kernels. They are suitable for data with intricate patterns and interactions.

**Decision Trees vs Neural Networks**

Decision Trees are interpretable, non-linear models suitable for tabular data. Neural Networks are complex, non-linear models capable of learning intricate patterns but less interpretable. Decision Trees excel with smaller datasets and interpretable rules, while Neural Networks perform well with large datasets and complex relationships but require more computational resources.


## Decision Trees

**Decision Tree before and after tuning hyperparameters**

In [27]:
# Decision Tree before tuning hyperparameters
dt_before = DecisionTreeClassifier(random_state=42)
dt_before.fit(X_train, y_train)
y_pred_before = dt_before.predict(X_test)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", accuracy_before)

param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Decision Tree after tuning hyperparameters
dt_after = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
dt_after.fit(X_train, y_train)
y_pred_after = dt_after.predict(X_test)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", accuracy_after)
print("Best hyperparameters:", dt_after.best_params_)

Accuracy before hyperparameter tuning: 0.7768152954168968
Accuracy after hyperparameter tuning: 0.8224737713970183
Best hyperparameters: {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}


## Neural Netwoks

**Neural Network before and tuning hyperparameters**

In [28]:
# Neural Network before tuning hyperparameters
mlp_before = MLPClassifier(random_state=42)
mlp_before.fit(X_train_scaled, y_train)
y_pred_before = mlp_before.predict(X_test_scaled)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", accuracy_before)

param_grid =  {
    'hidden_layer_sizes': [(50,),(100,)]
}

# Neural Network after tuning hyperparameters
mlp_after = GridSearchCV(MLPClassifier(random_state=42), param_grid,n_jobs=-1, cv=5)
mlp_after.fit(X_train_scaled, y_train)
y_pred_after = mlp_after.predict(X_test_scaled)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", accuracy_after)
print("Best hyperparameters:", mlp_after.best_params_)

Accuracy before hyperparameter tuning: 0.8473909442297074
Accuracy after hyperparameter tuning: 0.8484262838210933
Best hyperparameters: {'hidden_layer_sizes': (50,)}


## Inference

**Inference**

Hyperparameter tuning notably improved Decision Tree accuracy (from 77.68% to 82.25%) with max_depth=10, max_features='auto', min_samples_leaf=1, min_samples_split=10. For MLP, accuracy slightly increased (from 84.74% to 84.84%) with hidden_layer_sizes=(50,). Decision Tree model has lower accuracy when compared to Neural Network

# Hybrid Models

**Hybrid Models**

Hybrid models combine elements of different types of models, such as combining neural networks with decision trees or blending linear and non-linear models. They aim to leverage the strengths of each component to improve overall performance and interpretability.

**Support Vector Machines vs Bayesian Networks**

Support Vector Machines (SVMs) find optimal hyperplanes for classification, suitable for high-dimensional data. Bayesian Networks model probabilistic dependencies between variables using graph structures, offering insights into causality. SVMs are powerful for classification, while Bayesian Networks excel in probabilistic reasoning and causal inference.

## Support Vector Machines

**Support Vector Machine before and after tuning hyperparameters**

In [30]:
# Support Vector Machine before tuning hyperparameters
svm_before = SVC(random_state=42)
svm_before.fit(X_train_scaled, y_train)
y_pred_before = svm_before.predict(X_test_scaled)
accuracy_before = accuracy_score(y_test, y_pred_before)
print("Accuracy before hyperparameter tuning:", accuracy_before)

param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001]}

# Support Vector Machine after tuning hyperparameters
svm_after = GridSearchCV(SVC(), param_grid, cv=5)
svm_after.fit(X_train_scaled, y_train)
y_pred_after = svm_after.predict(X_test_scaled)
accuracy_after = accuracy_score(y_test, y_pred_after)
print("Accuracy after hyperparameter tuning:", accuracy_after)
print("Best hyperparameters:", svm_after.best_params_)

Accuracy before hyperparameter tuning: 0.8568
Accuracy after hyperparameter tuning: 0.8566
Best hyperparameters: {'C': 100, 'gamma': 0.01}


## Bayesian Network

In [31]:
X = df[['RainToday', 'MaxTemp', 'Rainfall','RainTomorrow']]
print("Attributes passed to the model",X.columns.tolist())

X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

model = BayesianNetwork([
    ('RainToday', 'RainTomorrow'),
    ('MaxTemp', 'RainTomorrow'),
    ('Rainfall', 'RainTomorrow')
])

model.fit(X_train, estimator=BayesianEstimator, n_jobs=-1)

Rain_infer = VariableElimination(model)

print('\n 1. Probability of RainTomorrow given RainToday= 1')
q1=Rain_infer.query(variables=['RainTomorrow'],evidence={'RainToday':1})
print(q1)


Attributes passed to the model ['RainToday', 'MaxTemp', 'Rainfall', 'RainTomorrow']

 1. Probability of RainTomorrow given RainToday= 1
+-----------------+---------------------+
| RainTomorrow    |   phi(RainTomorrow) |
| RainTomorrow(0) |              0.5124 |
+-----------------+---------------------+
| RainTomorrow(1) |              0.4876 |
+-----------------+---------------------+


## Inference

**Inference**

Hyperparameter tuning marginally affected the Support Vector Machine's accuracy, with the best parameters being C=100 and gamma=0.01, maintaining accuracy around 85.68% before and 85.66% after tuning.


The given Bayesian Network model predicts the probability of rainfall tomorrow based on today's rain status, maximum temperature, and rainfall. For instance, if it rains today (RainToday=1), the probability of rain tomorrow is approximately 48.76%, otherwise 51.24%.