**Title: Rain Prediction Model**

**Description**: This model forecasts rainfall using a variety of input variables such as temperature, humidity, wind speed, cloud cover, and atmospheric pressure.

**Steps involved in Model:**

**Step 1:** Importing the required libraries and dataset

**Step 2:** Data cleaning - checking for missing values in any of the input or output parameters

**Step 3:** Selecting the features using different methodologies - checking for impactful and non-impactful features, and dropping the least impactful

**Step 4:** Training the model

**Step 5:** Testing the model

**Step 6:** Finding accuracy and confusion matrix of different models



---



**Importing Required Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

**Importing Dataset**

In [None]:
# Location of the weather-forecast CSV file on GitHub.
url = "https://raw.githubusercontent.com/shravyapendyala/CCE_Assignment_1/refs/heads/main/weather_forecast_data.csv"

# Read the CSV into a DataFrame; the final expression previews the first 10 rows.
dataset = pd.read_csv(url)
dataset.head(10)


**Data Cleaning**

In [None]:
# Report how many missing entries each column of the dataset contains.
print("\nMissing Values in Each Column:")
missing_per_column = dataset.isnull().sum()
print(missing_per_column)

# Encode the 'Rain' column as integers. The fitted encoder is kept in
# `label_encoder` so later cells can map predictions back to the labels.
label_encoder = preprocessing.LabelEncoder()
rain_codes = label_encoder.fit_transform(dataset['Rain'])
dataset['Rain'] = rain_codes
dataset.head(5)

**Feature Selection using Correlation matrix**

In [None]:
# Pairwise correlation between the columns of the dataset.
correl_mat = dataset.corr()

# Render the matrix as a heat map, writing each coefficient inside its cell.
sb.heatmap(data=correl_mat, annot=True)

**Feature Selection using Extra Tree Classifier**

In [None]:
# Feature matrix: the first five columns; target: the last column.
x = dataset.iloc[:, 0:5]
y = dataset.iloc[:, -1]

# Extra-trees models are randomized; a fixed seed keeps the importance
# ranking (and therefore the feature-selection decision) reproducible.
model = ExtraTreesClassifier(random_state=0)
model.fit(x, y)

# Index the importances by column name so each value is tied to its feature.
imp = pd.Series(model.feature_importances_, index=x.columns)
feature_importance = model.feature_importances_

# Bar chart with one bar per feature.
plt.bar(imp.index, imp.values)
plt.title('Feature Importance')
plt.show()

**Feature Selection using Random Forest**

In [None]:
# Feature matrix: the first five columns; target: the last column.
x = dataset.iloc[:, 0:5]
y = dataset.iloc[:, -1]

# Seed the forest so the importance shares shown below do not change from
# run to run (the other Random Forest in this notebook uses random_state=0).
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(x, y)
important_features = rf_model.feature_importances_

# Pie chart of each feature's share of the total importance.
plt.figure(figsize=(6, 6))
plt.pie(important_features, labels=x.columns, autopct='%1.1f%%', startangle=90)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

**Dimensionality reduction by removing the Wind_Speed column**

In [None]:
# Build a reduced copy of the dataset without the Wind_Speed column;
# `dataset` itself is left unchanged.
new_dataset = dataset.drop(columns=['Wind_Speed'])

**Model Training & Testing**

**Splitting data into Training and Testing sets**

In [None]:
from sklearn.model_selection import train_test_split

# Features are the first four columns of the reduced dataset; the target is
# its last column.
X = new_dataset.iloc[:, 0:4]
Y = new_dataset.iloc[:, -1]

# Hold out 30 % of the rows for testing and train on the remaining 70 %.
# The fixed random_state keeps the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

# Decode the held-out targets back to their original labels once, here, so
# each model cell can compare them directly with its decoded predictions.
y_test = label_encoder.inverse_transform(y_test)

**Training & Testing the model using Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# The top-of-notebook import binds seaborn to `sb`; the alias `sns` used below
# is otherwise only created in a later cell, so import it here to keep this
# cell runnable when the notebook is executed top to bottom.
import seaborn as sns

# Initialize the Random Forest classifier
classifier1 = RandomForestClassifier(n_estimators=100, random_state=0)

# Train the model on the training data
classifier1.fit(x_train, y_train)

# Make predictions on the test set
y_pred1 = classifier1.predict(x_test)

# Undo the label encoding so predictions carry the original Rain labels
# (y_test was already decoded in the train/test split cell).
y_pred1 = label_encoder.inverse_transform(y_pred1)

# Reshape predictions and actual labels into column vectors and place them
# side by side (column 0 = predicted, column 1 = actual) for inspection.
y_pred1 = y_pred1.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_dataframe1 = np.concatenate((y_pred1, y_test), axis=1)

# Accuracy of the model (scikit-learn's argument order is y_true, y_pred)
accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy of the RF model: {accuracy:.4f}")

# Generate the confusion matrix (rows = true labels, columns = predictions)
cm = confusion_matrix(y_test, y_pred1)
plt.figure(figsize=(6, 4))
# NOTE(review): the tick labels assume the encoder's sorted classes are
# "no rain" followed by "rain" - confirm against label_encoder.classes_.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Rain", "Rain"], yticklabels=["No Rain", "Rain"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for Random Forest Classifier Model")
plt.show()

**Training & Testing the model using Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
import seaborn as sns

# Initialize the Logistic Regression classifier
classifier2 = LogisticRegression(random_state=0)

# Train the model on the training data
classifier2.fit(x_train, y_train)

# Make predictions on the test set
y_pred2 = classifier2.predict(x_test)

# Undo the label encoding so predictions carry the original Rain labels
# (y_test was already decoded in the train/test split cell).
y_pred2 = label_encoder.inverse_transform(y_pred2)

# Reshape predictions and actual labels into column vectors and place them
# side by side (column 0 = predicted, column 1 = actual) for inspection.
y_pred2 = y_pred2.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_dataframe2 = np.concatenate((y_pred2, y_test), axis=1)

# Accuracy of the model (scikit-learn's argument order is y_true, y_pred,
# matching the SVM and XGBoost cells)
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy of the LR model: {accuracy:.4f}")

# Generate the confusion matrix (rows = true labels, columns = predictions)
cm = confusion_matrix(y_test, y_pred2)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Rain", "Rain"], yticklabels=["No Rain", "Rain"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for Logistic Regression Model")
plt.show()

**Training & Testing the Model using SVM**

In [None]:
# Support Vector Classifier, seeded like the other models in this notebook.
classifier3 = SVC(random_state=0)

# Fit on the training split, then predict the held-out split.
classifier3.fit(x_train, y_train)
y_pred3 = classifier3.predict(x_test)

# Map the integer predictions back to the original Rain labels
# (y_test was already decoded in the train/test split cell).
y_pred3 = label_encoder.inverse_transform(y_pred3)

# Bring predictions and actual labels into column-vector form.
y_pred3 = y_pred3.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy of the SVM model: {accuracy:.4f}")

# Confusion matrix (rows = true labels, columns = predictions) as a heat map.
cm = confusion_matrix(y_test, y_pred3)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["No Rain", "Rain"],
    yticklabels=["No Rain", "Rain"],
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for SVM Model")
plt.show()

**Training & Testing the Model using XGBoost**

In [None]:
import xgboost as xgb

# Gradient-boosted tree classifier, seeded like the other models.
classifier4 = xgb.XGBClassifier(random_state=0)

# Fit on the training split, then predict the held-out split.
classifier4.fit(x_train, y_train)
y_pred4 = classifier4.predict(x_test)

# Map the integer predictions back to the original Rain labels
# (y_test was already decoded in the train/test split cell).
y_pred4 = label_encoder.inverse_transform(y_pred4)

# Bring predictions and actual labels into column-vector form.
y_pred4 = y_pred4.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, y_pred4)
print(f"Accuracy of the XGBoost model: {accuracy:.4f}")

# Confusion matrix (rows = true labels, columns = predictions).
cm = confusion_matrix(y_test, y_pred4)

# Draw the confusion matrix as an annotated heat map.
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["No Rain", "Rain"],
    yticklabels=["No Rain", "Rain"],
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for XGBoost Model")
plt.show()




------------------------







**Complete Code**

In [None]:
# End-to-end pipeline: load the data, encode the target, drop Wind_Speed,
# train a Random Forest and report its accuracy and confusion matrix.
# The imports are repeated here so this cell runs on its own.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Dataset Location
url = "https://raw.githubusercontent.com/shravyapendyala/CCE_Assignment_1/refs/heads/main/weather_forecast_data.csv"

# Reading Dataset
dataset = pd.read_csv(url)

# Dataset Overview
print("\n Data Overview\n")
print(dataset.head(5))

# Check for Missing Values of Parameters in dataset
print("\nMissing Values in Each Column:\n")
print(dataset.isnull().sum())

# Label Encoding on Rain Column (converts the Rain labels to integers)
label_encoder = preprocessing.LabelEncoder()
dataset['Rain'] = label_encoder.fit_transform(dataset['Rain'])

print("\nConverted dataset \n")
print(dataset.head(5))

# Drop the Wind_Speed feature
new_dataset = dataset.drop(columns=['Wind_Speed'])

print("\n Improved dataset \n" )
print(new_dataset.head(5))

# Features are the first four columns of the reduced dataset; the target is
# its last column.
X = new_dataset.iloc[:, 0:4]
Y = new_dataset.iloc[:, -1]

# Splitting data into testing and training, testing 30 % & training 70 %
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Initialize the Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=0)

# Train the model on the training data
classifier.fit(x_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(x_test)

# Undo the label encoding so both arrays carry the original Rain labels
y_pred = label_encoder.inverse_transform(y_pred)
y_test = label_encoder.inverse_transform(y_test)

# Reshape this cell's own predictions (previously `y_pred1`, a variable from
# an earlier cell, was used here) and the actual labels into column vectors,
# then place them side by side (column 0 = predicted, column 1 = actual).
y_pred = y_pred.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_dataframe1 = np.concatenate((y_pred, y_test), axis=1)

# Accuracy of the model (scikit-learn's argument order is y_true, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Accuracy of the RF model: {accuracy:.4f}\n")

# Generate the confusion matrix (rows = true labels, columns = predictions)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Rain", "Rain"], yticklabels=["No Rain", "Rain"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for Random Forest Classifier Model")
plt.show()