In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Load the dataset

In [None]:
# Read the source CSV; 'Standardized_Date' is parsed as a datetime column.
csv_path = 'final_data_in_ML.csv'
data = pd.read_csv(csv_path, parse_dates=['Standardized_Date'])

In [None]:
# Keep only the timestamp, the four plant measurements and the remarks label.
selected_columns = [
    'Standardized_Date',
    'RAW WATER FLOW IN ML',
    'CLEAR WATER SUMP LEVEL IN Meter',
    'CLEAR WATER PUMPING FLOW ML',
    'TREATED WATER PRODUCTION IN ML',
    'remarks category',
]
data = data[selected_columns]
data

In [None]:
# Number of rows per remarks category.
remark_counts = data['remarks category'].value_counts()
remark_counts

# Encoding

In [None]:
# Distinct-value count for every object-typed (string) column.
object_columns = data.select_dtypes(include=object)
object_columns.nunique()

In [None]:
from sklearn.preprocessing import LabelEncoder

label_en = LabelEncoder()

# Replace each listed categorical column with integer codes and print the
# original-value -> code mapping so the codes can be interpreted later.
a = ['remarks category']
for column_name in a:
    data[column_name] = label_en.fit_transform(data[column_name])
    class_codes = label_en.transform(label_en.classes_)
    label_mapping = dict(zip(label_en.classes_, class_codes))
    print(f"Label mapping for column '{column_name}':")
    print(label_mapping)



In [None]:
# Use the timestamp as the index. The guard makes the cell safe to re-run:
# once 'Standardized_Date' has become the index it is no longer a column, and
# an unguarded set_index would raise KeyError on the second execution.
if 'Standardized_Date' in data.columns:
    data = data.set_index('Standardized_Date')

# Two-column view used by the exploratory plots below.
df = data[['CLEAR WATER PUMPING FLOW ML',
           'remarks category']]

In [None]:
# One line plot per column of df, each on its own subplot.
axes = df.plot(subplots=True)
axes

In [None]:
# Kernel-density estimate of the encoded remarks category.
remark_codes = df['remarks category']
remark_codes.plot(kind='density')

In [None]:
# Histogram of the encoded remarks category.
remark_codes = df['remarks category']
remark_codes.hist()

# Feature Reduction

In [None]:
# Pairwise correlation between all columns of data.
correlations = data.corr()
correlations

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations on a large canvas.
plt.figure(figsize=(20, 15))
heatmap_options = dict(vmin=-0.99, vmax=0.99, cmap='YlGnBu', linewidths=0.1, annot=True)
sns.heatmap(data.corr(), **heatmap_options)

In [None]:
# Remove 'TREATED WATER PRODUCTION IN ML' from the feature set
# (presumably because of what the correlation heatmap shows - confirm).
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
data = data.drop(columns=['TREATED WATER PRODUCTION IN ML'], errors='ignore')

# Feature Scaling

In [None]:
# Target: clear-water pumping flow. Take an explicit copy so that later
# in-place edits of `y` (the class binning further down) cannot write through
# to `data` or trigger chained-assignment warnings.
y = data['CLEAR WATER PUMPING FLOW ML'].copy()
# Features: every remaining column.
X = data.drop(columns=['CLEAR WATER PUMPING FLOW ML'])

In [None]:
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler(feature_range=(0, 1))
# fit_transform returns a bare ndarray; rebuild the frame with the original
# column names and index so features stay identifiable and aligned with `y`
# (the previous pd.DataFrame(X) silently renamed the columns to 0, 1, 2, ...).
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set min/max leak into training - consider fitting
# on X_train only.
X = pd.DataFrame(min_max.fit_transform(X), columns=X.columns, index=X.index)
X.describe()

# Modelling

# Linear Regression

In [None]:
# Hold out 25% of the rows as a test set (fixed seed for a repeatable split).
import warnings
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: ordinary least squares trained on the training split and scored
# on the held-out split.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

metric_rows = (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
)
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)

# Show the last ten predictions as the cell output.
y_pred[-10:]

In [None]:
# The target is continuous, but the classifiers below need discrete labels,
# so the flow is bucketed into five classes:
#   (0, 2] -> 0, (2, 4] -> 1, (4, 6] -> 2, (6, 8] -> 3, (8, 10] -> 4
#
# The classes are derived in one step from the untouched source column instead
# of overwriting `y` in place mask by mask. That keeps the cell idempotent
# (re-running the old version re-binned the class codes themselves, e.g.
# 1 -> 0 and 3 -> 1) and avoids writing through to `data`.
flow_bin_edges = [0, 2, 4, 6, 8, 10]
flow_ml = data['CLEAR WATER PUMPING FLOW ML']
flow_class = pd.cut(flow_ml, bins=flow_bin_edges, labels=False)  # right-closed bins
# Values outside (0, 10] (and NaN) fall in no bin; keep their original value,
# exactly as the previous mask-based version did.
# NOTE(review): such leftovers would become extra "classes" - confirm the data
# has none, or decide how they should be handled.
y = flow_class.where(flow_class.notna(), flow_ml).astype(flow_ml.dtype)
# Check the class balance of the binned target.
y.value_counts(normalize=True).to_frame()



In [None]:
# The class distribution above shows that the dataset is highly imbalanced,
# so it has to be balanced before training the classifiers.
# We use oversampling rather than undersampling, because undersampling would drop data.
# Oversampling creates new examples in the minority classes and keeps all of the existing data.

In [None]:
from imblearn.over_sampling import SMOTE
# Seeded so the synthetic minority samples (and every metric computed from
# them) are reproducible across runs; 42 matches the seed used for the splits.
# NOTE(review): oversampling happens before the train/test split, so synthetic
# points interpolated from test rows can end up in training - consider
# resampling the training split only.
smt = SMOTE(k_neighbors=2, random_state=42)
X_os, y_os = smt.fit_resample(X, y)
# Pass the labels as `x=`: on current seaborn the first positional argument of
# countplot is `data`, so a bare Series is not counted per class.
sns.countplot(x=y_os)
plt.show()

In [None]:
# Relative class frequencies after oversampling.
class_share = y_os.value_counts(normalize=True)
class_share.to_frame()

In [None]:
# Re-split using the oversampled data: 25% held out, fixed seed.
import warnings
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
split_parts = train_test_split(X_os, y_os, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Fit a default logistic-regression classifier and predict the test split.
logit_reg = LogisticRegression()
logit_reg.fit(X_train, y_train)
model = logit_reg  # fit() returns the estimator itself
predictions1 = model.predict(X_test)

In [None]:
print("Results from logistic regression are as below")
# classification_report expects (y_true, y_pred). The arguments used to be
# passed the other way round, which swaps precision with recall and reports
# support for the predictions instead of the true labels.
log_result = classification_report(y_test, predictions1, output_dict=True)
# One row per class / average, one column per metric.
log_result = pd.DataFrame(log_result).transpose()
log_result.style.background_gradient(cmap="BuPu")

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seeded so the fitted tree (and its reported scores) is reproducible;
# 42 matches the seed used for the train/test split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
predictions2 = dt_model.predict(X_test)

In [None]:
print("Results from Decision Tree are as below")
# classification_report expects (y_true, y_pred). The arguments used to be
# passed the other way round, which swaps precision with recall and reports
# support for the predictions instead of the true labels.
DT_result = classification_report(y_test, predictions2, output_dict=True)
# One row per class / average, one column per metric.
DT_result = pd.DataFrame(DT_result).transpose()
DT_result.style.background_gradient(cmap="BuPu")

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the bootstrap samples and feature sub-sampling - and therefore the
# reported scores - are reproducible; 42 matches the train/test split seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
predictions5 = rf.predict(X_test)
predictions5

In [None]:
print("Results from Random Forest are as below")
# classification_report expects (y_true, y_pred). The arguments used to be
# passed the other way round, which swaps precision with recall and reports
# support for the predictions instead of the true labels.
RF_result = classification_report(y_test, predictions5, output_dict=True)
# One row per class / average, one column per metric.
RF_result = pd.DataFrame(RF_result).transpose()
RF_result.style.background_gradient(cmap="BuPu")

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Seeded so the fitted ensemble and its scores are reproducible;
# 42 matches the seed used for the train/test split.
GB = GradientBoostingClassifier(random_state=42)
model = GB.fit(X_train, y_train)
predictions6 = model.predict(X_test)
print("Results from Gradient Boosting are as below")
# classification_report expects (y_true, y_pred). The arguments used to be
# passed the other way round, which swaps precision with recall and reports
# support for the predictions instead of the true labels.
GB_result = classification_report(y_test, predictions6, output_dict=True)
# One row per class / average, one column per metric.
GB_result = pd.DataFrame(GB_result).transpose()
GB_result.style.background_gradient(cmap="BuPu")

# AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Seeded so the fitted ensemble and its scores are reproducible;
# 42 matches the seed used for the train/test split.
AD = AdaBoostClassifier(random_state=42)
AD_model = AD.fit(X_train, y_train)
predictions7 = AD_model.predict(X_test)
print("Results from ADA booster are as below")
# classification_report expects (y_true, y_pred). The arguments used to be
# passed the other way round, which swaps precision with recall and reports
# support for the predictions instead of the true labels.
AD_result = classification_report(y_test, predictions7, output_dict=True)
# One row per class / average, one column per metric.
AD_result = pd.DataFrame(AD_result).transpose()
AD_result.style.background_gradient(cmap="BuPu")

# GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, fitted on the training split.
GNB = GaussianNB()
GNB_model = GNB.fit(X_train, y_train)
predictions8 = GNB_model.predict(X_test)
print("Results from Gaussian Naive bayer are as below")
# classification_report expects (y_true, y_pred). The arguments used to be
# passed the other way round, which swaps precision with recall and reports
# support for the predictions instead of the true labels.
GNB_result = classification_report(y_test, predictions8, output_dict=True)
# One row per class / average, one column per metric.
GNB_result = pd.DataFrame(GNB_result).transpose()
GNB_result.style.background_gradient(cmap="BuPu")

# Bagging Classifier

In [None]:
from sklearn.ensemble import BaggingClassifier
# Seeded so the bootstrap samples - and therefore the reported scores - are
# reproducible; 42 matches the seed used for the train/test split.
BC = BaggingClassifier(random_state=42)
BC_model = BC.fit(X_train, y_train)
predictions9 = BC_model.predict(X_test)
print("Results from Bagging classifier are as below")
# classification_report expects (y_true, y_pred). The arguments used to be
# passed the other way round, which swaps precision with recall and reports
# support for the predictions instead of the true labels.
BC_result = classification_report(y_test, predictions9, output_dict=True)
# One row per class / average, one column per metric.
BC_result = pd.DataFrame(BC_result).transpose()
BC_result.style.background_gradient(cmap="BuPu")