# Water Quality using LightGBM

In [None]:
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

#metrics
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [None]:
# Location of the water-potability CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/water-potability/water_potability.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame to check column names and sample values.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

**Data Visualization**

In [None]:
# Show where values are missing: the heatmap is drawn from the boolean df.isna() mask.
missing_fig, missing_ax = plt.subplots(figsize=(16, 9))
sns.heatmap(df.isna(), cmap='viridis', ax=missing_ax)

**The heatmap shows that ph, Sulfate and Trihalomethanes have null values.**

In [None]:
# Count the rows in each Potability class and plot their shares as a pie chart.
# The counts are reshaped into an explicit two-column frame because the name of the
# Series returned by value_counts() differs between pandas versions ('Potability'
# before 2.0, 'count' from 2.0 on), so looking it up as values='Potability' is not
# portable.
potability_counts = (
    df['Potability']
    .value_counts()
    .rename_axis('Potability')
    .reset_index(name='count')
)
px.pie(potability_counts, values='count', names='Potability', template='ggplot2')

**The Potability classes are not well balanced.**

In [None]:
# Correlation between all columns, with each coefficient written in its cell.
corr_matrix = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(16, 9))
sns.heatmap(corr_matrix, annot=True, linewidths=.3, ax=corr_ax)

**There is not much correlation between the features and the target (Potability).**

In [None]:
sns.set_style("whitegrid")
fig, axes = plt.subplots(3, 3, figsize=(24, 12))

# Every column except the last one is plotted (assumes the target is the last column).
feature_columns = df.columns[:-1]

# Split the rows by class once instead of re-filtering for every feature.
potable_rows = df.loc[df['Potability'] == 1]
non_potable_rows = df.loc[df['Potability'] == 0]

# axes.flat walks the 3x3 grid row by row, so features fill it left-to-right, top-to-bottom.
for ax, column in zip(axes.flat, feature_columns):
    sns.kdeplot(ax=ax, x=column, data=potable_rows, label='Potability 1')
    sns.kdeplot(ax=ax, x=column, data=non_potable_rows, label='Potability 0')
    # Without an explicit legend the two labelled curves cannot be told apart.
    ax.legend()

According to the KDE plots, the feature distributions do not differ much between the two Potability classes.

# Preprocessing

**We could fill the null values using KNNImputer, but there is not much correlation between the features, so we can simply drop the rows that contain null values.**

In [None]:
# Remove every row that has at least one missing value; df itself is modified.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Separate the target column from the features, then hold out 25% of the rows for testing.
y = df['Potability']
X = df.drop(columns='Potability')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

In [None]:
# Fit the scaler on the training split only, then apply the same scaling to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Machine Learning 

**---- LogisticRegression, DecisionTreeClassifier, SVC, KNeighborsClassifier, RandomForestClassifier**

In [None]:
def classify(X_train, y_train, X_test, y_test):
    """Fit several classifiers on the training split and print their test scores.

    Each model (random forest, logistic regression, decision tree, grid-searched
    RBF SVC, 1-nearest-neighbour) is fitted on ``(X_train, y_train)`` and used to
    predict ``X_test``; its classification report and accuracy against
    ``y_test`` are then printed.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to each model's ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the labels the predictions
        are scored against.

    Returns
    -------
    None
        Results are only printed.
    """
    # Hyper-parameter grid searched by GridSearchCV for the SVC.
    svc_param_grid = {
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf'],
    }

    classifiers = [
        # 'sqrt' is what max_features='auto' meant for classifiers; 'auto' itself
        # is no longer accepted by recent scikit-learn releases.
        (RandomForestClassifier(criterion='entropy', n_estimators=200, max_features='sqrt'), 'RFC'),
        (LogisticRegression(), 'Logistic Regression'),
        (DecisionTreeClassifier(), 'Decision Tree Classifier'),
        (GridSearchCV(SVC(), svc_param_grid, refit=True, verbose=0, n_jobs=-1), 'Grid Search CV'),
        (KNeighborsClassifier(n_neighbors=1), 'KNN'),
    ]

    for classifier, name in classifiers:
        classifier.fit(X_train, y_train)
        pred = classifier.predict(X_test)
        print('-------', name, '-------')
        print(classification_report(y_test, pred))
        print(accuracy_score(y_test, pred))
        print('------------------------------------------')

In [None]:
# Train and score every model on the scaled train/test split.
classify(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

**Random Forest Classifier and SVC have 70% accuracy.**