In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep the name importable on old releases without breaking on new ones.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    paths = [os.path.join(folder, name) for name in names]
    for path in paths:
        print(path)

In [None]:
# Read the water potability CSV from the Kaggle input directory.
csv_path = '/kaggle/input/water-potability/water_potability.csv'
water = pd.read_csv(csv_path)

In [None]:
# Number of missing values in each column.
water.isna().sum()

In [None]:
# Replace missing values in these three columns with the column median.
# NOTE(review): the medians are computed on the full frame, before the
# train/test split performed later in the notebook, so held-out rows
# influence the imputed values. Consider imputing after the split.
median_fill = {
    column: water[column].median()
    for column in ('ph', 'Sulfate', 'Trihalomethanes')
}
water = water.fillna(value=median_fill)

In [None]:
# Show the first five rows of the frame.
water.head(n=5)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
# (DataFrame.corr() uses Pearson correlation by default).
colormap = plt.cm.RdBu
fig, ax = plt.subplots(figsize=(32, 10))
ax.set_title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    data=water.corr(),
    annot=True,
    cmap=colormap,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
    ax=ax,
)

In [None]:
# Separate the predictors from the 'Potability' column used as the target.
Y = water.Potability
X = water.drop(columns=['Potability'])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.30,
)

In [None]:
# Base random-forest estimator handed to the grid search; seeded so repeated
# runs build the same trees.
rf = RandomForestClassifier(
    random_state=0,
)

In [None]:
# Hyper-parameter grid: 20 forest sizes (30..49) x 9 tree depths (1..9).
params = dict(
    n_estimators=range(30, 50),
    max_depth=range(1, 10),
)

In [None]:
# Exhaustive 5-fold cross-validated search over `params`.
# The grid has 20 x 9 = 180 candidates, i.e. 900 fits; n_jobs=-1 spreads them
# across all available CPU cores. Results are unchanged because the
# estimator's random_state is fixed.
grid = GridSearchCV(rf, params, cv=5, verbose=1, n_jobs=-1)

In [None]:
# Run the grid search on the training split.
grid.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out rows with the fitted search object.
y_pred = grid.predict(X=x_test)

In [None]:
# Score on the held-out split (no `scoring` was passed to the search, so the
# estimator's default scorer is used).
grid.score(X=x_test, y=y_test)

In [None]:
# Plot the ROC curve on the held-out split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `RocCurveDisplay.from_estimator` is its replacement and takes the same
# (estimator, X, y) arguments.
RocCurveDisplay.from_estimator(grid, x_test, y_test)

In [None]:
# Text report of per-class metrics on the held-out split; it is a plain
# string, so print() is what renders the table layout.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)