In [None]:
import numpy as np
import pandas as pd
# Read the water-potability table from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer="../input/water-potability/water_potability.csv")

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Keep only rows with no missing value in any column, then confirm that
# every per-column missing count is now zero.
df = df.dropna(axis=0, how="any")
df.isnull().sum()

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the class balance. The column is passed as `x=` because
# seaborn >= 0.12 interprets the first positional argument as `data`,
# so a positional Series is no longer used as the counted variable.
ax = sns.countplot(x=df.Potability)
ax.set_ylabel("Count")

# Read each count by its class label rather than by position:
# value_counts() orders by frequency, so positional unpacking would swap
# the two numbers whenever class 1 is the larger one.
# Assumes Potability is coded 0 = not potable, 1 = potable -- TODO confirm.
class_counts = df.Potability.value_counts()
not_potable = class_counts.get(0, 0)
potable = class_counts.get(1, 0)
print('Number of Potable: ',potable)
print('Number of Not Potable : ',not_potable)

In [None]:
# Separate the target column from the remaining feature columns.
y = df["Potability"]
X = df.drop("Potability", axis=1)

In [None]:
# Standardise every feature (zero mean, unit sample standard deviation) so
# that all of them can share one value axis.
data_dia = y
data_n_2 = (X - X.mean()) / X.std()

# Put the target next to the first ten standardised columns and reshape to
# long form: one row per (sample, feature) pair.
data = pd.concat([y, data_n_2.iloc[:, :10]], axis=1).melt(
    id_vars="Potability",
    var_name="features",
    value_name='value',
)

# One split violin per feature, the two halves being the two Potability
# classes, with quartile lines drawn inside.
plt.figure(figsize=(10, 10))
sns.violinplot(
    data=data,
    x="features",
    y="value",
    hue="Potability",
    split=True,
    inner="quart",
)
plt.xticks(rotation=90)

**From this figure we can say that the data is not good for training. If we look at the Sulfate and Organic_carbon features, their medians are nearly the same for both classes, so maybe we can drop these features.**

In [None]:
# Annotated heatmap of the pairwise correlations between the features.
corr_matrix = X.corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corr_matrix, ax=ax, annot=True, fmt='.1f', linewidths=0.5)

**There is no notable correlation between the features. Our data is not good for training, but we cannot drop features because we have very few of them.**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

# Hold out 10% of the rows for evaluation; the split is seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Sweep the number of trees from 1 to 99 and record the hold-out accuracy of
# each forest, remembering the first setting that reaches the best accuracy.
# NOTE(review): the best n_estimators is chosen on the same hold-out set whose
# accuracy is reported, so max_acc is an optimistic estimate; pick the setting
# on a validation split or with cross-validation for an unbiased figure.
accuricies = []
n_estimators = []
max_acc = 0
max_i = 0
for i in range(1,100):
    clf_rf = RandomForestClassifier(random_state=42, n_estimators=i)
    # fit() trains the estimator in place; its return value is not needed.
    clf_rf.fit(x_train,y_train)
    ac = accuracy_score(y_test,clf_rf.predict(x_test))
    if ac > max_acc:
        max_acc = ac
        max_i = i
    accuricies.append(ac)
    n_estimators.append(i)

# Accuracy as a function of forest size, with labelled axes so the figure
# can be read on its own.
plt.plot(n_estimators, accuricies)
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
print("Max : ", max_acc, max_i)

**We reached 0.7 accuracy with n_estimators = 34. Next, we will try SMOTE because our data is imbalanced.**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Class balance of the training split. The labels are passed as `x=` because
# seaborn >= 0.12 interprets the first positional argument as `data`,
# so a positional Series is no longer used as the counted variable.
ax = sns.countplot(x=y_train)
ax.set_ylabel("Count")

# Read each count by its class label rather than by position:
# value_counts() orders by frequency, so positional unpacking would swap
# the two numbers whenever class 1 is the larger one.
# Assumes the target is coded 0 = not potable, 1 = potable -- TODO confirm.
class_counts = y_train.value_counts()
not_potable = class_counts.get(0, 0)
potable = class_counts.get(1, 0)
print('Number of Potable: ',potable)
print('Number of Not Potable : ',not_potable)

In [None]:
from imblearn.over_sampling import SMOTE
# SMOTE oversampling, seeded for repeatability. Only the training split is
# resampled; x_test / y_test are not touched here.
sm = SMOTE(random_state = 42)
# Pass the label Series itself instead of y_train.ravel(): Series.ravel() is
# deprecated in pandas 2.2, and handing over a bare ndarray makes
# y_train_res an ndarray too, which has no .value_counts() for later cells.
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Class balance after resampling. The labels are passed as `x=` because
# seaborn >= 0.12 interprets the first positional argument as `data`.
ax = sns.countplot(x=y_train_res)
ax.set_ylabel("Count")

# y_train_res may be a plain NumPy array (which has no .value_counts()), so
# wrap it in a Series before counting; wrapping an existing Series is harmless.
# Counts are read by class label rather than by position.
# Assumes the target is coded 0 = not potable, 1 = potable -- TODO confirm.
class_counts = pd.Series(y_train_res).value_counts()
not_potable = class_counts.get(0, 0)
potable = class_counts.get(1, 0)
print('Number of Potable: ',potable)
print('Number of Not Potable : ',not_potable)

In [None]:
# Repeat the forest-size sweep, this time training on the resampled training
# data and still scoring on the original hold-out split.
# NOTE(review): the best n_estimators is chosen on the same hold-out set whose
# accuracy is reported, so max_acc is an optimistic estimate; pick the setting
# on a validation split or with cross-validation for an unbiased figure.
accuricies = []
n_estimators = []
max_acc = 0
max_i = 0
for i in range(1,100):
    clf_rf = RandomForestClassifier(random_state=42, n_estimators=i)
    # fit() trains the estimator in place; its return value is not needed.
    clf_rf.fit(x_train_res,y_train_res)
    ac = accuracy_score(y_test,clf_rf.predict(x_test))
    if ac > max_acc:
        max_acc = ac
        max_i = i
    accuricies.append(ac)
    n_estimators.append(i)

# Accuracy as a function of forest size, with labelled axes so the figure
# can be read on its own.
plt.plot(n_estimators, accuricies)
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
print("Max : ", max_acc, max_i)

**Conclusion: We reached 0.7 accuracy, but I think this data is not appropriate for classification.**