In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from sklearn import ensemble
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the water-potability table from the notebook's input directory.
water_csv_path = '../input/water-potability/water_potability.csv'
waterqual_df = pd.read_csv(water_csv_path)

In [None]:
# (number of rows, number of columns) of the loaded table
waterqual_df.shape

In [None]:
# preview the first five rows of the table
waterqual_df.head(n=5)

In [None]:
# share of missing entries per column, expressed as a percentage
null_mask = waterqual_df.isnull()
null_mask.sum() * 100 / null_mask.count()

In [None]:
# These three columns are continuous floats: replace each missing entry with
# the mean of that column's observed values.
for col_name in ('ph', 'Sulfate', 'Trihalomethanes'):
    col_mean = waterqual_df[col_name].mean()
    waterqual_df[col_name] = waterqual_df[col_name].fillna(col_mean)

In [None]:
# re-compute the per-column missing percentage; every entry should now be 0
remaining_nulls = waterqual_df.isnull()
remaining_nulls.sum() * 100 / remaining_nulls.count()

In [None]:
# Histogram of every column (after the mean imputation above), laid out on a
# 3 x 5 grid with one panel per column.
fig = plt.figure(figsize=(18,12))

var_list = waterqual_df.columns.unique()

for position, column in enumerate(var_list, start=1):
    ax = fig.add_subplot(3, 5, position)
    ax.hist(waterqual_df[column])
    ax.set_title('Distribution of {}'.format(column))

fig.tight_layout()
plt.show()

In [None]:
# Summary statistics (nobs, min/max, mean, variance, skewness, kurtosis) for
# each measurement column, as a quick look at how close each is to normal.
described_columns = [
    'ph',
    'Sulfate',
    'Hardness',
    'Solids',
    'Chloramines',
    'Conductivity',
    'Organic_carbon',
    'Trihalomethanes',
    'Turbidity',
]
for col_name in described_columns:
    print(stats.describe(waterqual_df[col_name]))

In [None]:
# correlation of every column with the target, strongest positive first
target_column = waterqual_df['Potability']
corr_with_qual = (
    waterqual_df
    .corrwith(target_column)
    .sort_values(ascending=False)
)
print(corr_with_qual)

In [None]:
# split data into features (X) and target (Y)
X_raw = waterqual_df.drop(columns='Potability') # unscaled model features
Y = waterqual_df['Potability'] # target

# Split BEFORE scaling so the scaler never sees the held-out rows.
# test_size and random_state are unchanged, so the same rows end up in the
# train and test partitions as when the split was done after scaling.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, Y, test_size=0.2, random_state=42)

# learn the scaling parameters (mean / std per column) from the training rows only
scaler = StandardScaler()
scaler.fit(X_train_raw)


def _scale_and_add_constant(frame):
    """Scale `frame` with the train-fitted scaler and prepend a 'const' column.

    The column names and row index of `frame` are carried over to the result.
    Keeping the original (string) column names matters: wrapping the scaled
    array in a bare DataFrame would give integer labels, and together with the
    string 'const' label that mix is rejected by recent scikit-learn versions.
    """
    scaled = pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)
    # has_constant='add' always adds the column, so train, test and the full
    # table are guaranteed to share the same layout
    return sm.add_constant(scaled, has_constant='add')


X_train = _scale_and_add_constant(X_train_raw)
X_test = _scale_and_add_constant(X_test_raw)

# full feature table under the same train-fitted scaling; later cells inspect
# it with describe() and fit the PCA on it
X = _scale_and_add_constant(X_raw)

In [None]:
# Sanity check of the scaling: each feature column should show mean ~ 0 and
# std ~ 1, while the added constant column shows mean 1 and std 0.
X.describe() # check that data scale successful

In [None]:
# Principal component analysis on the scaled feature table: fit 9 components
# and show the fraction of total variance each one explains.
sklearn_pca = PCA(n_components=9).fit(X)
sklearn_pca.explained_variance_ratio_

In [None]:
# variance explained by the first 8 of the 9 fitted components combined
sum(sklearn_pca.explained_variance_ratio_[:8])

In [None]:
# Project the scaled features onto the fitted principal components.
# The projection is stored under its own name instead of overwriting X:
# after an overwrite, X no longer has the column layout the PCA was fitted
# on, so running this cell a second time would raise.
# NOTE(review): X_train / X_test were split off before this projection, so
# the classifiers fitted on them do not use the PCA features. Confirm whether
# the models were meant to be trained on the projected data.
X_pca = sklearn_pca.transform(X)

In [None]:
# Logistic regression with an L2 penalty, optimised with lbfgs
# (at most 500 iterations, fixed seed).
lr = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=500, random_state=42)
lr.fit(X_train, y_train)

# mean accuracy on the rows the model was fitted on and on the held-out rows
lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)
# iterations the solver actually ran
lr_iterations = lr.n_iter_[0]

print('Training score: %s' % lr_train_score)
print('Test score: %s' % lr_test_score)
print('Number of iterations: %s' % lr_iterations)

In [None]:
# k-nearest-neighbours classifier voting over the 45 closest training rows.
# KNeighborsClassifier is imported with the other dependencies in the first
# cell, so a fresh Restart & Run All has every import in one place.
knn = KNeighborsClassifier(n_neighbors=45)
knn.fit(X_train, y_train)

# mean accuracy on the training rows and on the held-out rows
print('training score: {:.3f}'.format(knn.score(X_train, y_train)))
print('test score: {:.3f}'.format(knn.score(X_test, y_test)))

In [None]:
# support vector classifier with a linear kernel
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# mean accuracy on the training rows and on the held-out rows
svm_train_score = svm.score(X_train, y_train)
svm_test_score = svm.score(X_test, y_test)

print('training score: {:.3f}'.format(svm_train_score))
print('test score: {:.3f}'.format(svm_test_score))

In [None]:
# Random forest classifier; the seed is fixed (same value as the other
# seeded steps in this notebook) so the scores are reproducible.
rfc = ensemble.RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# Training-side estimate: mean accuracy over 5-fold cross-validation on the
# training rows. cross_val_score fits fresh clones of rfc on each fold, so
# this does not touch the model fitted above.
rfc_cv_scores = cross_val_score(rfc, X_train, y_train, cv=5)
print('Training score (5-fold CV): %s' % np.mean(rfc_cv_scores))

# Test score: accuracy of the model fitted on the training rows, measured on
# the held-out rows. Cross-validating on X_test would instead re-train on
# parts of the test set and never evaluate the fitted model.
print('Test score: %s' % rfc.score(X_test, y_test))