In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### https://www.kaggle.com/adityakadiwal/water-potability

1. Predict if water is safe for Human consumption
2. EDA for water potability

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV

import warnings
# Silence every warning category from this point on so that library warnings
# do not clutter the cell outputs.
# NOTE(review): this also hides any warnings raised by the model-fitting cells
# further down — consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

### First, we try the 1st task

#### So, we have to build a model that predicts whether Potability is 1 or 0

In [None]:
# Load the water-potability CSV from the Kaggle input directory and preview it.
csv_path = '/kaggle/input/water-potability/water_potability.csv'
water_potability = pd.read_csv(csv_path)
water_potability.head()

In [None]:
# Summary statistics for the numeric columns of the frame.
water_potability.describe()

In [None]:
# Print the frame's structure (columns, non-null counts, dtypes) to spot missing data.
water_potability.info()

In [None]:
# Percentage of missing values in each column.
water_potability.isna().mean() * 100

There are many nulls in some columns.

So, we have to fill those nulls with some representative value (mean? median? predictions?),


because the null rate in each of those columns is higher than 1%...

In [None]:
# Look at the shape of one column first, to decide which fill value makes sense.
var = 'ph'
sns.displot(data=water_potability, x=var, bins=30, kde=True)

The pH values follow a nearly normal distribution!

Let's keep looking and plot all the columns at once.

In [None]:
# Draw a histogram (with KDE) of each column on a 3x3 grid of axes.
columns = water_potability.columns

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(10, 10))

from itertools import product

# Row-major (row, column) positions of the 9 axes. zip() stops as soon as these
# run out, so only the first 9 entries of `columns` are plotted.
product_indexes = product(range(3), repeat=2)

for (row, col_pos), col in zip(product_indexes, columns):
    sns.histplot(data=water_potability, x=col, bins=30, kde=True, ax=axes[row, col_pos])

#### All right, every feature is roughly normally distributed
#### So, I think I can fill the nulls with each column's mean value

In [None]:
# Replace the missing values of the three incomplete columns with that column's mean.
for null_col in ('ph', 'Sulfate', 'Trihalomethanes'):
    col_mean = water_potability[null_col].mean()
    water_potability[null_col] = water_potability[null_col].fillna(col_mean)

# Confirm that no nulls remain.
water_potability.isnull().sum()

all right! we've done!

In [None]:
# Class balance of the target column.
sns.countplot(data=water_potability, x='Potability')

#### Next, we look at the correlations, because we have to watch out for multicollinearity

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
cormat = water_potability.corr()

fig = plt.figure(figsize=(10, 10))
sns.heatmap(data=cormat, annot=True)

#### OK! We can probably use all columns!
#### So, we try to predict potability once

In [None]:
# Separate the frame into the feature matrix X and the target vector y (NumPy arrays).
scaler = StandardScaler()

target_col = 'Potability'
X = water_potability.drop(columns=[target_col]).to_numpy()
y = water_potability[target_col].to_numpy()

In [None]:
# Scale the features with the StandardScaler created above.
# NOTE(review): the scaler statistics are fitted on all rows of X here,
# including rows that may later be held out for testing.
X_scaled = scaler.fit_transform(X)

In [None]:
# Hold out 20% of the rows as a test set, stratified on the target so that both
# parts keep the same class ratio. The split is made on the raw features and the
# scaler is then fitted on the training rows only, so that no test-set
# statistics leak into training. random_state makes the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Baseline: 10-fold cross-validated accuracy of a default logistic regression.
baseline_lr = LogisticRegression()
scores = cross_val_score(baseline_lr, X_train, y_train,
                         scoring='accuracy', cv=10, n_jobs=1)

print(f'First Accuracy Try is {np.mean(scores):.3f} +/- {np.std(scores)}')

#### So, logistic regression alone is not enough to predict potability
#### Next, we try LightGBM, an ensemble, or PCA

In [None]:
# Only the training split is used here, so it is divided again into 10 stratified folds.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

# Temporary first-try parameters; they are the same for every fold.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 2
}

scores = []

for fold_train, fold_valid in kfold:
    train_data = lgb.Dataset(X_train[fold_train], label=y_train[fold_train])
    eval_data = lgb.Dataset(X_train[fold_valid], label=y_train[fold_valid], reference=train_data)

    gbm = lgb.train(
        params,
        train_data,
        valid_sets=eval_data,
        num_boost_round=50
    )

    # One row of per-class scores per sample -> index of the highest-scoring class.
    preds = gbm.predict(X_train[fold_valid])
    y_pred = np.argmax(preds, axis=1)

    scores.append(accuracy_score(y_train[fold_valid], y_pred))

In [None]:
# Average accuracy over the folds.
np.asarray(scores).mean()

#### Well, we should keep trying to predict more correctly.

### We use RandomizedSearchCV and an ensemble to find the best hyperparameters!

#### The accuracy is still not good enough

In [None]:
# PCA followed by logistic regression, tuned jointly with a randomized search.
# The 'liblinear' solver is used because it supports both the 'l1' and 'l2'
# penalties that are searched over (the default 'lbfgs' solver rejects 'l1',
# which makes those candidate fits fail).
estimators = Pipeline([
    ('pca', PCA()),
    ('lr', LogisticRegression(max_iter=1000, solver='liblinear'))
])

# A single dict, so that every sampled candidate combines a penalty, a C value
# and a number of PCA components. (A list of dicts is treated as separate
# search spaces, so each candidate would vary only one parameter.)
param_dist = {
    'lr__penalty': ['l1', 'l2'],
    'lr__C': [0.01, 0.1, 1.0, 10],
    'pca__n_components': [2, 3, 4, 5, 6]
}

# The space holds 2 * 4 * 5 = 40 combinations, so sample fewer than that;
# random_state makes the sampled candidates reproducible.
rs = RandomizedSearchCV(estimator=estimators,
                       param_distributions=param_dist,
                       scoring='accuracy', n_iter=20,
                       cv=10, refit=True, random_state=42)

rs = rs.fit(X_train, y_train)

print(f"best param's are {rs.best_params_}")

rs.best_score_

### I'm starting to think that these features can't help the model's accuracy improve...

### So, we move on to using an ensemble

In [None]:

# Stack a random forest with a PCA + logistic-regression pipeline; a logistic
# regression is the final estimator on top of the two base models.
pca_lr = Pipeline(steps=[
    ('pca', PCA(n_components=2)),
    ('lr', LogisticRegression(max_iter=500)),
])

estimators = [
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('pipeline', pca_lr),
]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
)

# 10-fold cross-validated accuracy on the training split.
scores = cross_val_score(stack, X_train, y_train, scoring='accuracy', cv=10)

np.mean(scores)

### Yes! a little higher..

In [None]:
# Evaluate on the held-out test data.
# Fit the stacked model on the whole training split, then score it once on the
# test split. (Cross-validating on X_test would train new models on the test
# rows instead of measuring how the trained model generalizes.)
stack.fit(X_train, y_train)

test_accuracy = accuracy_score(y_test, stack.predict(X_test))
test_accuracy

### The score on the test data is similar, so the model does not appear to be overfitting
### So, I will stop the 1st task here for now.