# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pf
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from category_encoders import OrdinalEncoder as oe
from catboost import CatBoostClassifier
from catboost import Pool, cv
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SVMSMOTE
from sklearn.metrics import roc_curve, roc_auc_score
import pandas_profiling
import optuna
import csv
import shap
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
import statsmodels.api as sm
from mpl_toolkits.basemap import Basemap


%matplotlib inline

# Load data

In [None]:
# Read the competition files; the `id` column becomes the index of every frame
# so features, labels and the test set can be aligned by pump id.
train = pd.read_csv(
    '../input/water-pump/Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv',
    index_col="id",
)
labels = pd.read_csv(
    '../input/water-pump/Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv',
    index_col="id",
)
test = pd.read_csv(
    '../input/water-pump/Pump_it_Up_Data_Mining_the_Water_Table_-_Test_set_values.csv',
    index_col="id",
)

In [None]:
# Print column names, non-null counts and dtypes of the training features.
train.info()

In [None]:
# Build a pandas-profiling report for the training features; as the last
# expression of the cell it is displayed by the notebook.
pandas_profiling.ProfileReport(train)

# Data Preprocessing and Feature Engineering 

#### convert date_recorded into day,month,year format

In [None]:
# Parse `date_recorded` and expose its calendar components as separate
# object-typed columns on both the training and the test frame.
for _frame in (train, test):
    _frame["date_recorded"] = pd.to_datetime(_frame["date_recorded"])
    for _part in ("day", "month", "year"):
        # Bind the attribute name as a default so each lambda reads its own part.
        _frame[_part] = (
            _frame['date_recorded']
            .map(lambda stamp, attr=_part: getattr(stamp, attr))
            .astype("object")
        )

In [None]:
# Columns removed from both frames before modelling.
_dropped_columns = [
    'recorded_by',
    'funder', 'installer', 'lga', 'scheme_name', 'subvillage', 'ward', 'wpt_name',
]
for _frame in (train, test):
    _frame.drop(columns=_dropped_columns, inplace=True)

#### Tanzania GPS boundaries: replace zero longitude/latitude values with the column mean

In [None]:
# Replace placeholder coordinates (exactly 0) with the mean of the same column.
# Fix: latitude is now filled with the latitude mean; it was previously filled
# with the longitude mean, which produced wrong coordinates.
# NOTE(review): the means are computed with the zero placeholders still
# included, so they are biased toward 0 — consider excluding them.
mean_longitude = train['longitude'].mean(skipna=True)
mean_latitude = train['latitude'].mean(skipna=True)
train['longitude'] = train['longitude'].mask(train['longitude'] == 0, mean_longitude)
train['latitude'] = train['latitude'].mask(train['latitude'] == 0, mean_latitude)

# Same treatment for the test set, using the test set's own column means.
mean_longitude = test['longitude'].mean(skipna=True)
mean_latitude = test['latitude'].mean(skipna=True)
test['longitude'] = test['longitude'].mask(test['longitude'] == 0, mean_longitude)
test['latitude'] = test['latitude'].mask(test['latitude'] == 0, mean_latitude)

#### Check whether region and region_code carry the same information

In [None]:
# Turn both columns into integer category codes so they can be correlated.
rc = train['region_code'].astype('category').cat.codes
r = train['region'].astype('category').cat.codes

# Correlation between the two codings and the number of distinct values in each.
cor = rc.corr(r)
r_len = r.nunique()
rc_len = rc.nunique()
print(cor, r_len, rc_len)

#### Dealing with missing values

In [None]:
# Fill missing `permit` values with the most frequent value of each frame.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# acts on a temporary Series and does not update the frame under pandas
# Copy-on-Write (and emits a FutureWarning in recent pandas 2.x).
train['permit'] = train['permit'].fillna(train['permit'].mode()[0])
test['permit'] = test['permit'].fillna(test['permit'].mode()[0])

In [None]:
# Fill missing `public_meeting` values with the most frequent value of each
# frame. Assign the result back instead of using `inplace=True` on a column
# selection, which is chained assignment and is not reliable under pandas
# Copy-on-Write.
train['public_meeting'] = train['public_meeting'].fillna(train['public_meeting'].mode()[0])
test['public_meeting'] = test['public_meeting'].fillna(test['public_meeting'].mode()[0])

In [None]:
# Fill missing `scheme_management` values with the most frequent value of each
# frame. Assign the result back instead of using `inplace=True` on a column
# selection, which is chained assignment and is not reliable under pandas
# Copy-on-Write.
train['scheme_management'] = train['scheme_management'].fillna(train['scheme_management'].mode()[0])
test['scheme_management'] = test['scheme_management'].fillna(test['scheme_management'].mode()[0])

In [None]:
# Partition the training columns into numeric and non-numeric ("categorical").
cols = train.columns
num_cols = train.select_dtypes('number').columns
# Keep the frame's own column order; a set difference yields an arbitrary
# order that can differ from one interpreter session to the next.
cat_cols = [name for name in cols if name not in num_cols]
# Sanity check: every column belongs to exactly one of the two groups.
assert len(num_cols)+len(cat_cols) == train.shape[1]

In [None]:
# For the training frame first and the test frame second: label any remaining
# missing categorical value as 'Unknown', then report how many distinct values
# each categorical column holds.
for _frame in (train, test):
    for col in cat_cols:
        _values = _frame[col]
        if _values.isnull().values.any():
            _frame[col] = _values.fillna(value='Unknown')
        unique_lenght = len(_frame[col].unique())
        # The ANSI escape codes print the column name in bold.
        print('\033[1m'+col+'\033[0m'+" unique value count : "+ str(unique_lenght))



#### Standardize Numerical columns 

In [None]:
# Standardize each numeric column. The scaler is fitted on the training data
# only and that same fitted scaler is applied to the test data, so a given raw
# value maps to the same standardized value in both frames. Previously the test
# set was scaled with statistics computed from the test set itself, which put
# train and test on different scales.
for col in num_cols:
    print('\033[1m'+col+'\033[0m')
    scale = StandardScaler().fit(train[[col]])
    train[col] = scale.transform(train[[col]])
    test[col] = scale.transform(test[[col]])

#### encoding categorical columns

In [None]:
# Recompute the numeric / categorical split, keeping the frame's column order
# (a set difference would give an arbitrary order).
cols = train.columns
num_cols = train.select_dtypes('number').columns
cat_cols = [name for name in cols if name not in num_cols]
assert len(num_cols)+len(cat_cols) == train.shape[1]
from sklearn import preprocessing

# Encode every categorical column with ONE mapping shared by train and test.
# Fitting a LabelEncoder separately on each frame (as before) can assign
# different integers to the same category, so the model would see mismatched
# codes at prediction time. The encoder is fitted on the values of both frames
# so that categories present in only one of them still get a code.
for _name in cat_cols:
    _train_values = train[_name].astype(str)
    _test_values = test[_name].astype(str)
    _column_encoder = preprocessing.LabelEncoder().fit(
        pd.concat([_train_values, _test_values])
    )
    train[_name] = _column_encoder.transform(_train_values)
    test[_name] = _column_encoder.transform(_test_values)

# `encoder` is kept as the encoder fitted on the target, because it is used
# later to turn predicted class codes back into the original label strings.
encoder = preprocessing.LabelEncoder()
labels = labels.apply(encoder.fit_transform)

### Split Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.3,
    random_state=42,
)

# Feature Selection

#### measure feature importance 

In [None]:
# Wrap the train / validation splits in CatBoost pools, flagging the
# categorical columns.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_cols,
)
test_pool = Pool(
    data=X_test,
    label=y_test,
    cat_features=cat_cols,
)

# Baseline multi-class model, monitored with TotalF1 on the validation pool.
model = CatBoostClassifier(**{
    'iterations': 1000,
    'learning_rate': 0.1,
    'random_strength': 0.1,
    'depth': 8,
    'loss_function': 'MultiClass',
    'eval_metric': 'TotalF1',
    'leaf_estimation_method': 'Newton',
})
model.fit(
    train_pool,
    plot=True,
    eval_set=test_pool,
    early_stopping_rounds=10,
)

In [None]:
# Initialise SHAP's notebook JavaScript support (used by its interactive plots).
shap.initjs()
# Create a tree explainer around the current `model`.
explainer = shap.TreeExplainer(model)

In [None]:
# Positional indices of every column whose dtype is not float64.
# `np.float` (an alias of the builtin `float`) was removed in NumPy 1.24 and
# raises AttributeError there; `np.float64` performs the same dtype comparison.
categorical_features_indices = np.where(train.dtypes != np.float64)[0]
categorical_features_indices

In [None]:
# Compute SHAP values for the training split with the explainer.
shap_values = explainer.shap_values(X_train)

In [None]:
# Plot a SHAP summary of the computed values against the training features.
shap.summary_plot(shap_values, X_train)

#### feature selection

In [None]:
# Rebuild the pools with explicit feature names so the selection summary can
# report the selected features by name.
feature_names = X_train.columns.tolist()
train_pool = Pool(
    data=X_train,
    label=y_train,
    feature_names=feature_names,
    cat_features=cat_cols,
)
test_pool = Pool(
    data=X_test,
    label=y_test,
    feature_names=feature_names,
    cat_features=cat_cols,
)

def select_features_adult(algorithm: EFeaturesSelectionAlgorithm, steps: int = 1):
    """Run CatBoost feature selection on the module-level pools.

    A fresh multi-class ``CatBoostClassifier`` selects 30 features out of all
    columns of ``train_pool``, evaluated against ``test_pool``.

    Args:
        algorithm: Feature-selection algorithm passed to ``select_features``.
        steps: Number of steps passed to ``select_features``.

    Returns:
        The summary returned by ``select_features``; its
        ``'selected_features_names'`` entry is printed before returning.
    """
    print('Algorithm:', algorithm)

    classifier_settings = {
        'iterations': 1000,
        'learning_rate': 0.1,
        'random_strength': 0.1,
        'depth': 8,
        'loss_function': 'MultiClass',
        'eval_metric': 'Accuracy',
        'leaf_estimation_method': 'Newton',
    }
    selector = CatBoostClassifier(**classifier_settings)

    # Every column of the training pool is a candidate for selection.
    candidate_indices = list(range(train_pool.num_col()))
    summary = selector.select_features(
        train_pool,
        eval_set=test_pool,
        features_for_select=candidate_indices,
        num_features_to_select=30,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=True,
        logging_level='Silent',
        plot=False,
    )

    print('Selected features:', summary['selected_features_names'])
    return summary
# Run the selection with the recursive-by-SHAP-values algorithm in 3 steps and
# keep the returned summary.
adult_shap_summary = select_features_adult(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=3)

### Hyperparameter tuning

In [None]:
def objective(trial):
    """Optuna objective: cross-validated TotalF1 of a CatBoost configuration.

    Samples a hyperparameter set from ``trial``, runs 5-fold CatBoost ``cv`` on
    the module-level ``train_dataset`` and returns the best value found in the
    ``'test-TotalF1-mean'`` entry of the cv results.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The maximum of ``scores['test-TotalF1-mean']``.
    """
    param = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500, 1000, 1200, 1500]),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength': trial.suggest_int("random_strength", 1, 10),
        'bagging_temperature': trial.suggest_int("bagging_temperature", 0, 10),
        'max_bin': trial.suggest_categorical('max_bin', [4, 5, 6, 8, 10, 20, 30]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 10),
        'od_type': "Iter",
        'od_wait': 100,
        # The Optuna parameter is recorded as "max_depth" but is passed to
        # CatBoost under the "depth" key.
        "depth": trial.suggest_int("max_depth", 2, 10),
        # `suggest_loguniform` is deprecated in Optuna; `suggest_float` with
        # `log=True` samples from the same log-uniform range.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [5, 10, 12, 100, 500, 1024]),
        'custom_metric': ['Accuracy', 'TotalF1'],
        "loss_function": "MultiClass",
        "leaf_estimation_method": 'Newton',
    }

    scores = cv(
        train_dataset,
        param,
        fold_count=5,
        early_stopping_rounds=10,
        plot=False,
        verbose=False,
    )

    return scores['test-TotalF1-mean'].max()
# Positions of the categorical columns within the training frame.
categorical_features_indices = [
    train.columns.get_loc(name) for name in cat_cols if name in train
]

# Pool read by `objective` during cross-validation.
train_dataset = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_cols,
)

# A seeded TPE sampler makes the sampler behave in a deterministic way.
sampler = optuna.samplers.TPESampler(seed=68)
study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
)
study.optimize(objective, n_trials=100)

In [None]:
# Pools for fitting and evaluating the tuned model.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_cols,
)
test_pool = Pool(
    data=X_test,
    label=y_test,
    cat_features=cat_cols,
)

In [None]:
# Multi-class classifier configured with fixed, hard-coded hyperparameters and
# monitored with Accuracy.
model = CatBoostClassifier(**{
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'iterations': 500,
    'learning_rate': 0.27233395356783285,
    'random_strength': 5,
    'bagging_temperature': 0,
    'max_bin': 8,
    'grow_policy': 'SymmetricTree',
    'min_data_in_leaf': 8,
    'max_depth': 3,
    'l2_leaf_reg': 7.948852984476994e-05,
    'one_hot_max_size': 1024,
})

In [None]:
# Fit on the training pool, evaluating on the held-out pool and stopping early
# after 10 rounds without improvement; `plot=True` requests the training plot.
model.fit(train_pool,plot=True,eval_set=test_pool,early_stopping_rounds=10)

In [None]:
# Predict on the competition test frame with the fitted model.
final = model.predict(test)

In [None]:
# Load the submission template, indexed by `id`.
submission_df = pd.read_csv("../input/water-pump/Pump_it_Up_Data_Mining_the_Water_Table_-_Submission_format.csv", 
                            index_col="id")

In [None]:
# Verify the test rows are in exactly the same id order as the submission
# template before the predictions are written into it; raises on any mismatch.
np.testing.assert_array_equal(test.index.values, 
                              submission_df.index.values)

In [None]:
# Store the predictions in the template's `status_group` column.
submission_df["status_group"] = final

In [None]:
# Convert the predicted integer codes back to the original label values.
# NOTE(review): relies on `encoder` having been fitted last on the target
# labels — confirm no later cell refits it.
submission_df["status_group"] = encoder.inverse_transform(submission_df["status_group"])

In [None]:
# Preview the first rows of the submission frame.
submission_df.head()

In [None]:
# Write the submission file, keeping the `id` index as a column.
submission_df.to_csv('catboost_tuned24.csv', index=True)