In [1]:
import numpy as np
import joblib
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import optuna

In [2]:
# Load the training features and their labels. The two CSVs are only joined
# implicitly: each one is sorted by `id`, so row i of the features is expected
# to describe the same record as row i of the labels.
df_x = pd.read_csv('/kaggle/input/pump-it-up-challenge-driven-data/training_Set_values.csv')
df_x = df_x.sort_values(by=['id'], ascending=True)
df_labels = pd.read_csv('/kaggle/input/pump-it-up-challenge-driven-data/training_Set_labels.csv')
df_labels = df_labels.sort_values(by=['id'], ascending=True)

# Fail loudly if the id columns disagree: the split below pairs rows purely by
# position, so a mismatch would silently attach labels to the wrong records.
if not np.array_equal(df_x['id'].to_numpy(), df_labels['id'].to_numpy()):
    raise ValueError('Feature and label files do not contain the same ids; rows would be misaligned.')

df_y = df_labels['status_group']

In [3]:
# Per-column overview: distinct-value count, missing-value count and dtype.
pd.DataFrame({
    'uniques': df_x.nunique(axis=0),
    'na': df_x.isna().sum(axis=0),
    'type': df_x.dtypes,
})

Unnamed: 0,uniques,na,type
id,59400,0,int64
amount_tsh,98,0,float64
date_recorded,356,0,object
funder,1897,3635,object
gps_height,2428,0,int64
installer,2145,3655,object
longitude,57516,0,float64
latitude,57517,0,float64
wpt_name,37400,0,object
num_private,65,0,int64


In [4]:
# Columns removed before modelling (the raw date is replaced by day/month/year).
dropped_columns = ['date_recorded', 'id', 'wpt_name', 'installer', 'funder',
                   'subvillage', 'ward', 'scheme_name', 'recorded_by']

df_x = (
    df_x
    .assign(
        # Cast the codes to object so the dtype-based column selection in the
        # later imputation cells picks them up as categorical, not numeric.
        district_code=lambda frame: frame['district_code'].astype(object),
        region_code=lambda frame: frame['region_code'].astype(object),
        # Parse the date once, then split it into numeric calendar parts.
        date_recorded=lambda frame: pd.to_datetime(frame['date_recorded']),
        day=lambda frame: frame['date_recorded'].dt.day,
        month=lambda frame: frame['date_recorded'].dt.month,
        year=lambda frame: frame['date_recorded'].dt.year,
    )
    .drop(columns=dropped_columns)
)

In [5]:
# Hold out 10% of the rows for the final evaluation; the fixed seed keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    random_state=1,
    test_size=0.1,
)

In [6]:
# Per-column placeholder value that is treated as "missing". Each placeholder
# is turned into NaN so the imputers fitted in the following cells fill it in.
missing_dict = dict(
    gps_height=0,
    longitude=0,
    construction_year=0,
    latitude=-2.00E-08,
    scheme_management='None',
    management='unknown',
    management_group='unknown',
    payment='unknown',
    payment_type='unknown',
    water_quality='unknown',
    quality_group='unknown',
    quantity='unknown',
    quantity_group='unknown',
    source='unknown',
    source_class='unknown',
)

for column_name, sentinel in missing_dict.items():
    x_train[column_name] = x_train[column_name].replace(sentinel, np.nan)

In [7]:
# Numeric features are all columns whose dtype is not `object`.
numeric_features = [name for name, dtype in x_train.dtypes.items() if dtype != object]

# Learn the column means on the training split and use them to fill NaNs.
numeric_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
numeric_block = x_train.loc[:, numeric_features]
numeric_imputer.fit(numeric_block)
x_train.loc[:, numeric_features] = numeric_imputer.transform(numeric_block)

In [8]:
# Categorical features are all columns stored with `object` dtype.
categorical_features = [name for name, dtype in x_train.dtypes.items() if dtype == object]

# Learn each column's most frequent value on the training split and use it to
# fill NaNs.
cat_imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
categorical_block = x_train.loc[:, categorical_features]
cat_imputer.fit(categorical_block)
x_train.loc[:, categorical_features] = cat_imputer.transform(categorical_block)

In [9]:
# Map the class labels to integer codes; the fitted encoder is kept so that
# predictions can be mapped back to the original labels later.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)

In [10]:
# One-hot encode the categorical columns and replace them with the indicator
# columns.
# handle_unknown='ignore' makes a category that never appeared in the training
# split encode as all zeros at inference time instead of raising an error.
cat_encoder = OneHotEncoder(handle_unknown='ignore')
cat_encoder.fit(x_train.loc[:, categorical_features])

# Transform the same DataFrame type the encoder was fitted on. Passing
# `.to_numpy()` here drops the column names and triggers scikit-learn's
# "X does not have valid feature names" warning.
transformed = cat_encoder.transform(x_train.loc[:, categorical_features])
ohe = pd.DataFrame(transformed.toarray(), columns=cat_encoder.get_feature_names_out())

# The index is reset so that x_train and `ohe` (which has a fresh 0..n-1 index)
# line up row by row in the concat below.
# NOTE(review): reset_index() without drop=True also keeps the old row labels
# as an `index` feature column. The inference cell does the same, so dropping
# it must be done in both places together.
x_train = x_train.reset_index()
x_train = pd.concat([x_train, ohe], axis=1)
x_train = x_train.drop(columns=categorical_features)

  "X does not have valid feature names, but"


In [11]:
def objective(trial, x, y):
    """Score one Optuna trial by the validation accuracy of an XGBoost model.

    The data is split 90/10 with a fixed seed, a booster is trained for 50
    rounds with hyper-parameters sampled from ``trial``, and the accuracy on
    the 10% validation part is returned.

    Parameters
    ----------
    trial
        Optuna trial object used to sample the hyper-parameters.
    x
        Feature table; must be accepted by ``train_test_split`` and
        ``xgb.DMatrix``.
    y
        Integer class labels aligned with ``x``. They are assumed to be the
        consecutive codes 0..K-1 (as produced by a label encoder), because the
        number of classes is derived as ``max(y) + 1``.

    Returns
    -------
    float
        Accuracy of the trained booster on the validation part.
    """
    xx_train, xx_val, yy_train, yy_val = train_test_split(x, y, test_size=0.1, random_state=1)
    dtrain = xgb.DMatrix(xx_train, label=yy_train)
    dval = xgb.DMatrix(xx_val, label=yy_val)

    # Derived from the full label vector instead of being hard-coded to 3, so
    # the function also works for other label sets and is not affected by a
    # class that happens to be absent from one side of the split.
    num_class = int(np.max(y)) + 1

    param = {
        "objective": "multi:softmax",
        "num_class": num_class,
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1.0, log=True)
    }

    bst = xgb.train(param, dtrain, num_boost_round=50)
    # The predictions are rounded to the nearest integer before being compared
    # with the integer labels.
    preds = np.rint(bst.predict(dval))
    accuracy = accuracy_score(yy_val, preds)
    return accuracy

# Run the hyper-parameter search (a single trial here) and report the best
# trial found; higher accuracy is better.
study = optuna.create_study(direction="maximize")
study.optimize(lambda t: objective(t, x_train, y_train), n_trials=1)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for name, setting in trial.params.items():
    print(f"    {name}: {setting}")

[32m[I 2023-07-02 09:58:06,572][0m A new study created in memory with name: no-name-d07ef0ec-2fee-4893-a0ae-c5b358917f86[0m
[32m[I 2023-07-02 10:00:53,530][0m Trial 0 finished with value: 0.7762813318368874 and parameters: {'booster': 'gbtree', 'lambda': 0.0022165170260433475, 'alpha': 6.288670636012322e-08, 'max_depth': 19, 'gamma': 1.3576561395977631e-07, 'grow_policy': 'depthwise', 'learning_rate': 0.0007130870955017573}. Best is trial 0 with value: 0.7762813318368874.[0m


Number of finished trials:  1
Best trial:
  Value: 0.7762813318368874
  Params: 
    booster: gbtree
    lambda: 0.0022165170260433475
    alpha: 6.288670636012322e-08
    max_depth: 19
    gamma: 1.3576561395977631e-07
    grow_policy: depthwise
    learning_rate: 0.0007130870955017573


In [12]:
# Final classifier trained on the full preprocessed training split.
# NOTE(review): these hyper-parameter values differ from the single-trial
# study output shown above; presumably they come from an earlier, longer
# search. Confirm their origin.
model = xgb.XGBClassifier(objective='multi:softmax',
                          n_estimators=200,
                          num_class=3,
                          reg_lambda=0.0014991193227735851,
                          # `reg_alpha` is the scikit-learn wrapper's documented
                          # name for the L1 term. The bare `alpha` alias only
                          # works through **kwargs and is inconsistent with
                          # `reg_lambda` above.
                          reg_alpha=4.858747180371873e-08,
                          max_depth=14,
                          gamma=2.0834449412192234e-07,
                          grow_policy='depthwise',
                          booster='gbtree',
                          learning_rate=0.09898213693048583)

model.fit(x_train, y_train)

XGBClassifier(alpha=4.858747180371873e-08, base_score=0.5, booster='gbtree',
              callbacks=None, colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=1, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None,
              gamma=2.0834449412192234e-07, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.09898213693048583, max_bin=256,
              max_cat_to_onehot=4, max_delta_step=0, max_depth=14, max_leaves=0,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_class=3, num_parallel_tree=1,
              objective='multi:softmax', predictor='auto', ...)

In [13]:
# Persist the model together with every fitted preprocessing object and
# column list that the inference cell needs to reload.
artifacts = {
    'model.joblib': model,
    'numeric_imputer.joblib': numeric_imputer,
    'cat_imputer.joblib': cat_imputer,
    'cat_encoder.joblib': cat_encoder,
    'label_encoder.joblib': label_encoder,
    'numeric_features.joblib': numeric_features,
    'categorical_features.joblib': categorical_features,
    'missing_dict.joblib': missing_dict,
}
written = [joblib.dump(artifact, filename) for filename, artifact in artifacts.items()]

# Show the result of the last dump as the cell output.
written[-1]

['missing_dict.joblib']

In [14]:
# Reload every persisted artifact so the hold-out split is processed with the
# objects read back from disk rather than the in-memory ones.
model = joblib.load('model.joblib')
numeric_imputer = joblib.load('numeric_imputer.joblib')
cat_imputer = joblib.load('cat_imputer.joblib')
cat_encoder = joblib.load('cat_encoder.joblib')
label_encoder = joblib.load('label_encoder.joblib')
numeric_features = joblib.load('numeric_features.joblib')
categorical_features = joblib.load('categorical_features.joblib')
missing_dict = joblib.load('missing_dict.joblib')

# Keep an untouched copy of the raw hold-out rows; predictions are attached to
# it in the next cell.
df_inference = x_test.copy()

# Same placeholder -> NaN replacement that was applied to the training split.
for feature, val in missing_dict.items():
    x_test[feature] = x_test[feature].replace(val, np.nan)

# Only `transform` is called: the fill values and encodings were learned on the
# training split.
x_test.loc[:, numeric_features] = numeric_imputer.transform(x_test.loc[:, numeric_features])
x_test.loc[:, categorical_features] = cat_imputer.transform(x_test.loc[:, categorical_features])
y_test = label_encoder.transform(y_test)

# Pass a DataFrame, as was done when the encoder was fitted. Passing
# `.to_numpy()` drops the column names and triggers scikit-learn's
# "X does not have valid feature names" warning.
transformed = cat_encoder.transform(x_test.loc[:, categorical_features])
ohe = pd.DataFrame(transformed.toarray(), columns=cat_encoder.get_feature_names_out())

# The index is reset so that x_test and `ohe` line up row by row in the concat.
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# `index` column. The training cell does the same and the model was fitted
# with that column, so it must stay here until both places are changed.
x_test = x_test.reset_index()
x_test = pd.concat([x_test, ohe], axis=1)
x_test = x_test.drop(columns=categorical_features)

  "X does not have valid feature names, but"


In [15]:
# Predict the hold-out split, store the decoded predictions next to the raw
# rows, and report the hold-out accuracy.
test_pred = model.predict(x_test)
df_inference['pred'] = label_encoder.inverse_transform(test_pred)
df_inference.to_csv('df_inference.csv', index=False)

# accuracy_score expects (y_true, y_pred). Accuracy is symmetric so the value
# is unchanged, but the order now matches the API and the call in `objective`.
print(accuracy_score(y_test, test_pred))

0.8063973063973064
