In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import  MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
import lightgbm as lgb

from sklearn.metrics import accuracy_score


# Stand-alone preprocessing objects kept at module level for ad-hoc use in later cells
scaler_minmax = MinMaxScaler()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')


# Read CSV files

df = pd.read_csv("../data/Training_Set_Values.csv")      # Training feature table
name_featrures = df.columns                               # Feature names, captured before 'target' is added
len_features = len(name_featrures)                        # Number of raw features
labels = pd.read_csv("../data/Training_Set_labels.csv")  # Label table; 'status_group' holds the target

# Attach the target column.
# Joining on 'id' keeps each row paired with its own label even if the two files
# are ordered differently; a plain column assignment pairs rows by position only.
if 'id' in df.columns and 'id' in labels.columns and labels['id'].is_unique:
    df['target'] = df['id'].map(labels.set_index('id')['status_group'])
else:
    # No usable shared key: fall back to positional (index-aligned) assignment
    df['target'] = labels['status_group']


In [2]:
# Features left out of the baseline models (one per line, same order as before)
columns_drop = [
    'id',
    'amount_tsh',
    'num_private',
    'subvillage',
    'recorded_by',
    'scheme_name',
    'extraction_type_group',
    'extraction_type_class',
    'management',
    'payment_type',
    'quality_group',
    'quantity_group',
    'source',
    'waterpoint_type_group',
    'funder',
    'installer',
    'wpt_name',
    'ward',
    'scheme_management',
]

# Remove the excluded columns and report the size of the remaining frame
df = df.drop(labels=columns_drop, axis='columns')
print('Shape after dropping columns:', df.shape)

Shape after dropping columns: (59400, 22)


In [3]:
# Encode the status labels as integers:
# 0 = non functional, 1 = functional needs repair, 2 = functional
target_map_dict = {'functional': 2, 'functional needs repair': 1, 'non functional': 0}

# Only map while the column still holds the raw string labels. Running .map() a second
# time on the already-encoded integers would replace every target with NaN.
if not pd.api.types.is_numeric_dtype(df['target']):
    # Fail loudly on labels missing from the mapping instead of silently producing NaN
    unknown_labels = set(df['target'].dropna().unique()) - set(target_map_dict)
    if unknown_labels:
        raise ValueError(f"Unmapped target labels: {sorted(map(str, unknown_labels))}")
    df['target'] = df['target'].map(target_map_dict)

In [4]:
# Split the frame into the target vector and the feature matrix
y = df['target']
X = df.drop(columns='target')

# Stratified hold-out split: 4% of the rows (2376 records) are reserved for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.04,
    stratify=y,
    random_state=42,
)

In [5]:
# Fit a label encoder on the integer target; later cells read le.classes_ for the class count
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

In [6]:
# Import the custom transformers from helper_function.py
# The helper_function.py file contains the definitions for StringConverter, YearExtractor, IQRCapper, ConstructionYearTransformer, and ObjectToNumericConverter
from helper_function import (
    StringConverter,
    YearExtractor,
    IQRCapper,
    ConstructionYearTransformer,
    ObjectToNumericConverter
)
    
# ---------------------------------------------------------------------------
# Per-column preprocessing pipelines
# ---------------------------------------------------------------------------

def _iqr_then_minmax(strategy):
    """Return a pipeline that applies IQRCapper(strategy=...) and then MinMaxScaler."""
    return Pipeline(steps=[
        ('iqr_cap', IQRCapper(strategy=strategy)),
        ('scaler', MinMaxScaler()),
    ])


# date_recorded: YearExtractor output is one-hot encoded
date_recorded_transformer_pipeline = Pipeline(steps=[
    ('year_extractor', YearExtractor()),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')),
])

# One outlier-handling + scaling pipeline per IQRCapper strategy
# (the exact effect of each strategy is defined in helper_function.py)
oulier_minmax_pipeline_clip = _iqr_then_minmax('clip')
oulier_minmax_pipeline_mean = _iqr_then_minmax('mean')
oulier_minmax_pipeline_median = _iqr_then_minmax('median')

# Categorical columns: convert to strings, fill missing values with 'Unknown', one-hot encode
cat_pipeline = Pipeline(steps=[
    ('string_converter', StringConverter()),
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')),
])

# construction_year: custom transformer, then min-max scaling
# (the step name suggests zeros are replaced with the median — confirm in helper_function.py)
constructionyear_pipeline = Pipeline(steps=[
    ('replace_zeros_with_median', ConstructionYearTransformer()),
    ('minmax_scaling', MinMaxScaler()),
])

# ---------------------------------------------------------------------------
# ColumnTransformer: route each group of columns to its own pipeline
# ---------------------------------------------------------------------------
categorical_columns = [
    'basin', 'region', 'region_code', 'lga', 'public_meeting', 'permit',
    'extraction_type', 'management_group', 'payment', 'water_quality',
    'quantity', 'source_type', 'source_class', 'waterpoint_type',
]

column_routes = [
    ('date', date_recorded_transformer_pipeline, ['date_recorded']),
    ('outlier_minmax_gps_height', oulier_minmax_pipeline_mean, ['gps_height']),
    ('outlier_minmax_longitude', oulier_minmax_pipeline_mean, ['longitude']),
    ('outlier_minmax_latitude', oulier_minmax_pipeline_mean, ['latitude']),
    ('cat_ohe', cat_pipeline, categorical_columns),
    ('outlier_minmax_population', oulier_minmax_pipeline_clip, ['population']),
    ('constructionyear', constructionyear_pipeline, ['construction_year']),
]

preprocessor = ColumnTransformer(
    transformers=column_routes,
    remainder='passthrough',            # columns not listed above are kept unchanged
    verbose_feature_names_out=False,    # output names carry no transformer-name prefix
)

# Preprocessing followed by the custom ObjectToNumericConverter step from helper_function.py
preprocess_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('object_to_numeric', ObjectToNumericConverter()),
])


In [None]:
# Candidate classifiers to compare, keyed by the display name used in the results table.
# NOTE(review): "SVC" (probability=True) and "Polynomial" look very expensive on this data.
# The LightGBM log in this notebook reports 57024 training rows and 237 used features, so a
# dense degree-2 expansion has at least 28441 columns (roughly 13 GB as float64) — confirm
# this cell can actually finish before relying on it.
n_classes = len(le.classes_)  # number of target classes, shared by XGBoost and LightGBM

models = {
    "Decision Tree": DecisionTreeClassifier(
        max_depth=10,  # You can tune this
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=15,  # You can tune this too
        random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=100,
        max_depth=11,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.8,
        eval_metric='mlogloss',     # good for multi-class
        objective='multi:softmax',  # directly outputs class labels
        num_class=n_classes,        # derived from the data instead of hard-coded
        random_state=42
    ),
    "SVC": SVC(
        kernel='rbf',
        C=1.0,
        class_weight='balanced',
        probability=True  # if you need .predict_proba
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42
    ),
    "Extra Trees": ExtraTreesClassifier(
        n_estimators=100,
        random_state=42
    ),
    "LightGBM": lgb.LGBMClassifier(
        objective='multiclass',
        num_class=n_classes,
        random_state=42
    ),
    # The LogisticRegression models below omit the deprecated `multi_class` argument:
    # with solver='saga' a multinomial model is already fitted for multi-class targets.
    "Polynomial": Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('scaler', StandardScaler()),        # often useful after poly
        ('clf', LogisticRegression(
            solver='saga',
            max_iter=500
        ))
    ]),
    "Ridge": LogisticRegression(
        penalty='l2',
        C=1.0,
        solver='saga'
    ),
    "Lasso": LogisticRegression(
        penalty='l1',
        C=1.0,
        solver='saga'
    ),
    "ElasticNet": LogisticRegression(
        penalty='elasticnet',
        l1_ratio=0.5,
        C=1.0,
        solver='saga'
    )
}

# Results and feature importances storage
results = {}
feature_importances = {}

# Loop through each model
for name, model in models.items():
    full_pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
    ])

    # fit():
    #   1. each preprocessing transformer computes and stores its statistics
    #      (e.g. quartiles, medians, scaler ranges) from X_train only;
    #   2. the transformers are applied to X_train to produce the training features;
    #   3. the model is trained on those features and y_train.
    full_pipeline.fit(X_train, y_train)

    # predict(): the stored training statistics transform X_train / X_test
    # (no re-fitting) and the already-trained model outputs the predictions.
    y_train_pred = full_pipeline.predict(X_train)
    y_test_pred = full_pipeline.predict(X_test)

    # Accuracy scores
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    results[name] = {
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy
    }

    # Extract feature importances (only models exposing feature_importances_)
    fitted_model = full_pipeline.named_steps['model']

    if hasattr(fitted_model, 'feature_importances_'):
        # Get transformed feature names from preprocessor
        feature_names = full_pipeline.named_steps['preprocessing'].get_feature_names_out()
        importances = fitted_model.feature_importances_
        feature_importances[name] = sorted(
            zip(feature_names, importances),
            key=lambda x: x[1],
            reverse=True
        )

# Print results; the name column is sized to the longest model name so the table stays aligned
name_width = max([len("Model"), *(len(model_name) for model_name in results)]) + 2
print("\nModel Comparison:")
print(f"{'Model':<{name_width}} {'Train Acc':<15} {'Test Acc':<15}")
print("-" * (name_width + 31))
for model_name, scores in results.items():
    print(f"{model_name:<{name_width}} {scores['Train Accuracy']:<15.4f} {scores['Test Accuracy']:<15.4f}")

# Print top features
print("\nTop 10 Important Features:")
for model_name, importance_list in feature_importances.items():
    print(f"\n{model_name}:")
    top_features = importance_list[:10]
    # Widen the name column when a one-hot feature name is longer than 30 characters
    feature_width = max([30, *(len(str(feature)) for feature, _ in top_features)])
    for feature, importance in top_features:
        print(f"{feature:<{feature_width}} {importance:.4f}")


In [7]:
# Cell 2: Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Decision tree baseline: shared preprocessing followed by a depth-limited tree
dt_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42, max_depth=10)),
])
dt_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits
y_train_pred = dt_pipeline.predict(X_train)
y_test_pred = dt_pipeline.predict(X_test)
dt_train_acc = accuracy_score(y_train, y_train_pred)
dt_test_acc = accuracy_score(y_test, y_test_pred)
print("Decision Tree ▶️", "Train Acc:", dt_train_acc, "Test Acc:", dt_test_acc)

# Rank the transformed features by importance (highest first) and show the ten largest
feat_names = dt_pipeline['preprocessing'].get_feature_names_out()
importances = dt_pipeline['model'].feature_importances_
ranked = sorted(zip(feat_names, importances), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]
print("Top 10 features:")
for feat_name, score in top:
    print(f"  {feat_name:<30} {score:.4f}")


Decision Tree ▶️ Train Acc: 0.7667999438832772 Test Acc: 0.7483164983164983
Top 10 features:
  waterpoint_type_other          0.2024
  quantity_seasonal              0.1304
  quantity_enough                0.1123
  quantity_insufficient          0.0860
  longitude                      0.0614
  construction_year              0.0598
  latitude                       0.0412
  waterpoint_type_communal standpipe multiple 0.0378
  population                     0.0157
  gps_height                     0.0151


In [8]:
# Cell 3: Random Forest
from sklearn.ensemble import RandomForestClassifier

# Random forest: 100 trees, each limited to depth 15, on top of the shared preprocessing
rf_model = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42)
rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', rf_model),
])
rf_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits
y_train_pred = rf_pipeline.predict(X_train)
y_test_pred = rf_pipeline.predict(X_test)
rf_train_acc = accuracy_score(y_train, y_train_pred)
rf_test_acc = accuracy_score(y_test, y_test_pred)
print("Random Forest ▶️", "Train Acc:", rf_train_acc, "Test Acc:", rf_test_acc)

# Ten most important transformed features, highest first
feat_names = rf_pipeline['preprocessing'].get_feature_names_out()
importances = rf_pipeline['model'].feature_importances_
ranked = sorted(zip(feat_names, importances), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]
print("Top 10 features:")
for feat_name, score in top:
    print(f"  {feat_name:<30} {score:.4f}")


Random Forest ▶️ Train Acc: 0.8136574074074074 Test Acc: 0.7777777777777778
Top 10 features:
  quantity_enough                0.0774
  waterpoint_type_other          0.0748
  longitude                      0.0678
  latitude                       0.0634
  construction_year              0.0610
  extraction_type_other          0.0586
  gps_height                     0.0421
  population                     0.0306
  quantity_insufficient          0.0248
  waterpoint_type_communal standpipe 0.0199


In [9]:
# Cell 4: XGBoost
from xgboost import XGBClassifier

# XGBoost multi-class model; the class count comes from the fitted label encoder
xgb_params = dict(
    n_estimators=100,
    max_depth=11,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=len(le.classes_),
    eval_metric='mlogloss',
    random_state=42,
)
xgb_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', XGBClassifier(**xgb_params)),
])
xgb_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits
y_train_pred = xgb_pipeline.predict(X_train)
y_test_pred = xgb_pipeline.predict(X_test)
xgb_train_acc = accuracy_score(y_train, y_train_pred)
xgb_test_acc = accuracy_score(y_test, y_test_pred)
print("XGBoost ▶️", "Train Acc:", xgb_train_acc, "Test Acc:", xgb_test_acc)

# Ten most important transformed features, highest first
feat_names = xgb_pipeline['preprocessing'].get_feature_names_out()
importances = xgb_pipeline['model'].feature_importances_
ranked = sorted(zip(feat_names, importances), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]
print("Top 10 features:")
for feat_name, score in top:
    print(f"  {feat_name:<30} {score:.4f}")


XGBoost ▶️ Train Acc: 0.8602518237934904 Test Acc: 0.8055555555555556
Top 10 features:
  waterpoint_type_other          0.0654
  lga_Bariadi                    0.0275
  quantity_seasonal              0.0269
  region_Iringa                  0.0254
  extraction_type_other          0.0164
  region_code_11                 0.0155
  region_code_15                 0.0151
  lga_Ngara                      0.0147
  lga_Rombo                      0.0145
  lga_Chunya                     0.0122


In [None]:
# Cell 5: SVC
from sklearn.svm import SVC

# RBF-kernel SVC with balanced class weights; probability=True also enables predict_proba
svc_model = SVC(kernel='rbf', C=1.0, class_weight='balanced', probability=True)
svc_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', svc_model),
])
svc_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits (SVC exposes no feature importances)
y_train_pred = svc_pipeline.predict(X_train)
y_test_pred = svc_pipeline.predict(X_test)
svc_train_acc = accuracy_score(y_train, y_train_pred)
svc_test_acc = accuracy_score(y_test, y_test_pred)
print("SVC ▶️", "Train Acc:", svc_train_acc, "Test Acc:", svc_test_acc)


In [11]:
# Cell 6: Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 100 boosting stages with learning rate 0.1
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', gb_model),
])
gb_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits
y_train_pred = gb_pipeline.predict(X_train)
y_test_pred = gb_pipeline.predict(X_test)
gb_train_acc = accuracy_score(y_train, y_train_pred)
gb_test_acc = accuracy_score(y_test, y_test_pred)
print("Gradient Boosting ▶️", "Train Acc:", gb_train_acc, "Test Acc:", gb_test_acc)

# Ten most important transformed features, highest first
feat_names = gb_pipeline['preprocessing'].get_feature_names_out()
importances = gb_pipeline['model'].feature_importances_
ranked = sorted(zip(feat_names, importances), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]
print("Top 10 features:")
for feat_name, score in top:
    print(f"  {feat_name:<30} {score:.4f}")


Gradient Boosting ▶️ Train Acc: 0.7565761784511784 Test Acc: 0.7554713804713805
Top 10 features:
  waterpoint_type_other          0.1380
  quantity_insufficient          0.1350
  quantity_enough                0.1288
  extraction_type_other          0.0769
  construction_year              0.0765
  quantity_seasonal              0.0506
  waterpoint_type_communal standpipe multiple 0.0393
  longitude                      0.0320
  latitude                       0.0232
  payment_pay per bucket         0.0187


In [10]:
# Cell 7: Extra Trees
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble: 100 trees with no depth limit set here
et_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
et_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', et_model),
])
et_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits
y_train_pred = et_pipeline.predict(X_train)
y_test_pred = et_pipeline.predict(X_test)
et_train_acc = accuracy_score(y_train, y_train_pred)
et_test_acc = accuracy_score(y_test, y_test_pred)
print("Extra Trees ▶️", "Train Acc:", et_train_acc, "Test Acc:", et_test_acc)

# Ten most important transformed features, highest first
feat_names = et_pipeline['preprocessing'].get_feature_names_out()
importances = et_pipeline['model'].feature_importances_
ranked = sorted(zip(feat_names, importances), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]
print("Top 10 features:")
for feat_name, score in top:
    print(f"  {feat_name:<30} {score:.4f}")


Extra Trees ▶️ Train Acc: 0.995493125701459 Test Acc: 0.7882996632996633
Top 10 features:
  latitude                       0.1439
  longitude                      0.1436
  gps_height                     0.0687
  construction_year              0.0527
  quantity_enough                0.0498
  population                     0.0460
  waterpoint_type_other          0.0324
  quantity_insufficient          0.0283
  extraction_type_other          0.0280
  payment_pay per bucket         0.0140


In [10]:
# Cell 8: LightGBM
import lightgbm as lgb

# LightGBM multi-class model; the class count comes from the fitted label encoder
lgb_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(le.classes_),
    random_state=42,
)
lgb_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', lgb_model),
])
lgb_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits
y_train_pred = lgb_pipeline.predict(X_train)
y_test_pred = lgb_pipeline.predict(X_test)
lgb_train_acc = accuracy_score(y_train, y_train_pred)
lgb_test_acc = accuracy_score(y_test, y_test_pred)
print("LightGBM ▶️", "Train Acc:", lgb_train_acc, "Test Acc:", lgb_test_acc)

# Ten most important transformed features, highest first.
# NOTE(review): the recorded output shows whole-number values here (e.g. 915.0000), so these
# appear to be raw counts rather than fractions; not comparable with the sklearn models as-is.
feat_names = lgb_pipeline['preprocessing'].get_feature_names_out()
importances = lgb_pipeline['model'].feature_importances_
ranked = sorted(zip(feat_names, importances), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]
print("Top 10 features:")
for feat_name, score in top:
    print(f"  {feat_name:<30} {score:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1523
[LightGBM] [Info] Number of data points in the train set: 57024, number of used features: 237
[LightGBM] [Info] Start training from score -0.956483
[LightGBM] [Info] Start training from score -2.621811
[LightGBM] [Info] Start training from score -0.610486




LightGBM ▶️ Train Acc: 0.7993476430976431 Test Acc: 0.7891414141414141
Top 10 features:
  latitude                       915.0000
  longitude                      913.0000
  construction_year              632.0000
  gps_height                     591.0000
  population                     356.0000
  district_code                  203.0000
  quantity_enough                202.0000
  quantity_insufficient          143.0000
  payment_pay per bucket         125.0000
  quantity_seasonal              109.0000




In [None]:
# Cell 9: Polynomial + LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

# Degree-2 polynomial expansion -> standardisation -> multinomial logistic regression.
# `multi_class` is no longer passed: it is deprecated in recent scikit-learn releases, and
# with solver='saga' a multinomial model is already fitted for multi-class targets.
# NOTE(review): the LightGBM log in this notebook reports 57024 training rows and 237 used
# features; a dense degree-2 expansion of that has at least 28441 columns (roughly 13 GB as
# float64) — confirm this cell can finish on the available memory.
poly_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='saga', max_iter=500))
])

poly_pipeline.fit(X_train, y_train)
y_train_pred = poly_pipeline.predict(X_train)
y_test_pred  = poly_pipeline.predict(X_test)

print("Polynomial LR ▶️",
      "Train Acc:", accuracy_score(y_train, y_train_pred),
      "Test Acc:",  accuracy_score(y_test, y_test_pred))




In [10]:
# Cell 10: Ridge (L2 penalty)
# L2-regularised logistic regression on the shared preprocessing output.
# `multi_class` is no longer passed: it is deprecated in recent scikit-learn releases, and
# with solver='saga' a multinomial model is already fitted for multi-class targets.
ridge_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', LogisticRegression(penalty='l2', C=1.0, solver='saga'))
])

ridge_pipeline.fit(X_train, y_train)
y_train_pred = ridge_pipeline.predict(X_train)
y_test_pred  = ridge_pipeline.predict(X_test)

print("Ridge LR ▶️",
      "Train Acc:", accuracy_score(y_train, y_train_pred),
      "Test Acc:",  accuracy_score(y_test, y_test_pred))





Ridge LR ▶️ Train Acc: 0.7493160774410774 Test Acc: 0.7470538720538721


In [11]:
# Cell 11: Lasso (L1 penalty)
# L1-regularised logistic regression on the shared preprocessing output.
# `multi_class` is no longer passed: it is deprecated in recent scikit-learn releases, and
# with solver='saga' a multinomial model is already fitted for multi-class targets.
lasso_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', LogisticRegression(penalty='l1', C=1.0, solver='saga'))
])

lasso_pipeline.fit(X_train, y_train)
y_train_pred = lasso_pipeline.predict(X_train)
y_test_pred  = lasso_pipeline.predict(X_test)

print("Lasso LR ▶️",
      "Train Acc:", accuracy_score(y_train, y_train_pred),
      "Test Acc:",  accuracy_score(y_test, y_test_pred))




Lasso LR ▶️ Train Acc: 0.749175785634119 Test Acc: 0.7483164983164983


In [12]:
# Cell 12: Elastic Net
# Logistic regression with an equal mix of L1 and L2 penalties (l1_ratio=0.5).
# `multi_class` is no longer passed: it is deprecated in recent scikit-learn releases, and
# with solver='saga' a multinomial model is already fitted for multi-class targets.
enet_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', LogisticRegression(
        penalty='elasticnet', l1_ratio=0.5, C=1.0, solver='saga'
    ))
])

enet_pipeline.fit(X_train, y_train)
y_train_pred = enet_pipeline.predict(X_train)
y_test_pred  = enet_pipeline.predict(X_test)

print("Elastic Net LR ▶️",
      "Train Acc:", accuracy_score(y_train, y_train_pred),
      "Test Acc:",  accuracy_score(y_test, y_test_pred))




Elastic Net LR ▶️ Train Acc: 0.7493336139169473 Test Acc: 0.7470538720538721
