In [63]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import  MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score




scaler_minmax= MinMaxScaler()                                                             # Create a MinMaxScaler object
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore',drop='first')            # Create a OneHotEncoder object


# Read CSV files

df=pd.read_csv("Training_Set_Values.csv")       # Read the Training data CSV file
name_featrures=df.columns                       # Get the features name
len_features=len(name_featrures)                # Get the length of features
labels=pd.read_csv("Training_Set_Labels.csv")   # Read the labels (target) CSV file
labels.head()
df['target'] = labels['status_group']           # Add the target column to the dataframe
#print(df.shape)                                 # Print the shape of the dataframe
#df.head()
#df.info()


In [None]:
# Columns to be dropped for baseline model
columns_drop=['id','amount_tsh','num_private','subvillage','recorded_by','scheme_name',
              'extraction_type_group','extraction_type_class',
              'management','payment_type','quality_group','quantity_group','source','waterpoint_type_group',
              'funder','installer','wpt_name','ward','scheme_management']

df = df.drop(columns=columns_drop)
print('Shape after dropping columns:', df.shape)

Shape after dropping columns: (59400, 22)


In [65]:
# Convert the target column to labels 
#print(df['target'].unique())
target_map_dict={'functional': 2, 'functional needs repair': 1, 'non functional': 0} # Defined the mapping of labels to numbers (integers)
#print(df['target'].head())
df['target'] =df['target'].map(target_map_dict) # transform the target column (labels) to  numbers (integers)
#df['target'].head()

In [None]:
# Separate features and target and perform train test split
X = df.drop(columns=['target'])  # Features only
y = df['target']                 # Target column
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.04, random_state=42, stratify=y)  # 2376 records for the test

In [67]:
# Import the custom transformers form helper_function.py
# The helper_function.py file contains the definitions for StringConverter, YearExtractor, IQRCapper, and ConstructionYearTransformer
from helper_function import (
    StringConverter,
    YearExtractor,
    IQRCapper,
    ConstructionYearTransformer,
    ObjectToNumericConverter
)
    
#pipeline transformers
date_recorded_transformer_pipeline=Pipeline([
    
    ('year_extractor',YearExtractor()),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore',drop='first'))
])


oulier_minmax_pipeline_clip = Pipeline(steps=[
    ('iqr_cap', IQRCapper(strategy='clip')),
    ('scaler', MinMaxScaler())
])

oulier_minmax_pipeline_mean = Pipeline(steps=[
    ('iqr_cap', IQRCapper(strategy='mean')),
    ('scaler', MinMaxScaler())
])

oulier_minmax_pipeline_median = Pipeline(steps=[
    ('iqr_cap', IQRCapper(strategy='median')),
    ('scaler', MinMaxScaler())
])


cat_pipeline = Pipeline([
     ('string_converter', StringConverter()),
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore',drop='first'))
])

constructionyear_pipeline = Pipeline(steps=[
    ('replace_zeros_with_median', ConstructionYearTransformer()),
    ('minmax_scaling', MinMaxScaler())
])
# ColumnTransformer and full pipeline setup for feature preprocessing
# The ColumnTransformer allows us to apply different preprocessing steps to different columns of the DataFrame
preprocessor = ColumnTransformer(
    transformers=[
        ('date', date_recorded_transformer_pipeline, ['date_recorded']),
        #('gps_height', scaler_minmax, ['gps_height']),
        ('outlier_minmax_gps_height', oulier_minmax_pipeline_mean, ['gps_height']),
        ('outlier_minmax_longitude', oulier_minmax_pipeline_mean, ['longitude']),
        ('outlier_minmax_latitude', oulier_minmax_pipeline_mean, ['latitude']),
         ('cat_ohe', cat_pipeline, ['basin','region','region_code','lga','public_meeting','permit','extraction_type','management_group','payment','water_quality','quantity','source_type','source_class','waterpoint_type']),
        ('outlier_minmax_population', oulier_minmax_pipeline_clip, ['population']),
        ('constructionyear', constructionyear_pipeline, ['construction_year'])


    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)



preprocess_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('object_to_numeric', ObjectToNumericConverter())  # your custom step
])


In [68]:
# Define your models
models = {    
    "Decision Tree": DecisionTreeClassifier(
        max_depth=10,  # You can tune this
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=15,  # You can tune this too
        random_state=42
    ),"XGBoost": XGBClassifier(
        n_estimators=100,
        max_depth=11,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric='mlogloss',    # good for multi-class
        objective='multi:softmax', # directly outputs class labels
        num_class=3,               # number of target classes
        random_state=42
    )
}

# Store results
results = {}

# Loop through each model
for name, model in models.items():
    full_pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
                            ])


    full_pipeline.fit(X_train, y_train) 
    """
    # Step 1: Each transformer in the preprocessing pipeline Computes and stores necessary statistics 
    # (e.g., quartiles, medians, scalers) from X_train only.
    #Step 2: The transformers are applied (transformed) to X_train to produce the final preprocessed training features.
    # Step 3: The model is trained using these transformed features and y_train.
    """ 


    # Predictions
    y_train_pred = full_pipeline.predict(X_train)
    y_test_pred = full_pipeline.predict(X_test)


    """
    The stored training statistics are used to transform X_train again and X_test (no re-fitting!).

    Feeds this transformed data to the already-trained model.

    Outputs predictions (y_train_pred, y_test_pred).

    """

    # Accuracy scores
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    results[name] = {
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy
    }

# Print results
print("\nModel Comparison:")
print("{:<15} {:<15} {:<15}".format("Model", "Train Acc", "Test Acc"))
print("-" * 45)
for model_name, scores in results.items():
    print("{:<15} {:<15.4f} {:<15.4f}".format(model_name, scores["Train Accuracy"], scores["Test Accuracy"]))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Model Comparison:
Model           Train Acc       Test Acc       
---------------------------------------------
Decision Tree   0.7668          0.7483         
Random Forest   0.8137          0.7778         
XGBoost         0.8598          0.8085         


In [69]:
Training Accuracy: 0.8499
 Test Accuracy:     0.8095

SyntaxError: invalid syntax (2862310629.py, line 1)

In [None]:
Training Accuracy: 0.8598
 Test Accuracy:     0.8085

In [None]:
# Get feature importances
importances = model.feature_importances_
feature_names = X.columns

# Create a DataFrame
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Display  important features
print(" Important Features:")
print(importance_df.head())

# Optional: Plot
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'][:10][::-1], importance_df['Importance'][:10][::-1])
plt.xlabel("Feature Importance")
plt.title("Feature importance in Random Forest")
plt.tight_layout()
plt.show()




ValueError: All arrays must be of the same length

In [None]:
importance_df.sort_values(by='Importance').head(No_of_col_to_drop)

importances = model.feature_importances_
feature_names = X.columns
len(importances),len(importance_df['Importance']),len(importance_df['Feature'])

In [None]:
importance_df.sort_values(by='Importance', ascending=False)[['Feature', 'Importance']]
importance_df['Feature'].head(20).tolist()
