## Importing all Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, confusion_matrix, accuracy_score, log_loss
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

## Read in Training and Testing Data

In [2]:
from sklearn.model_selection import train_test_split
file_path_train = "/kaggle/input/icr-identify-age-related-conditions/train.csv"
file_path_test = "/kaggle/input/icr-identify-age-related-conditions/test.csv"

train = pd.read_csv(file_path_train)
final_test = pd.read_csv(file_path_test)

train_full, test = train_test_split(train, random_state=42, stratify=train["Class"])
train, valid = train_test_split(train_full, random_state=42, stratify=train_full["Class"])

X_train, y_train = train.drop(["Class"], axis=1), train["Class"].copy()
X_test, y_test = test.drop(["Class"], axis=1), test["Class"].copy()
X_valid, y_valid = valid.drop(["Class"], axis=1), valid["Class"].copy()

## Preprocessing

In [3]:
class input_outlier(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.feature_names_in = X.columns
        self.upper_limit = X.quantile(0.99)
        self.lower_limit = X.quantile(0.01)
        return self
    def transform(self, X, y=None):
        fixed_X = np.where(X>self.upper_limit, self.upper_limit, np.where(X<self.lower_limit, self.lower_limit, X))
        return fixed_X
    def get_feature_names_out(self, names=None):
        return self.feature_names_in

In [4]:
log_transformer = FunctionTransformer(np.log, feature_names_out="one-to-one")

num_pipeline = make_pipeline(
    input_outlier(),
    SimpleImputer(strategy="median"),
    log_transformer,
    StandardScaler()
)

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder()
)

cleaning = ColumnTransformer([
    ("num", num_pipeline, make_column_selector(dtype_include=np.number)),
    ("cat", cat_pipeline, ["EJ"]),
])

preprocessing = Pipeline([
    ("clean", cleaning),
    ("poly_feat", PolynomialFeatures(degree=3))
])

## Building Model using Voting classifier

In [5]:
X_train_clean = preprocessing.fit_transform(X_train)
log_reg_ada = LogisticRegression(penalty="l2", max_iter=100, class_weight={0:1, 1:8}, tol=0.1, C=7)

ada_log_reg = AdaBoostClassifier(log_reg_ada)

voting_clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(penalty="l2", max_iter=100, class_weight={0:1, 1:8}, tol=0.1, C=7)),
        ("ada", ada_log_reg),
        ("gbrt", GradientBoostingClassifier(n_estimators=100)),
        ("hgbrt", HistGradientBoostingClassifier(max_iter=90))
    ]
, voting="soft")

In [6]:
cross_validation_score = cross_val_score(voting_clf, X_train_clean, y_train, cv=3, n_jobs=-1, scoring="accuracy")
clean_vote_pred = cross_val_predict(voting_clf, X_train_clean, y_train, cv=3, n_jobs=-1)
precision_vote, recall_vote, threshold_vote = precision_recall_curve(y_train, clean_vote_pred)
clean_vote_proba = cross_val_predict(voting_clf, X_train_clean, y_train, method="predict_proba", n_jobs=-1)
vote_log_loss = log_loss(y_train, clean_vote_proba)

print(f"Model Log Loss:{vote_log_loss}")
print("-----------------------")

print(f"Training Cross Validation :{cross_validation_score.mean()}")
print("-----------------------")
print(f"Training Precision :{precision_score(y_train, clean_vote_pred)}\nTraining Recall: {recall_score(y_train, clean_vote_pred)}")
print("-----------------------")
print()
print("Training Confusion Matrix")
pd.DataFrame(confusion_matrix(y_train, clean_vote_pred), columns=["Pred 0", "Pred 1"], index=["True 0", "True 1"])

Model Log Loss:0.2320475392562561
-----------------------
Training Cross Validation :0.9306346826586706
-----------------------
Training Precision :0.8723404255319149
Training Recall: 0.6721311475409836
-----------------------

Training Confusion Matrix


Unnamed: 0,Pred 0,Pred 1
True 0,279,6
True 1,20,41


In [7]:
voting_clf.fit(X_train_clean, y_train)
print()




In [8]:
X_valid_clean = preprocessing.transform(X_valid)
y_valid_pred_vote = voting_clf.predict(X_valid_clean)
y_valid_pred_proba_vote = voting_clf.predict_proba(X_valid_clean)
print(f"Log Loss Validation set: {log_loss(y_valid, y_valid_pred_proba_vote)}")
print(f"Accuracy on Validation set: {accuracy_score(y_valid, y_valid_pred_vote)}")
print()
print("Validation Confusion Matrix")
pd.DataFrame(confusion_matrix(y_valid, y_valid_pred_vote), columns=["Pred 0", "Pred 1"], index=["True 0", "True 1"])

Log Loss Validation set: 0.21112380647668297
Accuracy on Validation set: 0.9568965517241379

Validation Confusion Matrix


Unnamed: 0,Pred 0,Pred 1
True 0,96,0
True 1,5,15


In [9]:
X_test_clean = preprocessing.transform(X_test)
y_test_pred_vote = voting_clf.predict(X_test_clean)
y_test_pred_proba_vote = voting_clf.predict_proba(X_test_clean)
print(f"Log Loss Test set: {log_loss(y_test, y_test_pred_proba_vote)}")
print(f"Accuracy on Test set: {accuracy_score(y_test, y_test_pred_vote)}")
print()
print("Test Confusion Matrix")
pd.DataFrame(confusion_matrix(y_test, y_test_pred_vote), columns=["Pred 0", "Pred 1"], index=["True 0", "True 1"])

Log Loss Test set: 0.23703712084384757
Accuracy on Test set: 0.9225806451612903

Test Confusion Matrix


Unnamed: 0,Pred 0,Pred 1
True 0,124,4
True 1,8,19


In [10]:
test_clean = preprocessing.transform(final_test)
test_pred = voting_clf.predict_proba(test_clean)
test_pred_df = pd.DataFrame(np.c_[final_test["Id"], test_pred], columns=["Id", "class_0", "class_1"])
test_pred_df.to_csv("submission.csv", index=False)