In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory. The per-file print is disabled, so this
# traversal currently produces no output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        continue

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import warnings

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Silence UserWarning messages for the rest of the run.
warnings.filterwarnings('ignore', category=UserWarning)

# Load the competition's training and test tables.
df_train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
df_test = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")

# Encode the categorical 'EJ' column numerically ('A' -> 0, 'B' -> 1) in both
# tables, using one shared mapping so train and test are encoded identically.
ej_codes = {'A': 0, 'B': 1}
df_train['EJ'] = df_train['EJ'].map(ej_codes)
df_test['EJ'] = df_test['EJ'].map(ej_codes)

# Separate the target from the predictors. 'Id' is dropped from both feature
# tables so that only the measurement columns reach the models.
y = df_train["Class"]
X = df_train.drop(columns=["Class", "Id"])
X_test = df_test.drop(columns="Id")

# Names of the training feature columns that contain at least one missing value.
columns_with_missing = [
    column for column, has_nan in X.isna().any().items() if has_nan
]

# Fill missing values with per-column means and then standardise. Both
# transformers are fitted on the training features only and re-applied,
# unchanged, to the test features.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)

X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

# Class-imbalance handling, hyperparameter tuning and ensemble fitting.
#
# Oversampling has to happen inside each cross-validation split. Resampling the
# whole training set first places duplicated minority rows in both the fitting
# and the validation part of a fold, which makes the log-loss scores used to
# pick hyperparameters look better than they are. Wrapping the sampler and the
# classifier in an imblearn pipeline makes GridSearchCV resample only the rows
# each candidate is fitted on; validation rows are scored untouched.
SEED = 42
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)


def tune_classifier(estimator, param_grid, features, target):
    """Grid-search ``estimator`` with oversampling applied per training fold.

    Parameters
    ----------
    estimator : classifier
        Unfitted classifier to tune; it is cloned by the search, not modified.
    param_grid : dict
        Grid keyed by the classifier's own parameter names. The ``clf__``
        prefix required by the pipeline is added here. An empty dict evaluates
        the classifier with its current settings only.
    features, target : array-like
        Training data that has NOT been resampled.

    Returns
    -------
    tuple
        ``(best_classifier, search)``: the classifier step of the best
        pipeline, refitted on the oversampled training data, and the fitted
        ``GridSearchCV`` object (its ``best_params_`` keys carry the
        ``clf__`` prefix).
    """
    pipeline = ImbPipeline(steps=[
        ('oversample', RandomOverSampler(random_state=SEED)),
        ('clf', estimator),
    ])
    prefixed_grid = {f'clf__{name}': values for name, values in param_grid.items()}
    search = GridSearchCV(pipeline, prefixed_grid, cv=cv_splitter, scoring='neg_log_loss')
    search.fit(features, target)
    return search.best_estimator_.named_steps['clf'], search


# Hyperparameter tuning for Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=SEED)
param_grid_rfc = {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]}
best_rfc, grid_search_rfc = tune_classifier(rfc, param_grid_rfc, X_scaled, y)

# Hyperparameter tuning for Gradient Boosting Classifier
gbc = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
param_grid_gbc = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 1.0]}
best_gbc, grid_search_gbc = tune_classifier(gbc, param_grid_gbc, X_scaled, y)

# Hyperparameter tuning for SVM (probability=True is needed for soft voting)
svm = SVC(probability=True, random_state=SEED)
param_grid_svm = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001]}
best_svm, grid_search_svm = tune_classifier(svm, param_grid_svm, X_scaled, y)

# Naive Bayes has nothing to tune here; the empty grid just cross-validates it
nb = GaussianNB()
param_grid_nb = {}
best_nb, grid_search_nb = tune_classifier(nb, param_grid_nb, X_scaled, y)

# Hyperparameter tuning for XGBoost Classifier
xgb = XGBClassifier(random_state=SEED, use_label_encoder = False, eval_metric='error')
param_grid_xgb = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 1.0]}
best_xgb, grid_search_xgb = tune_classifier(xgb, param_grid_xgb, X_scaled, y)

# Oversample once for the final fit. No validation happens after this point,
# so resampling the full training set is safe here. The result is stored under
# new names so X_scaled / y keep referring to the original training rows.
oversampler = RandomOverSampler(random_state=SEED)
X_resampled, y_resampled = oversampler.fit_resample(X_scaled, y)

# Soft-voting ensemble of the tuned models; VotingClassifier clones each
# estimator and refits it on the data passed to fit().
ensemble_model = VotingClassifier(
    estimators=[
        ('rfc', best_rfc),
        ('gbc', best_gbc),
        ('svm', best_svm),
        ('nb', best_nb),
        ('xgb', best_xgb),
    ],
    voting='soft',
)
ensemble_model.fit(X_resampled, y_resampled)

# Per-class probabilities for every test row from the fitted ensemble.
ensemble_pred_proba = ensemble_model.predict_proba(X_test_scaled)

# Build the submission table: the test ids next to the first and second
# probability columns, written out as 'class_0' and 'class_1'.
predictions_df = df_test[['Id']].assign(
    class_0=ensemble_pred_proba[:, 0],
    class_1=ensemble_pred_proba[:, 1],
)

# Write the submission file without the DataFrame index.
predictions_df.to_csv('submission.csv', index=False)