In [1]:
# Import modules for data manipulation, modeling, and hyperparameter searching.

from collections import Counter

import category_encoders as ce
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

In [2]:
# Walk the Kaggle input directory and print the full path of every file found.

import os
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [3]:
# Load the training set into `df` and preview its first five rows.

df = pd.read_csv(filepath_or_buffer="/kaggle/input/icr-identify-age-related-conditions/train.csv")
df.head(n=5)

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [4]:
# Drop the patient identification number, which is not used as a training feature.
# `df` is rebound to a new frame rather than mutated in place, and a missing
# 'Id' column is tolerated, so re-running this cell does not raise a KeyError.

df = df.drop(columns=['Id'], errors='ignore')
df.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [5]:
# Collect the object-dtype (categorical) columns and print the value frequencies of each.

categorical_columns = list(df.select_dtypes(include=['object']).columns)
def summarize_categorical(column_names):
    """Print a value-count summary of `df` for every column named in `column_names`."""
    divider = "--------------------------"
    for name in column_names:
        header = "Summary for {0}".format(name)
        print(header)
        frequencies = df[name].value_counts()
        print(frequencies)
        print(divider)
summarize_categorical(categorical_columns)

Summary for EJ
EJ
B    395
A    222
Name: count, dtype: int64
--------------------------


In [6]:
# Impute the missing values in every column of the training frame,
# then verify that nothing is left missing.

column_names = df.columns.tolist()
def fill_nan(column_names, frame=None):
    """Impute missing values in the given columns, modifying the frame in place.

    Numeric columns are filled with their mean. Non-numeric columns, for which
    a mean is undefined (``Series.mean`` raises on strings), are filled with
    their most frequent value instead.

    Parameters
    ----------
    column_names : iterable of str
        Names of the columns to inspect and, if needed, impute.
    frame : pandas.DataFrame, optional
        Frame to impute. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
    """
    target = df if frame is None else frame
    for column in column_names:
        series = target[column]
        if not series.isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(series):
            fill_value = series.mean()
        else:
            modes = series.mode(dropna=True)
            if modes.empty:
                # Every value is missing, so there is nothing to impute from.
                continue
            fill_value = modes.iloc[0]
        # Assign the filled column back explicitly. Calling
        # `frame[column].fillna(..., inplace=True)` operates on a temporary
        # object and does not update the frame under pandas Copy-on-Write.
        target[column] = series.fillna(fill_value)
fill_nan(column_names)
any_missing = df.isnull().values.any()
print("There are still missing values is", any_missing)

There are still missing values is False


In [7]:
# Split off the input values from the training set dataframe.
# The target is excluded by name rather than by position, so the split stays
# correct even if columns are later added to or reordered in `df`.

X = df.drop(columns=["Class"])
X.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,11.626917,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,14.852022,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,13.666727,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614


In [8]:
# Split off the target values from the training set dataframe.
# The "Class" column is selected by name rather than assumed to be the last column.

y = df["Class"]
y

0      1
1      0
2      0
3      0
4      1
      ..
612    0
613    0
614    0
615    0
616    0
Name: Class, Length: 617, dtype: int64

In [9]:
# Tally the negative (0) and positive (1) labels to see how imbalanced the target is.

counter = Counter()
counter.update(y)
print(counter)

Counter({0: 509, 1: 108})


In [10]:
# One-hot encode the categorical "EJ" column. The encoder is fitted on the
# training inputs and kept in `enc` so it can be applied to other frames.

enc = ce.OneHotEncoder(cols=["EJ"]).fit(X)
encoded_x = enc.transform(X)
encoded_x.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,11.626917,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,14.852022,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,13.666727,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614


In [11]:
# The competition is evaluated based on a weighted loss function to balance the
# importance of getting positives and negatives right.  Therefore, we use a classifier
# model trained with the custom scale_pos_weight.
# The weight is the negative-to-positive ratio computed from the labels themselves,
# so it stays correct if the training data changes.

negative_count = int((y == 0).sum())
positive_count = int((y == 1).sum())
scale_pos_weight = negative_count / positive_count
model = XGBClassifier(scale_pos_weight = scale_pos_weight, random_state = 0)

In [12]:
# Do a grid search of four hyperparameters using a stratified 5-fold cross validation.
# Candidates are ranked by class-balanced log loss on the held-out folds. Without an
# explicit `scoring`, GridSearchCV falls back to the classifier's accuracy, which
# ignores the predicted probabilities and is dominated by the majority class.

def balanced_log_loss_scorer(estimator, X, y):
    """Return the negative class-balanced log loss of `estimator` on (X, y).

    Each sample is weighted inversely to its class frequency in `y`, so every
    class contributes equally to the loss. The value is negated because
    GridSearchCV treats larger scores as better.
    """
    probabilities = estimator.predict_proba(X)
    class_balanced_weights = compute_sample_weight("balanced", y)
    return -log_loss(y, probabilities,
                     sample_weight = class_balanced_weights,
                     labels = estimator.classes_)

param_grid = {'colsample_bylevel': [.2, .4, .6, .8, 1.0],
              'colsample_bytree': [.2, .4, .6, .8, 1.0],
              'max_depth': [2, 4, 6, 8, 10],
              'n_estimators': [50, 100, 200, 400, 800]}
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
grid = GridSearchCV(estimator = model, param_grid = param_grid,
                    scoring = balanced_log_loss_scorer, cv = cv)
grid_result = grid.fit(encoded_x, y)

In [13]:
# Keep the best-scoring estimator found by the grid search.
# `grid.fit` returns the search object itself, so `grid` and `grid_result` are the same object.

the_best_model = grid.best_estimator_

In [14]:
# Display the selected model as the cell output.

the_best_model

In [15]:
# Load the competition's test set into `test_df` and preview its first two rows.

test_df = pd.read_csv(filepath_or_buffer="/kaggle/input/icr-identify-age-related-conditions/test.csv")
test_df.head(n=2)

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# True if any cell of the test data is missing, in which case imputation would be needed.

test_df.isna().to_numpy().any()

False

In [17]:
# Remove the patient id from a copy of the test data; `test_df` itself keeps the column.

new_test_df = test_df.drop(columns=['Id'])
new_test_df.head(n=2)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Apply the already-fitted encoder `enc` to the test inputs.

encoded_test_df = enc.transform(new_test_df)
encoded_test_df.head(n=2)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Make the predictions on the test data for submission to the competition.
# predict_proba is run once; its first and second columns are the class 0 and
# class 1 probabilities respectively.

test_probabilities = the_best_model.predict_proba(encoded_test_df)
class_0_pred = test_probabilities[:, 0]
class_1_pred = test_probabilities[:, 1]

In [20]:
# Put the predictions into a CSV for submission.
# The submission frame is built from the "Id" column alone, leaving `test_df`
# unmodified. Adding prediction columns to `test_df` would leak them into the
# feature frame if the earlier test-preparation cells were re-run.

submission_df = test_df[["Id"]].assign(class_0=class_0_pred, class_1=class_1_pred)
submission_df.to_csv("/kaggle/working/submission.csv", index=False)