<a href="https://www.kaggle.com/code/quangnhatbui/age-conditions?scriptVersionId=132299113" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [2]:
df_train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
df_greek = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")

In [3]:
df_train.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [4]:
df_greek.head()

Unnamed: 0,Id,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,B,C,G,D,3/19/2019
1,007255e47698,A,C,M,B,Unknown
2,013f2bd269f5,A,C,M,B,Unknown
3,043ac50845d5,A,C,M,B,Unknown
4,044fb8a146ec,D,B,F,B,3/25/2020


In [5]:
print(df_train.shape)
print(df_greek.shape)

(617, 58)
(617, 6)


# Data Preprocessing

First we will deal with NaNs values

In [6]:
df_train.isnull().sum()

Id        0
AB        0
AF        0
AH        0
AM        0
AR        0
AX        0
AY        0
AZ        0
BC        0
BD        0
BN        0
BP        0
BQ       60
BR        0
BZ        0
CB        2
CC        3
CD        0
CF        0
CH        0
CL        0
CR        0
CS        0
CU        0
CW        0
DA        0
DE        0
DF        0
DH        0
DI        0
DL        0
DN        0
DU        1
DV        0
DY        0
EB        0
EE        0
EG        0
EH        0
EJ        0
EL       60
EP        0
EU        0
FC        1
FD        0
FE        0
FI        0
FL        1
FR        0
FS        2
GB        0
GE        0
GF        0
GH        0
GI        0
GL        1
Class     0
dtype: int64

Since the dataset have 617 instances, drop more than 60 instaces would be too much, hence we will opt for the option of replacing them with means since all of the missing values are in numerical columns

In [7]:
df_train = df_train.fillna(df_train.mean(numeric_only=True))

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 58 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Id      617 non-null    object 
 1   AB      617 non-null    float64
 2   AF      617 non-null    float64
 3   AH      617 non-null    float64
 4   AM      617 non-null    float64
 5   AR      617 non-null    float64
 6   AX      617 non-null    float64
 7   AY      617 non-null    float64
 8   AZ      617 non-null    float64
 9   BC      617 non-null    float64
 10  BD      617 non-null    float64
 11  BN      617 non-null    float64
 12  BP      617 non-null    float64
 13  BQ      617 non-null    float64
 14  BR      617 non-null    float64
 15  BZ      617 non-null    float64
 16  CB      617 non-null    float64
 17  CC      617 non-null    float64
 18  CD      617 non-null    float64
 19  CF      617 non-null    float64
 20  CH      617 non-null    float64
 21  CL      617 non-null    float64
 22  CR

There are 1 categorical column, let's investigate

In [9]:
np.unique(df_train["EJ"].values)

array(['A', 'B'], dtype=object)

We will use one hot encoding since these are nominal data. A common practice of one-hot encoding is to remove 1 column to reduce multicollinearity

In [10]:
ej_df = pd.get_dummies(df_train["EJ"])
if len(ej_df.columns) > 1:
    ej_df = ej_df.iloc[:,1:]

In [11]:
X_train = df_train.drop("EJ", axis=1)
X_train.reset_index(drop=True, inplace=True)
X_train = pd.concat([X_train, ej_df], ignore_index=True, axis=1)

In [12]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,...,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0,0
2,013f2bd269f5,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,...,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0,1
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,...,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0,1
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,...,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,fd3dafe738fd,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,...,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0,0
613,fd895603f071,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,...,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0,1
614,fd8ef6377f76,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,...,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0,0
615,fe1942975e40,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,...,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0,1


In [13]:
y_train = X_train[56]
X_train.drop(56, axis=1, inplace=True)

In [14]:
X_train = X_train.iloc[:, 1:]

In [15]:
pca = PCA(n_components=None)
X_train_trans = pca.fit_transform(X_train.iloc[:, 1:])
pca.explained_variance_

array([3.74734006e+08, 1.28517353e+08, 5.72759445e+07, 1.16549328e+07,
       5.18623693e+06, 3.10744145e+06, 1.80004928e+06, 1.57308690e+05,
       9.79483569e+04, 3.34418842e+04, 2.83241563e+04, 2.36079536e+04,
       1.45937949e+04, 1.35787724e+04, 7.96511566e+03, 6.29997691e+03,
       4.14717887e+03, 3.33425454e+03, 2.50828938e+03, 2.29662943e+03,
       1.74147532e+03, 1.19698659e+03, 1.01102415e+03, 6.64421899e+02,
       5.52401256e+02, 3.40184993e+02, 2.84671560e+02, 1.95305722e+02,
       1.18537117e+02, 1.05415631e+02, 8.76484361e+01, 7.73514039e+01,
       6.16590773e+01, 4.58297672e+01, 4.30108473e+01, 2.35334172e+01,
       2.08983847e+01, 1.40947934e+01, 1.08980608e+01, 8.55375520e+00,
       6.27258275e+00, 4.04250949e+00, 3.07677319e+00, 2.22568841e+00,
       1.59911672e+00, 1.47410143e+00, 2.01886040e-01, 1.21629337e-01,
       9.02072837e-02, 7.56652906e-02, 5.88119682e-02, 3.70131717e-02,
       8.78774308e-03, 6.76257853e-03, 6.20047888e-05])

In [16]:
explained_var = pca.explained_variance_ratio_

In [17]:
explained_ratio = list(explained_var*100 / sum(explained_var))
explained_ratio_int = []
for i in range(len(explained_ratio)):
    explained_ratio_int.append(int(explained_ratio[i]))

In [18]:
explained_ratio_int

[64,
 22,
 9,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [19]:
pipe_lr = make_pipeline(StandardScaler(), PCA(n_components=3), LogisticRegression(random_state=1, solver='lbfgs'))

In [20]:
pipe_lr.fit(X_train, y_train.values.flatten())

In [21]:
df_test = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")

In [22]:
df_test

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,040e15f562a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,046e85c7cc7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
ej_df = pd.get_dummies(df_test["EJ"])

In [24]:
if len(ej_df.columns) > 1:
    ej_df = ej_df.iloc[:, 1:]

In [25]:
X_test = df_test.drop("EJ", axis=1)

In [26]:
X_test.reset_index(drop=True, inplace=True)

In [27]:
X_test = pd.concat([X_test, ej_df], ignore_index=True, axis=1)

In [28]:
X_test = X_test.iloc[:, 1:]

In [29]:
X_test

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,47,48,49,50,51,52,53,54,55,56
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [30]:
output = pipe_lr.predict_proba(X_test)

In [31]:
df_test["class_0"] = output[:, 0]

In [32]:
df_test["class_1"] = output[:, 1]

In [33]:
output = df_test[["Id", "class_0", "class_1"]]

In [34]:
output.to_csv('submission.csv', index=False)

In [35]:
output

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.956029,0.043971
1,010ebe33f668,0.956029,0.043971
2,02fa521e1838,0.956029,0.043971
3,040e15f562a2,0.956029,0.043971
4,046e85c7cc7f,0.956029,0.043971
