In [45]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.preprocessing import LabelEncoder

In [46]:
# Load the competition training table, discarding its first column by position
# (presumably the row identifier — confirm against the CSV header).
TRAIN_CSV = "/kaggle/input/icr-identify-age-related-conditions/train.csv"
df = pd.read_csv(TRAIN_CSV).iloc[:, 1:]

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [47]:
# Find every column whose dtype is the generic "object" dtype.
is_object_dtype = df.dtypes == "object"
object_columns = df.columns[is_object_dtype]

# Replace each of those columns with integer codes.
# NOTE(review): the same LabelEncoder instance is re-fitted on every column, so
# after the loop `encoder` only remembers the mapping of the last column
# processed.
encoder = LabelEncoder()
for col in object_columns:
    df[col] = encoder.fit(df[col]).transform(df[col])

In [48]:
# Mean-impute the two columns that are handled explicitly ('EL' and 'BQ');
# each mean is computed from that column's own non-missing values.
column_means = {name: df[name].mean() for name in ('EL', 'BQ')}
df = df.fillna(column_means)

# Rows that still contain a missing value in any other column are discarded.
df = df.dropna()

In [49]:
# Split data into features and target.
# The target is removed by name (instead of slicing off the last column by
# position) so the feature matrix can never contain "Class", regardless of
# where that column sits in df. Column order of the remaining features is
# preserved.
X_train = df.drop(columns="Class")
Y_train = df["Class"]

In [50]:
# Hyper-parameter candidates explored by the grid search, one entry per
# LogisticRegression argument.
parameter_grid = {}
parameter_grid['penalty'] = ['l1', 'l2']            # regularisation type
parameter_grid['C'] = [0.1, 1, 10]                  # inverse regularisation strength
parameter_grid['solver'] = ['liblinear', 'saga']    # optimisers tried for each penalty
parameter_grid['class_weight'] = [None, 'balanced'] # with / without class re-weighting

In [51]:
# Initialize the logistic regression model that the grid search will clone.
# A fixed random_state is set because the solvers tried in the grid
# ('liblinear' and 'saga') shuffle the data; without a seed the fitted
# coefficients, and therefore the selected parameters, can differ between runs.
# NOTE(review): max_iter is left at the library default — confirm the solvers
# converge on these unscaled features.
regressor = LogisticRegression(random_state=42)

In [52]:
# Exhaustively evaluate every combination in parameter_grid with 5-fold
# cross-validation, ranking candidates by recall.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=parameter_grid,
    scoring='recall',
    cv=5,
)

# Fit on the full training data; the fitted search object is the cell output.
grid_search.fit(X_train, Y_train)

In [53]:
# Recall on the training data (the metric reported here is recall, not
# accuracy). Predictions come from the fitted grid search object.
Y_predict = grid_search.predict(X_train)
training_recall = recall_score(y_true=Y_train, y_pred=Y_predict)
print("Training Recall Score:", training_recall)

Training Recall Score: 0.9439252336448598


In [54]:
# Read the test dataset
test_df = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")

# Perform label encoding for object columns in the test dataset.
# NOTE(review): `encoder` is the single LabelEncoder instance that the training
# cell re-fits once per object column, so it only holds the mapping of the last
# column it was fitted on — confirm there is exactly one object column.
for column in object_columns:
    test_df[column] = encoder.transform(test_df[column])

# Every column after the first is a model feature (the first column is 'Id',
# which is only needed for the submission file).
feature_columns = test_df.columns[1:]

# Fill missing values in the test dataset with the training feature means.
# Using the test set's own means (and only for 'EL'/'BQ') would make the
# imputation depend on the size and content of the test batch, and would leave
# NaNs behind whenever another column has a gap or a column is entirely
# missing, which makes predict_proba raise.
test_df[feature_columns] = test_df[feature_columns].fillna(X_train.mean())

# Predict probabilities for the test dataset
y_predict = grid_search.predict_proba(test_df[feature_columns])

In [55]:
# Create the submission dataframe: one row per test Id with the predicted
# probability of each class. The probability columns are taken directly as
# column slices of the y_predict array instead of being rebuilt element by
# element in Python.
submission = pd.DataFrame(
    {
        "Id": test_df['Id'],
        "class_0": y_predict[:, 0],
        "class_1": y_predict[:, 1],
    }
)

# Write the file without the dataframe index.
submission.to_csv("submission.csv", index=False)