In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv
/kaggle/input/widsdatathon2024-challenge1/training.csv
/kaggle/input/widsdatathon2024-challenge1/test.csv


In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [13]:
train = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/training.csv")
test = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/test.csv")
sample_submission = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv")

In [14]:
train.shape

(12906, 83)

In [15]:
train.head()

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
0,475714,,MEDICAID,CA,924,84,F,,C50919,Malignant neoplasm of unsp site of unspecified...,...,12.871429,22.542857,10.1,27.814286,11.2,3.5,52.23721,8.650555,18.606528,1
1,349367,White,COMMERCIAL,CA,928,62,F,28.49,C50411,Malig neoplm of upper-outer quadrant of right ...,...,8.957576,10.109091,8.057576,30.606061,7.018182,4.10303,42.301121,8.487175,20.113179,1
2,138632,White,COMMERCIAL,TX,760,43,F,38.09,C50112,Malignant neoplasm of central portion of left ...,...,11.253333,9.663333,3.356667,31.394915,15.066667,7.446667,40.108207,7.642753,14.839351,1
3,617843,White,COMMERCIAL,CA,926,45,F,,C50212,Malig neoplasm of upper-inner quadrant of left...,...,8.845238,8.688095,5.280952,27.561905,4.404762,4.809524,42.070075,7.229393,15.894123,0
4,817482,,COMMERCIAL,ID,836,55,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,15.276,11.224,1.946,26.170213,12.088,13.106,41.356058,4.110749,11.722197,0


In [16]:
test.head()

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02
0,573710,White,MEDICAID,IN,467,54,F,,C50412,Malig neoplasm of upper-outer quadrant of left...,...,3.564516,13.996774,7.985484,0.969355,24.955357,10.83871,8.080645,38.724876,7.947165,11.157161
1,593679,,COMMERCIAL,FL,337,52,F,,C50912,Malignant neoplasm of unspecified site of left...,...,10.261538,16.020513,13.602564,2.836842,23.952632,10.579487,9.302564,36.918257,7.838973,13.599985
2,184532,Hispanic,MEDICAID,CA,917,61,F,,C50911,Malignant neoplasm of unsp site of right femal...,...,47.726087,9.895652,10.515217,12.745652,32.530435,7.263043,3.81087,47.310325,9.595719,20.084231
3,447383,Hispanic,MEDICARE ADVANTAGE,CA,917,64,F,,C50912,Malignant neoplasm of unspecified site of left...,...,47.726087,9.895652,10.515217,12.745652,32.530435,7.263043,3.81087,47.310325,9.595719,20.084231
4,687972,Black,,CA,900,40,F,23.0,C50412,Malig neoplasm of upper-outer quadrant of left...,...,45.526154,11.901538,20.76,14.7375,30.709375,10.341538,3.030769,41.186992,11.166898,21.644261


In [26]:
# Separate features and target from the training dataset
X_train = train.drop(['DiagPeriodL90D'], axis=1)
y_train = train['DiagPeriodL90D']

# Impute missing values for categorical columns with the most frequent value
categorical_cols = X_train.select_dtypes(include=['object']).columns
imputer = SimpleImputer(strategy="most_frequent")
X_train[categorical_cols] = imputer.fit_transform(X_train[categorical_cols])

# Impute missing values for numerical columns with the mean
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train[numerical_cols] = mean_imputer.fit_transform(X_train[numerical_cols])

# Encode categorical variables using one-hot encoding
X_train = pd.get_dummies(X_train, drop_first=True)

In [27]:
desc = pd.DataFrame(index = list(train))
desc['type'] = train.dtypes
desc['null'] = train.isnull().sum()
desc.sort_values(by=['type','null']).style.background_gradient(axis=0)

Unnamed: 0,type,null
patient_id,float64,0
patient_zip3,float64,0
patient_age,float64,0
bmi,float64,0
population,float64,0
density,float64,0
age_median,float64,0
age_under_10,float64,0
age_10_to_19,float64,0
age_20s,float64,0


In [28]:
# Apply the same imputation to the test dataset
test[categorical_cols] = imputer.transform(test[categorical_cols])
test[numerical_cols] = mean_imputer.transform(test[numerical_cols])

# Encode categorical variables for the test dataset
# Note: Ensure to apply the same encoding as the training dataset
test = pd.get_dummies(test, drop_first=True)

# Align the test dataset columns with the training dataset
# Add missing dummy variables to the test set
for col in X_train.columns:
    if col not in test.columns:
        test[col] = 0

# Ensure the order of columns in the test set matches that of the training set
test = test.reindex(columns=X_train.columns, fill_value=0)

In [29]:
desc = pd.DataFrame(index = list(train))
desc['type'] = train.dtypes
desc['null'] = train.isnull().sum()
desc.sort_values(by=['type','null']).style.background_gradient(axis=0)

Unnamed: 0,type,null
patient_id,float64,0
patient_zip3,float64,0
patient_age,float64,0
bmi,float64,0
population,float64,0
density,float64,0
age_median,float64,0
age_under_10,float64,0
age_10_to_19,float64,0
age_20s,float64,0


In [30]:
# Split the training data for validation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_split, y_train_split)

# Predict on the validation set
predictions = model.predict(X_val)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, predictions))
print("ROC-AUC Score:", roc_auc_score(y_val, predictions))

Accuracy: 0.7780790085205267
ROC-AUC Score: 0.7382073884664558


In [33]:
# Generate predictions for the test set
test_predictions = model.predict_proba(test)[:, 1]

# Create submission DataFrame
submission_df = pd.DataFrame({'patient_id': test.index, 'Probability': test_predictions})

print(test_predictions)

[0.9  0.7  0.7  ... 0.87 0.03 0.8 ]
