Mock Challenge 1: Data Science (Classification)
Problem

You are working for a health analytics company. You’ve been given anonymized patient records and need to predict whether a patient has diabetes (binary classification).

Train dataset: train.csv

Test dataset: test.csv

Both contain features like age, bmi, blood_pressure, glucose_level.
The target column in train is diabetes (0 = no, 1 = yes).

You must submit a CSV with two columns: id, prediction.

In [3]:
import pandas as pd

# Load the patient records; adjust the path to wherever your copy of the CSV lives.
data = pd.read_csv(filepath_or_buffer="diabetes_prediction_dataset.csv")

In [4]:
print(data.head())

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [5]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB
None


In [6]:
print(data['diabetes'].value_counts())

diabetes
0    91500
1     8500
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [8]:
# Pull out the label vector, then keep every other column as the feature matrix.
y = data['diabetes']
X = data.drop('diabetes', axis=1)

In [9]:
# The two string-valued columns are handled as categoricals; every remaining
# feature is treated as numeric (an 'id' column, if one exists, is excluded).
cat_cols = ['gender', 'smoking_history']
num_cols = [name for name in X.columns if name not in (*cat_cols, 'id')]

In [10]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [11]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [12]:
# Route each column group through its own preprocessing branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [15]:
# End-to-end model: preprocessing and classifier live in a single Pipeline, so
# the transformers are fitted only on the data passed to fit() and then reused
# unchanged at predict time.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # n_jobs=-1 builds the 200 trees on all available cores instead of one;
    # with random_state fixed, the trees themselves do not change.
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

In [16]:
model.fit(X_train, y_train)

In [17]:
val_preds = model.predict(X_val)

In [18]:
print("Validation Accuracy:", accuracy_score(y_val, val_preds))

Validation Accuracy: 0.9705


In [19]:
print(classification_report(y_val, val_preds))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18300
           1       0.95      0.69      0.80      1700

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000



In [20]:
# No separate test file is loaded in this notebook, so the validation features
# stand in for it; each row gets a sequential placeholder id (0, 1, 2, ...).
test = X_val.assign(id=range(len(X_val)))

In [21]:
preds = model.predict(test.drop(columns=['id']))

In [22]:
# Assemble the two-column submission (id, prediction), write it to disk
# without the DataFrame index, and show the first rows as a sanity check.
submission = test[['id']].assign(prediction=preds)
submission.to_csv("submission.csv", index=False)
print(submission.head())

       id  prediction
69456   0           0
86614   1           0
61660   2           0
8558    3           0
4619    4           0
