In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

fname = "diabetes.csv"

# Walk outward from the working directory: the current directory first, then
# each ancestor (nearest first). The search stops at the first directory tree
# that contains the file, so distant ancestors (up to the filesystem root) are
# only scanned recursively when nothing closer matched.
path = None
for root in [Path("."), *Path.cwd().parents]:
	# rglob yields matches lazily; next() takes the first hit without
	# enumerating the rest of the tree.
	path = next(root.rglob(fname), None)
	if path is not None:
		break

if path is None:
	# Fail loudly here: the cells below need `dataset`, and a printed message
	# would only defer the failure to a NameError further down.
	raise FileNotFoundError(
		f"File '{fname}' not found. Checked current and parent directories."
	)

print(f"Found file at: {path}")
dataset = pd.read_csv(path)
print(dataset.head(10))

Found file at: c:\Users\Noman Traders\Documents\GitHub\ml-daily-journal\datasets\diabetes.csv
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   
5            5      116             74              0        0  25.6   
6            3       78             50             32       88  31.0   
7           10      115              0              0        0  35.3   
8            2      197             70             45      543  30.5   
9            8      125             96              0        0   0.0   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.3

In [2]:
# Rich-display preview of the first three rows of the loaded frame.
dataset.head(n=3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [3]:
# Select the features by label rather than by position, so the result does
# not depend on "Outcome" being the last column of the CSV.
input_data = dataset.drop(columns="Outcome")
# Target: the "Outcome" column.
output_data = dataset["Outcome"]

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 25% of the rows for testing. The seed is fixed so the shuffle, and
# every number derived from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
	input_data, output_data, test_size=0.25, random_state=42)

In [7]:
# Sanity check: report the (rows, columns) of the train and test feature frames.
print(f"X_train: {X_train.shape} X_test: {X_test.shape}")

X_train: (576, 8) X_test: (192, 8)


In [8]:
# features and target
X = dataset.drop("Outcome", axis=1)
y = dataset["Outcome"]

# stratified split to preserve class balance
X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.2, random_state=42, stratify=y
)

# quick checks
print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("y_train distribution:\n", y_train.value_counts(normalize=True))
print("y_test distribution:\n", y_test.value_counts(normalize=True))

X_train: (614, 8) X_test: (154, 8)
y_train distribution:
 Outcome
0    0.651466
1    0.348534
Name: proportion, dtype: float64
y_test distribution:
 Outcome
0    0.649351
1    0.350649
Name: proportion, dtype: float64
