## Load Dataset

In [13]:
import os

import numpy as np
import pandas as pd

In [14]:
# Location of the diabetes CSV. The original absolute path is kept as the
# default so existing runs behave the same, but it can be overridden without
# editing the notebook by setting the DIABETES_CSV_PATH environment variable
# (the default only exists on the original author's machine).
csv_path = os.environ.get(
    "DIABETES_CSV_PATH",
    "/Users/saifmohammed/Desktop/CSE299/ML/Test/Copy Dataset/Diabetes_Final_Data_V2.csv",
)

# Read the whole file into memory; every later cell works on this frame.
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,gender,pulse_rate,systolic_bp,diastolic_bp,glucose,height,weight,bmi,family_diabetes,hypertensive,family_hypertension,cardiovascular_disease,stroke,diabetic
0,42,Female,66,110,73,5.88,1.65,70.2,25.75,0,0,0,0,0,No
1,35,Female,60,125,68,5.71,1.47,42.5,19.58,0,0,0,0,0,No
2,62,Female,57,127,74,6.85,1.52,47.0,20.24,0,0,0,0,0,No
3,73,Male,55,193,112,6.28,1.63,57.4,21.72,0,0,0,0,0,No
4,68,Female,71,150,81,5.71,1.42,36.0,17.79,0,0,0,0,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5432,74,Male,83,164,89,6.47,1.60,64.0,24.99,0,1,0,1,0,No
5433,75,Male,67,141,104,8.31,1.65,62.0,22.75,0,0,0,0,0,Yes
5434,40,Female,67,134,114,7.61,1.50,69.0,30.72,0,1,0,0,1,No
5435,36,Female,62,139,80,4.90,1.52,41.5,17.87,0,0,0,0,0,No


## Handling Categorical Data

In [15]:
# Mark the two label-valued columns ('diabetic' and 'gender') as pandas
# categoricals in a single astype call; all other columns keep their dtype.
df = df.astype({"diabetic": "category", "gender": "category"})

In [16]:
from sklearn.preprocessing import OrdinalEncoder

# Columns whose string labels are replaced by numeric codes.
categorical_columns = ["diabetic", "gender"]

# Fit ONE encoder on both columns together. Re-fitting the same encoder on a
# second column would overwrite what it learned from the first, so the
# 'diabetic' mapping could no longer be inspected or inverted afterwards.
# After this call ordinal_encoder.categories_ holds one array of labels per
# column, in the order given by categorical_columns.
ordinal_encoder = OrdinalEncoder()
encoded_values = ordinal_encoder.fit_transform(df[categorical_columns])

# Wrap the raw array in a DataFrame that reuses df's own index, so the
# assignments below line up row-for-row regardless of how df is indexed.
df_cat_encoded = pd.DataFrame(
    encoded_values, columns=categorical_columns, index=df.index
)

# Replace the original label columns with their float codes.
df["diabetic"] = df_cat_encoded["diabetic"]
df["gender"] = df_cat_encoded["gender"]

In [17]:
# Display the frame after encoding: 'gender' and 'diabetic' now hold the
# numeric codes assigned in the previous cell instead of string labels.
df

Unnamed: 0,age,gender,pulse_rate,systolic_bp,diastolic_bp,glucose,height,weight,bmi,family_diabetes,hypertensive,family_hypertension,cardiovascular_disease,stroke,diabetic
0,42,0.0,66,110,73,5.88,1.65,70.2,25.75,0,0,0,0,0,0.0
1,35,0.0,60,125,68,5.71,1.47,42.5,19.58,0,0,0,0,0,0.0
2,62,0.0,57,127,74,6.85,1.52,47.0,20.24,0,0,0,0,0,0.0
3,73,1.0,55,193,112,6.28,1.63,57.4,21.72,0,0,0,0,0,0.0
4,68,0.0,71,150,81,5.71,1.42,36.0,17.79,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5432,74,1.0,83,164,89,6.47,1.60,64.0,24.99,0,1,0,1,0,0.0
5433,75,1.0,67,141,104,8.31,1.65,62.0,22.75,0,0,0,0,0,1.0
5434,40,0.0,67,134,114,7.61,1.50,69.0,30.72,0,1,0,0,1,0.0
5435,36,0.0,62,139,80,4.90,1.52,41.5,17.87,0,0,0,0,0,0.0


## Splitting Dataset

In [18]:
from sklearn.model_selection import train_test_split

# Purely random hold-out: 20% of the rows go to the test set, and the fixed
# random_state makes the same rows land there on every run.
random_split = train_test_split(df, test_size=0.2, random_state=42)
train_set, test_set = random_split

# Report how many rows ended up on each side of the split.
print(f"Training set size: {len(train_set)}")
print(f"Test set size: {len(test_set)}")

Training set size: 4349
Test set size: 1088


In [19]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified shuffle split based on the 'diabetic' column, so the train and
# test sets keep the same class proportions as the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1 produces exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(df, df["diabetic"]))

# split() returns integer row POSITIONS, not index labels, so rows must be
# selected with .iloc. (.loc only picks the same rows while df happens to
# have a default 0..n-1 index.)
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

print(f"Stratified Training set size: {len(strat_train_set)}")
print(f"Stratified Test set size: {len(strat_test_set)}")

Stratified Training set size: 4349
Stratified Test set size: 1088


In [20]:
from zlib import crc32

# Create an identifier for each row (you can also use other stable features).
# The row index is copied into a new 'id' column, which is hashed below to
# decide the train/test assignment.
# NOTE(review): 'id' becomes a regular column of df, so it is also carried
# into train_set / test_set - presumably it should be dropped before
# training; confirm.
df["id"] = df.index

# Function to decide if an instance should go to the test set
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

# Build a boolean mask over the rows: True where the hashed 'id' falls in
# the 20% test portion, False otherwise.
in_test_set = df["id"].apply(test_set_check, args=(0.2,))

# Rows flagged by the mask form the test set; the remaining rows are the
# training set.
test_set = df.loc[in_test_set]
train_set = df.loc[~in_test_set]

print(f"Train set size: {len(train_set)}")
print(f"Test set size: {len(test_set)}")

Train set size: 4348
Test set size: 1089


In [23]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import pandas as pd
import numpy as np

# Synthetic illustration of sampling bias: compare how well a purely random
# split and a stratified split preserve an imbalanced class ratio.
# Everything here lives in demo-specific variables so that the real dataset
# (`df`) and its stratified split (`strat_train_set` / `strat_test_set`)
# from the earlier cells are NOT overwritten by the synthetic data.

# Create a sample dataframe
np.random.seed(42)  # fixed seed so the demo prints the same numbers each run
size = 1000  # dataset size
diabetic_data = np.random.choice([0, 1], size=size, p=[0.9, 0.1])  # 10% diabetic, 90% non-diabetic

demo_df = pd.DataFrame({
    'diabetic': diabetic_data,
    'age': np.random.randint(20, 80, size=size),
    'bmi': np.random.uniform(18, 35, size=size)
})

# Purely random sampling
train_set_random, test_set_random = train_test_split(demo_df, test_size=0.2, random_state=42)

# Stratified sampling (using the 'diabetic' column to maintain class proportions)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 yields a single (train, test) pair of positional index arrays,
# so rows are selected with .iloc rather than label-based .loc.
demo_train_index, demo_test_index = next(split.split(demo_df, demo_df["diabetic"]))
demo_strat_train_set = demo_df.iloc[demo_train_index]
demo_strat_test_set = demo_df.iloc[demo_test_index]

# Compare distributions (diabetic proportions) between population, random test set, and stratified test set
population_distribution = demo_df["diabetic"].value_counts(normalize=True)
random_distribution = test_set_random["diabetic"].value_counts(normalize=True)
stratified_distribution = demo_strat_test_set["diabetic"].value_counts(normalize=True)

# Display the distributions
print("Population Distribution:")
print(population_distribution)

print("\nRandom Sampling Test Set Distribution:")
print(random_distribution)

print("\nStratified Sampling Test Set Distribution:")
print(stratified_distribution)


Population Distribution:
diabetic
0    0.9
1    0.1
Name: proportion, dtype: float64

Random Sampling Test Set Distribution:
diabetic
0    0.925
1    0.075
Name: proportion, dtype: float64

Stratified Sampling Test Set Distribution:
diabetic
0    0.9
1    0.1
Name: proportion, dtype: float64
