In [1]:
## Data loading:
import dask.dataframe as dd
csv_path = '../healthcare-dataset-stroke-data.csv'

# dd.read_csv is lazy: it only reads a sample from the start of the file to infer
# dtypes, and a dtype mismatch in a later partition is raised when the frame is
# computed, not here. Wrapping this call in try/except therefore cannot catch that
# problem, and anything that does fail eagerly (e.g. a missing file) would fail
# again on a retry. Both safeguards are applied up front instead:
#  - assume_missing=True reads integer columns without an explicit dtype as floats,
#    so missing values in later partitions cannot break the inferred dtype;
#  - 'age' is pinned to float64 explicitly.
data = dd.read_csv(csv_path, assume_missing=True, dtype={'age': 'float64'})

# Experimenting with the Number of Layers
In this experiment, we explore how varying the number of layers in a simple feedforward neural network affects its performance. The surrounding cells load and clean the stroke-prediction dataset (`healthcare-dataset-stroke-data.csv`); the same procedure also applies to standard benchmarks such as MNIST or CIFAR-10.

## Methodology
1. **Dataset Selection**: Choose a dataset for training and evaluation — this notebook loads the stroke-prediction CSV, but a benchmark such as MNIST or CIFAR-10 would work as well.
2. **Model Architecture**: Define a feedforward neural network with a variable number of layers.
3. **Training**: Train the model on the selected dataset.
4. **Evaluation**: Evaluate the model's performance using appropriate metrics (e.g., accuracy, loss).
5. **Analysis**: Analyze how the number of layers impacts performance.


In [3]:
# Show the first five rows of the loaded frame as a quick sanity check
data.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046.0,Male,67.0,0.0,1.0,Yes,Private,Urban,228.69,36.6,formerly smoked,1.0
1,51676.0,Female,61.0,0.0,0.0,Yes,Self-employed,Rural,202.21,,never smoked,1.0
2,31112.0,Male,80.0,0.0,1.0,Yes,Private,Rural,105.92,32.5,never smoked,1.0
3,60182.0,Female,49.0,0.0,0.0,Yes,Private,Urban,171.23,34.4,smokes,1.0
4,1665.0,Female,79.0,1.0,0.0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1.0


In [5]:
# The data must be reduced to numerical values only, but several columns are
# categorical with more than two levels. They are one-hot encoded with dask's
# get_dummies, which needs category dtype with known categories -- hence the
# categorize() call first.
# NOTE: this cell rebinds `data` from the lazy dask frame to the cleaned pandas
# frame, so re-run the loading cell before re-running this one.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

data = (
    dd.get_dummies(
        data.categorize(columns=categorical_columns),
        columns=categorical_columns,
        drop_first=True,  # drop the first level of each feature; it is implied by the others
    )
    .dropna()   # remove rows with any missing value (e.g. a missing bmi)
    .compute()  # materialise once, at the end, as a pandas DataFrame
)

# Preview the cleaned data
data.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046.0,67.0,0.0,1.0,228.69,36.6,1.0,True,False,True,False,True,False,False,True,True,False,False
2,31112.0,80.0,0.0,1.0,105.92,32.5,1.0,True,False,True,False,True,False,False,False,False,True,False
3,60182.0,49.0,0.0,0.0,171.23,34.4,1.0,False,False,True,False,True,False,False,True,False,False,True
4,1665.0,79.0,1.0,0.0,174.12,24.0,1.0,False,False,True,False,False,True,False,False,False,True,False
5,56669.0,81.0,0.0,0.0,186.21,29.0,1.0,True,False,True,False,True,False,False,True,True,False,False
