In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('asthma_disease_data.csv')

In [11]:
df.shape

(2392, 29)

In [12]:
df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,HighAllergenExposure
0,-1.731327,0.96574,-0.98671,0.334986,-1.455673,-1.582769,-0.406355,-1.432099,0.160113,0.971063,...,-1.368934,0.920608,-1.214986,-1.000836,0.993333,-1.006711,-1.230954,0.808131,-0.233824,0.058905
1,-1.729879,-0.747054,1.013469,1.349273,0.771363,-0.6233,-0.406355,0.291269,0.453069,-1.076746,...,-0.407132,-1.564256,0.823055,-1.000836,-1.006711,0.993333,0.812378,0.808131,-0.233824,0.924776
2,-1.72843,0.687989,-0.98671,1.349273,-0.342155,-1.229074,-0.406355,0.58133,1.434458,-0.102976,...,-0.987146,0.983019,0.823055,0.999164,0.993333,-1.006711,0.812378,0.808131,-0.233824,-1.551231
3,-1.726982,-0.09897,1.013469,1.349273,-0.342155,1.565307,-0.406355,-1.256398,0.276233,-1.59688,...,0.561114,-1.105641,0.823055,-1.000836,0.993333,0.993333,0.812378,-1.237424,-0.233824,0.047956
4,-1.725534,0.873156,-0.98671,-0.679301,1.88488,-1.105686,-0.406355,-0.154081,-0.651625,1.504976,...,1.070095,-0.516586,0.823055,0.999164,0.993333,-1.006711,-1.230954,0.808131,-0.233824,-0.012831


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PatientID               2392 non-null   float64
 1   Age                     2392 non-null   float64
 2   Gender                  2392 non-null   float64
 3   Ethnicity               2392 non-null   float64
 4   EducationLevel          2392 non-null   float64
 5   BMI                     2392 non-null   float64
 6   Smoking                 2392 non-null   float64
 7   PhysicalActivity        2392 non-null   float64
 8   DietQuality             2392 non-null   float64
 9   SleepQuality            2392 non-null   float64
 10  PollutionExposure       2392 non-null   float64
 11  PollenExposure          2392 non-null   float64
 12  DustExposure            2392 non-null   float64
 13  PetAllergy              2392 non-null   float64
 14  FamilyHistoryAsthma     2392 non-null   

In [3]:
# Checking for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
PatientID                 0
Age                       0
Gender                    0
Ethnicity                 0
EducationLevel            0
BMI                       0
Smoking                   0
PhysicalActivity          0
DietQuality               0
SleepQuality              0
PollutionExposure         0
PollenExposure            0
DustExposure              0
PetAllergy                0
FamilyHistoryAsthma       0
HistoryOfAllergies        0
Eczema                    0
HayFever                  0
GastroesophagealReflux    0
LungFunctionFEV1          0
LungFunctionFVC           0
Wheezing                  0
ShortnessOfBreath         0
ChestTightness            0
Coughing                  0
NighttimeSymptoms         0
ExerciseInduced           0
Diagnosis                 0
DoctorInCharge            0
dtype: int64


In [4]:
# Fill missing values where applicable
# For simplicity, we'll fill numerical columns with the median and categorical columns with the mode
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

In [5]:
for column in numerical_features:
    df[column].fillna(df[column].median(), inplace=True)

for column in categorical_features:
    df[column].fillna(df[column].mode()[0], inplace=True)

In [6]:
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [7]:
# Encode categorical features using one-hot encoding
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [8]:
# Example: Creating a feature indicating high exposure to allergens
allergen_exposure_features = ['PollenExposure', 'DustExposure', 'PetAllergy']
df['HighAllergenExposure'] = df[allergen_exposure_features].sum(axis=1)

In [9]:
# Splitting the data into training and testing sets
X = df.drop(columns=['PatientID', 'Diagnosis'])  # 'Diagnosis' is the target variable
y = df['Diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Save the processed data
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)