# Feature Engineering
- **Age Category**
- **BMI Category**: Underweight, Normal Weight, Overweight, Obesity
- **Pollution Risk**: binary flag set when the Air Pollution Level is above 150
- **Smoking Status Encoding**
- **Interaction Features**
- **Location Encoding**

In [48]:
# import the libraries needed
import os

import numpy as np
import pandas as pd

In [49]:
# Load the data.
# The CSV location can be overridden with the COPD_DATA_PATH environment
# variable so the notebook can run on other machines; when the variable is
# unset, the original local path is used.
path = os.environ.get(
    'COPD_DATA_PATH',
    r'/Users/roshanthapa/Desktop/Omdena-Capacity_Building/copd/COPD_Prediction/Data/synthetic_COPD_data.csv',
)
df = pd.read_csv(path)
df.head()

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0


In [50]:
# Age categories: decade-wide bands from 30 to 79.
# Ages outside that range fall outside the bins and become NaN.
df['Age_Category'] = pd.cut(df['Age'], bins=[29, 39, 49, 59, 69, 79], labels=['30-39', '40-49', '50-59', '60-69', '70-79'])

# BMI categories using the standard WHO cut-offs:
#   < 18.5 underweight, 18.5 to < 25 normal, 25 to < 30 overweight, >= 30 obesity.
# right=False makes every bin [low, high), so a BMI of exactly 18.5 is
# "Normal Weight" and exactly 30 is "Obesity". The open upper bound keeps
# BMI values above 35 from turning into NaN.
df['BMI_Categories'] = pd.cut(
    df['BMI'],
    bins=[0, 18.5, 25, 30, np.inf],
    right=False,
    labels=['Underweight', 'Normal Weight', 'Overweight', 'Obesity'],
)

In [51]:
# Preview the first rows to check the new Age_Category and BMI_Categories columns
df.head()

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_Categories
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0,30-39,Overweight
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0,60-69,Obesity
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0,30-39,Overweight
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1,30-39,Overweight
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0,50-59,Overweight


In [52]:
# Pollution risk score: 1 when Air_Pollution_Level is above 150, otherwise 0
is_high_pollution = df['Air_Pollution_Level'] > 150
df['Pollution_Risk_Score'] = np.where(is_high_pollution, 1, 0)

# Smoking status as a number: Never -> 0, Former -> 0.5, Current -> 1
smoking_codes = {'Never': 0, 'Former': 0.5, 'Current': 1}
df['Smoking_Status_Encoding'] = df['Smoking_Status'].map(smoking_codes)

# Gender as a binary indicator: Female -> 0, Male -> 1
gender_codes = {'Female': 0, 'Male': 1}
df['Gender_Encoding'] = df['Gender'].map(gender_codes)

# Interaction feature: encoded smoking status multiplied by the raw pollution level.
# The column name is kept exactly as spelled because it ends up in the saved CSV.
df['Smmoking_Pollution_interaction'] = df['Smoking_Status_Encoding'].mul(df['Air_Pollution_Level'])

In [53]:
# Preview the first rows to check the encoded and interaction columns
df.head()

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_Categories,Pollution_Risk_Score,Smoking_Status_Encoding,Gender_Encoding,Smmoking_Pollution_interaction
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0,30-39,Overweight,0,0.5,1,42.0
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0,60-69,Obesity,0,0.0,1,0.0
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0,30-39,Overweight,0,0.5,1,61.5
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1,30-39,Overweight,1,1.0,0,253.0
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0,50-59,Overweight,0,0.0,1,0.0


In [54]:
# One-hot encode Location. drop_first=True omits the first category's
# indicator column, so that category is represented by all-zero dummies.
location_dummies = pd.get_dummies(df['Location'], prefix='Location', drop_first=True)
df = pd.concat([df.drop(columns=['Location']), location_dummies], axis=1)

# Machine Learning Data

In [55]:
# Remove the non-numeric columns: Gender and Smoking_Status are replaced by
# their encoded versions, and the Age/BMI category labels are not kept in the
# saved feature table.
columns_to_drop = ['Gender', 'Smoking_Status', 'Age_Category', 'BMI_Categories']
df = df.drop(columns=columns_to_drop)

In [56]:
# Inspect column dtypes and non-null counts of the final feature table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               1000 non-null   int64  
 1   Biomass_Fuel_Exposure             1000 non-null   int64  
 2   Occupational_Exposure             1000 non-null   int64  
 3   Family_History_COPD               1000 non-null   int64  
 4   BMI                               1000 non-null   float64
 5   Air_Pollution_Level               1000 non-null   int64  
 6   Respiratory_Infections_Childhood  1000 non-null   int64  
 7   COPD_Diagnosis                    1000 non-null   int64  
 8   Pollution_Risk_Score              1000 non-null   int64  
 9   Smoking_Status_Encoding           1000 non-null   float64
 10  Gender_Encoding                   1000 non-null   int64  
 11  Smmoking_Pollution_interaction    1000 non-null   float64
 12  Locatio

In [57]:
# Write the engineered feature table to CSV in the working directory,
# leaving out the row index
output_file = 'engineered_COPD_DATA.csv'
df.to_csv(output_file, index=False)