In [1]:
# import the libraries needed
import pandas as pd
import numpy as np

In [2]:
# Read the raw COPD table from disk into the working DataFrame `df`,
# then preview the first five rows as the cell output.
path = "../Data/COPD.csv"
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,Diseases & Risk Factors,Year,Country Name,Gender,Numeric,Lower Confidence Limit,Upper Confidence Limit
0,Noncommunicable diseases,2000,Nepal,Males,19.28516,12.01887,28.61572
1,Noncommunicable diseases,2000,Nepal,Females,19.23258,11.27446,28.83658
2,Noncommunicable diseases,2001,Nepal,Males,18.83932,11.70583,28.04834
3,Noncommunicable diseases,2001,Nepal,Females,18.76374,11.07713,28.0886
4,Noncommunicable diseases,2002,Nepal,Males,17.34758,10.63618,26.09991


In [3]:
# One-hot encode the three categorical columns into a separate frame;
# `df` itself is not modified by get_dummies.
# drop_first=True omits the first level of each encoded column.
# NOTE(review): `df_encoded` is assigned again in the later
# 'Gender_Disease' cell — confirm whether this result is meant to be kept.
df_encoded = pd.get_dummies(
    data=df,
    columns=['Gender', 'Diseases & Risk Factors', 'Country Name'],
    drop_first=True,
)

In [4]:
# New features
# Confidence interval range: width of the interval (upper - lower).
# The limits are coerced to numeric first; values that cannot be parsed
# become NaN (errors='coerce'), which propagates into Confidence_Range.
df['Upper Confidence Limit'] = pd.to_numeric(df['Upper Confidence Limit'], errors='coerce')
df['Lower Confidence Limit'] = pd.to_numeric(df['Lower Confidence Limit'], errors='coerce')
df['Confidence_Range'] = df['Upper Confidence Limit'] - df['Lower Confidence Limit']

# Year bins: label each year with its decade.
# right=False makes every bin left-closed, [start, start + 10), so a decade's
# first year belongs to that decade (2000 -> '2000s'). With pd.cut's default
# right-closed bins, 2000 fell into (1990, 2000] and was labelled '1990s'.
# A trailing '2020s' bin keeps year 2020 labelled instead of turning it into NaN.
df['Year_Bin'] = pd.cut(
    df['Year'],
    bins=[1980, 1990, 2000, 2010, 2020, 2030],
    labels=['1980s', '1990s', '2000s', '2010s', '2020s'],
    right=False,
)

In [5]:
# Log-transform 'Numeric' to compress skewed values.
# np.log1p(x) computes log(1 + x): zeros map to 0 rather than -inf, and it is
# more accurate than np.log(x + 1) when x is close to 0.
# The transform is applied to every row (there is no Numeric > 0 filter);
# a value of -1 yields -inf and values below -1 yield NaN.
df['Numeric_log'] = np.log1p(df['Numeric'])


In [6]:
# Interaction between 'Gender' and 'Diseases & Risk Factors':
# join the two labels into a single categorical value per row.
df['Gender_Disease'] = df['Gender'] + "_" + df['Diseases & Risk Factors']

# Rebuild df_encoded from the current df. This assignment replaces any earlier
# df_encoded, so the base categorical columns are encoded here together with
# the interaction column; otherwise their one-hot encodings would be discarded.
# drop_first=True omits the first level of each encoded column.
df_encoded = pd.get_dummies(
    df,
    columns=['Gender', 'Diseases & Risk Factors', 'Country Name', 'Gender_Disease'],
    drop_first=True,
)


In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize the 'Numeric' column: fit the scaler on the column, then store
# the transformed values. The double brackets select a one-column DataFrame,
# i.e. the 2-D input that the scaler is given.
scaler = StandardScaler()
scaler.fit(df[['Numeric']])
df['Numeric_scaled'] = scaler.transform(df[['Numeric']])

In [8]:
# Remove the raw columns now that Confidence_Range and Year_Bin have been
# derived from them. The drop mutates `df` in place.
# NOTE(review): re-running this cell without reloading `df` raises KeyError,
# because the columns are already gone.
columns_to_drop = ['Lower Confidence Limit', 'Upper Confidence Limit', 'Year']
df.drop(labels=columns_to_drop, axis='columns', inplace=True)

In [9]:
# Write the feature-engineered `df` to a UTF-8 CSV in the working directory.
# index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf="COPD_feature_engineered.csv",
    index=False,
    encoding='utf-8',
)