<a href="https://colab.research.google.com/github/sayansen2003/sayansen2003/blob/main/feature%20engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split



In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
df = pd.read_csv(url, names=column_names)


In [3]:
print(df.head())


    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca thal  target  
0    3.0  0.0  6.0       0  
1    2.0  3.0  3.0       2  
2    2.0  2.0  7.0       1  
3    3.0  0.0  3.0       0  
4    1.0  0.0  3.0       0  


In [4]:
df.replace('?', np.nan, inplace=True)

In [5]:
df['ca'] = pd.to_numeric(df['ca'], errors='coerce')
df['thal'] = pd.to_numeric(df['thal'], errors='coerce')


In [6]:
print("Data types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())


Data types:
 age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
target        int64
dtype: object

Missing values:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64


In [8]:
categorical_cols = df.columns[df.nunique() <= 10].tolist()
categorical_cols = [col for col in categorical_cols if col != 'target']
numerical_cols = df.columns.difference(categorical_cols).tolist()


In [9]:
print("\nNumerical columns:", numerical_cols)
print("\nCategorical columns:", categorical_cols)



Numerical columns: ['age', 'chol', 'oldpeak', 'target', 'thalach', 'trestbps']

Categorical columns: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']


In [10]:
if not numerical_cols:
    raise ValueError("No numerical columns found")
if not categorical_cols:
    raise ValueError("No categorical columns found")


In [11]:
num_imputer = SimpleImputer(strategy='mean')
df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])


In [12]:
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])


In [15]:
#df = pd.DataFrame(df, columns=column_names + one_hot_encoder.get_feature_names_out(categorical_cols).tolist())
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_cats = one_hot_encoder.fit_transform(df[categorical_cols])
df = pd.DataFrame(df, columns=column_names + one_hot_encoder.get_feature_names_out(categorical_cols).tolist())




In [16]:
encoded_cats_df = pd.DataFrame(encoded_cats, columns=one_hot_encoder.get_feature_names_out(categorical_cols))
df = pd.concat([df.reset_index(drop=True), encoded_cats_df.reset_index(drop=True)], axis=1)
df.drop(columns=categorical_cols, inplace=True)


In [17]:
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [18]:
def handle_outliers(df, numerical_cols):
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[col] = np.where(df[col] > upper_bound, upper_bound,np.where(df[col] < lower_bound, lower_bound, df[col]))
    return df


In [19]:
df = handle_outliers(df, numerical_cols)


In [20]:
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 40, 50, 60, 70, 80], labels=False)


In [21]:
X = df.drop('target', axis=1)
y = df['target']


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
print("Preprocessing and feature engineering completed successfully!")

Preprocessing and feature engineering completed successfully!
