# Feature Engineering and Selection

In [10]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Toy dataset: three numeric columns, each with at least one missing entry
# (None is stored as NaN once pandas builds the float columns).
data = {
    'Feature1': [1.0, 2.0, None, 4.0, 5.0],
    'Feature2': [2.0, None, 4.0, 5.0, None],
    'Feature3': [None, 3.0, 3.5, 4.0, 4.5],
}
df = pd.DataFrame(data)

# Mean imputation: every gap is replaced by the mean of the observed
# values in the same column.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df)

# fit_transform hands back a bare array, so re-attach the column labels.
df_imputed = pd.DataFrame(imputed_values, columns=df.columns)
print("After imputation:\n", df_imputed)

After imputation:
    Feature1  Feature2  Feature3
0       1.0  2.000000      3.75
1       2.0  3.666667      3.00
2       3.0  4.000000      3.50
3       4.0  5.000000      4.00
4       5.0  3.666667      4.50


Encoding categorical variables

In [11]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Sample data: a single nominal column with three distinct categories.
data = {
    'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red'],
}
df = pd.DataFrame(data)

# One-hot encoding: each category becomes its own 0/1 indicator column.
# sparse_output=False asks the encoder for a dense array instead of a
# sparse matrix, so it can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
encoded_categories = encoder.fit_transform(df[['Color']])

# Carry df's index over to the encoded frame: pd.concat(axis=1) aligns rows
# by index label, so a fresh RangeIndex here would only line up when df
# itself happens to use the default index (otherwise rows fill with NaN).
df_encoded = pd.DataFrame(
    encoded_categories,
    columns=encoder.get_feature_names_out(['Color']),
    index=df.index,
)

# Append the indicator columns, then discard the original text column.
df = pd.concat([df, df_encoded], axis=1).drop(columns='Color')
print("After One-Hot encoding:\n", df)

After One-Hot encoding:
    Color_Blue  Color_Green  Color_Red
0         0.0          0.0        1.0
1         1.0          0.0        0.0
2         0.0          1.0        0.0
3         1.0          0.0        0.0
4         0.0          0.0        1.0


Feature Scaling

In [12]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Sample data: two numeric columns whose magnitudes differ by a factor of ten.
data = {
    'Feature1': [10, 20, 30, 40, 50],
    'Feature2': [100, 200, 300, 400, 500],
}
df = pd.DataFrame(data)

# Min-max scaling: each column is rescaled independently so that its
# smallest value maps to 0 and its largest to 1.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)

# The scaler returns a plain array; restore the original column labels.
df_scaled = pd.DataFrame(scaled_values, columns=df.columns)
print("After Min-Max Scaling:\n", df_scaled)

After Min-Max Scaling:
    Feature1  Feature2
0      0.00      0.00
1      0.25      0.25
2      0.50      0.50
3      0.75      0.75
4      1.00      1.00


Feature Creation

In [15]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
# Sample data: two numeric input features.
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [2, 3, 4, 5, 6],
}
df = pd.DataFrame(data)

# Feature creation: expand the inputs into all polynomial terms up to
# degree 2 (the originals, their squares, and the pairwise product).
# include_bias=False leaves out the constant all-ones column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df)

# Take the input names from df.columns rather than repeating them by hand,
# so the generated labels stay correct if the columns in `data` change.
df_poly = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(df.columns),
)
print("After creating polynomial features:\n", df_poly)

After creating polynomial features:
    Feature1  Feature2  Feature1^2  Feature1 Feature2  Feature2^2
0       1.0       2.0         1.0                2.0         4.0
1       2.0       3.0         4.0                6.0         9.0
2       3.0       4.0         9.0               12.0        16.0
3       4.0       5.0        16.0               20.0        25.0
4       5.0       6.0        25.0               30.0        36.0
