# Feature Engineering

In [1]:
# Sample DataFrame
import pandas as pd
import numpy as np

data = {
    'Age': [25, 30, np.nan, 35, 40, np.nan, 45],
    'Salary': [50000, np.nan, 70000, np.nan, 90000, 120000, 110000],
    'Gender': ['Male', 'Female', np.nan, 'Female', 'Male', 'Male', np.nan],
    'Experience': [1, 3, 5, np.nan, 10, 15, np.nan]
}

df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:")
print(df)

Original DataFrame with Missing Values:
    Age    Salary  Gender  Experience
0  25.0   50000.0    Male         1.0
1  30.0       NaN  Female         3.0
2   NaN   70000.0     NaN         5.0
3  35.0       NaN  Female         NaN
4  40.0   90000.0    Male        10.0
5   NaN  120000.0    Male        15.0
6  45.0  110000.0     NaN         NaN


In [2]:
print(df.isnull().sum())  # Shows count of missing values per column

Age           2
Salary        2
Gender        2
Experience    2
dtype: int64


In [3]:
# Remove missing values

# Drop rows with any missing values
df_dropped_rows = df.dropna()
print("\nDataFrame after dropping rows with missing values:")
print(df_dropped_rows)


DataFrame after dropping rows with missing values:
    Age   Salary Gender  Experience
0  25.0  50000.0   Male         1.0
4  40.0  90000.0   Male        10.0


In [4]:
# Impute missing values with the mean
df_mean = df
df_mean['Age'].fillna(df_mean['Age'].mean(), inplace=True)
df_mean['Salary'].fillna(df_mean['Salary'].mean(), inplace=True)
df_mean['Experience'].fillna(df_mean['Experience'].mean(), inplace=True)

print("\nDataFrame after imputing numerical columns with mean:")
print(df_mean)


DataFrame after imputing numerical columns with mean:
    Age    Salary  Gender  Experience
0  25.0   50000.0    Male         1.0
1  30.0   88000.0  Female         3.0
2  35.0   70000.0     NaN         5.0
3  35.0   88000.0  Female         6.8
4  40.0   90000.0    Male        10.0
5  35.0  120000.0    Male        15.0
6  45.0  110000.0     NaN         6.8


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mean['Age'].fillna(df_mean['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mean['Salary'].fillna(df_mean['Salary'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [5]:

# Impute missing values with the mean
df_median = df
df_median['Age'].fillna(df_median['Age'].median(), inplace=True)
df_median['Salary'].fillna(df_median['Salary'].median(), inplace=True)
df_median['Experience'].fillna(df_median['Experience'].median(), inplace=True)

print("\nDataFrame after imputing numerical columns with median:")
print(df_median)



DataFrame after imputing numerical columns with median:
    Age    Salary  Gender  Experience
0  25.0   50000.0    Male         1.0
1  30.0   88000.0  Female         3.0
2  35.0   70000.0     NaN         5.0
3  35.0   88000.0  Female         6.8
4  40.0   90000.0    Male        10.0
5  35.0  120000.0    Male        15.0
6  45.0  110000.0     NaN         6.8


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_median['Age'].fillna(df_median['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_median['Salary'].fillna(df_median['Salary'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

In [6]:
# Impute missing values with the mode
df_mode = df
df_mode['Gender'].fillna(df_mode['Gender'].mode()[0], inplace=True)


print("\nDataFrame after imputing categorical columns with mode:")
print(df_mode)



DataFrame after imputing categorical columns with mode:
    Age    Salary  Gender  Experience
0  25.0   50000.0    Male         1.0
1  30.0   88000.0  Female         3.0
2  35.0   70000.0    Male         5.0
3  35.0   88000.0  Female         6.8
4  40.0   90000.0    Male        10.0
5  35.0  120000.0    Male        15.0
6  45.0  110000.0    Male         6.8


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mode['Gender'].fillna(df_mode['Gender'].mode()[0], inplace=True)


In [7]:
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'Price': [10, 20, 15, 18, 25],
    'Color': ['Red', 'Blue', None, 'Green', 'Red'],
    'Size': [None, 'M', 'L', 'S', 'M']
})


df['Color'] = df['Color'].fillna(df['Color'].mode()[0])  # Most frequent value
df['Size'] = df['Size'].fillna(df['Size'].mode()[0])  # Most frequent value

print(df)

   Price  Color Size
0     10    Red    M
1     20   Blue    M
2     15    Red    L
3     18  Green    S
4     25    Red    M


In [8]:
# Label Encoding  vs ordinal Encoding 

from sklearn.preprocessing import LabelEncoder

# Apply Label Encoding to 'Size'
label_encoder = LabelEncoder()
df['Size'] = label_encoder.fit_transform(df['Size'])

print("\nDataFrame after Label Encoding for 'Size':\n", df)


DataFrame after Label Encoding for 'Size':
    Price  Color  Size
0     10    Red     1
1     20   Blue     1
2     15    Red     0
3     18  Green     2
4     25    Red     1


In [9]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), ['Color'])],  # Use column names
    remainder='passthrough'  # Keep numerical columns as they are
)

# Apply the transformation
transformed_data = preprocessor.fit_transform(df)

# Convert back to DataFrame with feature names
feature_names = preprocessor.get_feature_names_out()
df_encoded = pd.DataFrame(transformed_data, columns=feature_names)

print(df_encoded)

   onehot__Color_Blue  onehot__Color_Green  onehot__Color_Red  \
0                 0.0                  0.0                1.0   
1                 1.0                  0.0                0.0   
2                 0.0                  0.0                1.0   
3                 0.0                  1.0                0.0   
4                 0.0                  0.0                1.0   

   remainder__Price  remainder__Size  
0              10.0              1.0  
1              20.0              1.0  
2              15.0              0.0  
3              18.0              2.0  
4              25.0              1.0  


In [10]:
import numpy as np
import pandas as pd

# Creating a dataset with different types of values
np.random.seed(42)
data = {
    'Feature_1': np.random.randint(10, 100, 10),    # Random integers (positive)
    'Feature_2': np.random.randn(10) * 10,         # Normally distributed values
    'Feature_3': np.random.exponential(5, 10),     # Skewed data (exponential)
    'Feature_4': np.random.randint(-50, 50, 10),   # Positive & Negative values
    'Feature_5': [5, 10, 15, 1000, 20, 25, 30, 35, 40, 45]  # Outlier in the middle
}

df = pd.DataFrame(data)
print("Original Dataset:\n", df)

Original Dataset:
    Feature_1  Feature_2  Feature_3  Feature_4  Feature_5
0         61  -8.852303   0.238948         22          5
1         24  -4.121885  18.201498        -12         10
2         81  -4.826188   1.324852        -33         15
3         70   1.641648   0.474887        -47       1000
4         30   2.330952   4.816728         38         20
5         92   1.179946   2.410073          9         25
6         96  14.623781  20.441083        -37         30
7         84  15.387150   3.143946        -42         35
8         84 -24.391058   9.828436         39         40
9         97   6.034412   5.701979          2         45


In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler

# Initialize scalers
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
robust_scaler = RobustScaler()
maxabs_scaler = MaxAbsScaler()

# Apply scalers
df_standard = pd.DataFrame(standard_scaler.fit_transform(df), columns=df.columns)
df_minmax = pd.DataFrame(minmax_scaler.fit_transform(df), columns=df.columns)
df_robust = pd.DataFrame(robust_scaler.fit_transform(df), columns=df.columns)
df_maxabs = pd.DataFrame(maxabs_scaler.fit_transform(df), columns=df.columns)

# Log Transformation (Handling zero by adding 1)
df_log = np.log1p(df)

# Display results
print("\nStandard Scaled Data:\n", df_standard)
print("\nMinMax Scaled Data:\n", df_minmax)
print("\nRobust Scaled Data:\n", df_robust)
print("\nMaxAbs Scaled Data:\n", df_maxabs)
print("\nLog Transformed Data:\n", df_log)


Standard Scaled Data:
    Feature_1  Feature_2  Feature_3  Feature_4  Feature_5
0  -0.439070  -0.795715  -0.930574   0.900525  -0.401358
1  -1.929492  -0.365681   1.673370  -0.189078  -0.384279
2   0.366563  -0.429708  -0.773156  -0.862069  -0.367200
3  -0.076535   0.158271  -0.896371  -1.310729   2.997374
4  -1.687802   0.220935  -0.266956   1.413280  -0.350121
5   0.809662   0.116299  -0.615837   0.483912  -0.333042
6   0.970788   1.338453   1.998032  -0.990257  -0.315962
7   0.487408   1.407850  -0.509451  -1.150493  -0.298883
8   0.487408  -2.208314   0.459568   1.445327  -0.281804
9   1.011070   0.557610  -0.138625   0.259582  -0.264725

MinMax Scaled Data:
    Feature_1  Feature_2  Feature_3  Feature_4  Feature_5
0   0.506849   0.390635   0.000000   0.802326   0.000000
1   0.000000   0.509555   0.889141   0.406977   0.005025
2   0.780822   0.491849   0.053752   0.162791   0.010050
3   0.630137   0.654446   0.011679   0.000000   1.000000
4   0.082192   0.671775   0.226599   0.988

  result = func(self.values, **kwargs)
