## Dataset Preparation

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler


data = {
    "Age": [22, 25, np.nan, 28, 35, 30, 24],
    "Salary": [22000, 25000, 27000, np.nan, 120000, 30000, 24000],
    "Gender": ["Male", "Female", "Female", "Male", "Male", np.nan, "Female"],
    "City": ["Ahmedabad", "Surat", "Vadodara", "Ahmedabad", "Surat", "Vadodara", "Surat"]
}

df = pd.DataFrame(data)

## Handling missing values

In [3]:
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Salary"] = df["Salary"].fillna(df["Salary"].median())
df["Gender"] = df["Gender"].fillna(df["Gender"].mode()[0])

print(df.isnull().sum())
print(df)

Age       0
Salary    0
Gender    0
City      0
dtype: int64
         Age    Salary  Gender       City
0  22.000000   22000.0    Male  Ahmedabad
1  25.000000   25000.0  Female      Surat
2  27.333333   27000.0  Female   Vadodara
3  28.000000   26000.0    Male  Ahmedabad
4  35.000000  120000.0    Male      Surat
5  30.000000   30000.0  Female   Vadodara
6  24.000000   24000.0  Female      Surat


## Label Encoder

In [5]:
le = LabelEncoder()
df["Gender_Label"] = le.fit_transform(df["Gender"])

print("\nAfter Label Encoding:\n", df)


After Label Encoding:
         Age    Salary  Gender       City  Gender_Label
0  0.000000  0.000000    Male  Ahmedabad             1
1  0.230769  0.030612  Female      Surat             0
2  0.410256  0.051020  Female   Vadodara             0
3  0.461538  0.040816    Male  Ahmedabad             1
4  1.000000  1.000000    Male      Surat             1
5  0.615385  0.081633  Female   Vadodara             0
6  0.153846  0.020408  Female      Surat             0


## One Hot Encoding

In [6]:
df = pd.get_dummies(df, columns=["City"])
print(df)

        Age    Salary  Gender  Gender_Label  City_Ahmedabad  City_Surat  \
0  0.000000  0.000000    Male             1            True       False   
1  0.230769  0.030612  Female             0           False        True   
2  0.410256  0.051020  Female             0           False       False   
3  0.461538  0.040816    Male             1            True       False   
4  1.000000  1.000000    Male             1           False        True   
5  0.615385  0.081633  Female             0           False       False   
6  0.153846  0.020408  Female             0           False        True   

   City_Vadodara  
0          False  
1          False  
2           True  
3          False  
4          False  
5           True  
6          False  


## Normalization

In [4]:
scaler = MinMaxScaler()
df[["Age", "Salary"]] = scaler.fit_transform(df[["Age", "Salary"]])

print("\nAfter Normalization:\n", df)


After Normalization:
         Age    Salary  Gender       City
0  0.000000  0.000000    Male  Ahmedabad
1  0.230769  0.030612  Female      Surat
2  0.410256  0.051020  Female   Vadodara
3  0.461538  0.040816    Male  Ahmedabad
4  1.000000  1.000000    Male      Surat
5  0.615385  0.081633  Female   Vadodara
6  0.153846  0.020408  Female      Surat


## IQR

In [None]:
Q1 = df["Salary"].quantile(0.25)
Q3 = df["Salary"].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df["Salary"] < lower) | (df["Salary"] > upper)]
print("\nOutliers detected using IQR:\n", outliers)


## Regularization

In [7]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=5, noise=10)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

ridge = Ridge(alpha=1.0)   # L2 Regularization
ridge.fit(X_train, y_train)

lasso = Lasso(alpha=0.1)   # L1 Regularization
lasso.fit(X_train, y_train)

print("Ridge Coefficients:", ridge.coef_)
print("Lasso Coefficients:", lasso.coef_)

Ridge Coefficients: [32.23210571 70.9737319  60.54705952 74.55506586 23.32806179]
Lasso Coefficients: [32.53485998 71.68692846 61.37333665 75.52989204 23.34014128]


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

data = {
    "Age": [22, 25, np.nan, 28, 35, 30, 24],
    "Salary": [22000, 25000, 27000, np.nan, 120000, 30000, 24000],
    "Gender": ["Male", "Female", "Female", "Male", "Male", np.nan, "Female"],
    "City": ["Ahmedabad", "Surat", "Vadodara", "Ahmedabad", "Surat", "Vadodara", "Surat"]
}

df = pd.DataFrame(data)

df["Age"].fillna(df["Age"].mean(), inplace=True)
df["Salary"].fillna(df["Salary"].median(), inplace=True)
df["Gender"].fillna(df["Gender"].mode()[0], inplace=True)

le = LabelEncoder()
df["Gender_Label"] = le.fit_transform(df["Gender"])

df = pd.get_dummies(df, columns=["City"])

scaler = MinMaxScaler()
df[["Age", "Salary"]] = scaler.fit_transform(df[["Age", "Salary"]])

std = StandardScaler()
df[["Age", "Salary"]] = std.fit_transform(df[["Age", "Salary"]])

Q1 = df["Salary"].quantile(0.25)
Q3 = df["Salary"].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df["Salary"] < lower) | (df["Salary"] > upper)]

print("Outliers using IQR method:\n", outliers)
print("\nFinal Preprocessed Dataset:\n", df)


Outliers using IQR method:
         Age    Salary Gender  Gender_Label  City_Ahmedabad  City_Surat  \
4  1.922397  2.443517   Male             1           False        True   

   City_Vadodara  
4          False  

Final Preprocessed Dataset:
             Age    Salary  Gender  Gender_Label  City_Ahmedabad  City_Surat  \
0 -1.337319e+00 -0.518060    Male             1            True       False   
1 -5.850772e-01 -0.427400  Female             0           False        True   
2 -1.809506e-16 -0.366959  Female             0           False       False   
3  1.671649e-01 -0.397179    Male             1            True       False   
4  1.922397e+00  2.443517    Male             1           False        True   
5  6.686597e-01 -0.276299  Female             0           False       False   
6 -8.358246e-01 -0.457620  Female             0           False        True   

   City_Vadodara  
0          False  
1          False  
2           True  
3          False  
4          False  
5       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Salary"].fillna(df["Salary"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se