In [2]:

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest, RFE
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier


iris = load_iris(as_frame=True)
df = iris.data
df['target'] = iris.target


np.random.seed(42)
for col in df.columns[:-1]:
    df.loc[df.sample(frac=0.1).index, col] = np.nan

print("Dataset with missing values:")
print(df.head())


Dataset with missing values:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [3]:
# Impute missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

print("\nDataset after imputing missing values:")
print(df_imputed.head())



Dataset after imputing missing values:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  


In [4]:
# Standardization
scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform(df_imputed.iloc[:, :-1]), columns=df.columns[:-1])

# Normalization
min_max_scaler = MinMaxScaler()
df_normalized = pd.DataFrame(min_max_scaler.fit_transform(df_imputed.iloc[:, :-1]), columns=df.columns[:-1])

print("\nStandardized data (first 5 rows):")
print(df_standardized.head())
print("\nNormalized data (first 5 rows):")
print(df_normalized.head())



Standardized data (first 5 rows):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -0.930834          1.129303          -1.377004         -1.322351
1          -1.184698         -0.102166          -1.377004         -1.322351
2          -1.438562          0.390422          -1.436490         -1.322351
3          -1.565493          0.144128          -1.317517         -1.322351
4          -1.057766          1.375597          -1.377004         -1.322351

Normalized data (first 5 rows):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.222222          0.625000           0.067797          0.041667
1           0.166667          0.416667           0.067797          0.041667
2           0.111111          0.500000           0.050847          0.041667
3           0.083333          0.458333           0.084746          0.041667
4           0.194444          0.666667           0.067797          0.041667


In [5]:

df_noisy = df_standardized.copy()
df_noisy['sepal length (cm)'] += np.random.normal(0, 0.5, df_noisy.shape[0])


df_noisy['sepal length (cm)'] = df_noisy['sepal length (cm)'].rolling(window=3, min_periods=1).mean()

print("\nNoisy and smoothed data (sepal length):")
print(df_noisy['sepal length (cm)'].head())



Noisy and smoothed data (sepal length):
0   -1.089518
1   -1.022498
2   -1.130150
3   -1.151110
4   -0.964123
Name: sepal length (cm), dtype: float64


In [6]:
# Detect outliers using Z-score
z_scores = np.abs(scaler.fit_transform(df_imputed.iloc[:, :-1]))
outliers = (z_scores > 3).any(axis=1)

# Handle outliers (removal)
df_outliers_removed = df_imputed[~outliers]

print(f"\nNumber of outliers detected: {outliers.sum()}")
print("Dataset after outlier removal:")
print(df_outliers_removed.head())



Number of outliers detected: 1
Dataset after outlier removal:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  


In [7]:
# Correlation
correlation_matrix = df_imputed.iloc[:, :-1].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Mutual Information
mutual_info = mutual_info_classif(df_imputed.iloc[:, :-1], df_imputed['target'])
print("\nMutual Information Scores:")
print(mutual_info)



Correlation Matrix:
                   sepal length (cm)  sepal width (cm)  petal length (cm)  \
sepal length (cm)           1.000000         -0.103691           0.758701   
sepal width (cm)           -0.103691          1.000000          -0.430143   
petal length (cm)           0.758701         -0.430143           1.000000   
petal width (cm)            0.732059         -0.330547           0.876528   

                   petal width (cm)  
sepal length (cm)          0.732059  
sepal width (cm)          -0.330547  
petal length (cm)          0.876528  
petal width (cm)           1.000000  

Mutual Information Scores:
[0.42404846 0.2318256  0.86046997 0.91552908]


In [8]:
# (RFE)
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=2)
rfe.fit(df_imputed.iloc[:, :-1], df_imputed['target'])

print("\nRFE Selected Features:")
print(df_imputed.columns[:-1][rfe.support_])



RFE Selected Features:
Index(['petal length (cm)', 'petal width (cm)'], dtype='object')


In [9]:
# Lasso Regression
lasso = LassoCV(cv=5)
lasso.fit(df_imputed.iloc[:, :-1], df_imputed['target'])
lasso_coefs = pd.Series(lasso.coef_, index=df_imputed.columns[:-1])

print("\nLasso Coefficients:")
print(lasso_coefs[lasso_coefs != 0])  # Features with non-zero coefficients



Lasso Coefficients:
sepal length (cm)    0.022613
petal length (cm)    0.319242
petal width (cm)     0.292710
dtype: float64
