In [12]:
# Data Preprocessing Project

# Step 1: Import Required Libraries
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Generate Synthetic Dataset
# Seeded 1000-row, 5-feature regression problem; the features and the target
# are combined into a single DataFrame with the target in the last column.
feature_names = [f'Feature{i}' for i in range(1, 6)]
X, y = make_regression(
    n_samples=1000,
    n_features=len(feature_names),
    noise=10,
    random_state=42,
)
df = pd.DataFrame(X, columns=feature_names).assign(Target=y)

# Step 3: Add Missing Values and Outliers
# Missing values: blank out every 10th row of Feature1 and every 15th row of
# Feature3 (both starting at row 0).
for column, stride in (('Feature1', 10), ('Feature3', 15)):
    df.loc[::stride, column] = np.nan

# Outliers: one value 10 standard deviations above the Feature2 mean (row 5)
# and one value 10 standard deviations below the Feature4 mean (row 20).
extreme_high = df['Feature2'].mean() + 10 * df['Feature2'].std()
extreme_low = df['Feature4'].mean() - 10 * df['Feature4'].std()
df.loc[5, 'Feature2'] = extreme_high
df.loc[20, 'Feature4'] = extreme_low

# Step 4: Handle Missing Values
# Report the per-column NaN counts, fill every NaN with the mean of its own
# column (computed over the whole frame, NaNs skipped), then report again.
missing_before = df.isna().sum()
print("Missing values before handling:\n", missing_before)

column_means = df.mean()
df.fillna(value=column_means, inplace=True)

missing_after = df.isna().sum()
print("\nMissing values after imputation:\n", missing_after)

# Step 5: Handle Outliers (column-wise Z-score filtering)
# Each feature column is checked in turn and rows whose |z-score| is 3 or more
# are dropped. The z-scores of a column are computed on the frame as already
# filtered by the preceding columns, so the result depends on column order.
feature_columns = df.columns[:-1]  # every column except the trailing 'Target'
for name in feature_columns:
    scores = stats.zscore(df[name])
    inlier_mask = np.abs(scores) < 3
    df = df.loc[inlier_mask]

print("\nShape after outlier removal:", df.shape)

# Step 6: Train-Test Split
# Split BEFORE scaling so that the scaler never sees the held-out rows.
# Fitting StandardScaler on the full matrix (as was done previously) leaks the
# test set's mean/variance into the training features.
X = df.drop('Target', axis=1)
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Feature Scaling
# Learn the mean/std from the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix expressed on the training-set scale; kept under its
# original name so any later code that refers to X_scaled keeps working.
X_scaled = scaler.transform(X)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)  # Final check


Missing values before handling:
 Feature1    100
Feature2      0
Feature3     67
Feature4      0
Feature5      0
Target        0
dtype: int64

Missing values after imputation:
 Feature1    0
Feature2    0
Feature3    0
Feature4    0
Feature5    0
Target      0
dtype: int64

Shape after outlier removal: (986, 6)

Train shape: (788, 5)
Test shape: (198, 5)
