In [15]:
#Load Data into Jupyter Notebook

import os

import mysql.connector
import pandas as pd
from sklearn.impute import SimpleImputer
from sqlalchemy import create_engine

# Connect to MySQL
# The connection URL embeds a username and password, so it should not live in
# the notebook. Set the DATA_PIPELINE_DB_URL environment variable to supply it;
# when the variable is unset we fall back to the original localhost URL so
# existing runs keep working unchanged.
DB_URL = os.environ.get(
    'DATA_PIPELINE_DB_URL',
    'mysql+mysqlconnector://root:root@localhost/data_pipeline_db',
)
engine = create_engine(DB_URL)

# Load the data: every column of every row in the `customers` table.
df = pd.read_sql("SELECT * FROM customers", con=engine)
df


Unnamed: 0,id,age,income,city,education_level,purchase_amount
0,1,25.0,40000.0,New York,Bachelor,200.0
1,2,32.0,54000.0,Los Angeles,Master,250.0
2,3,,50000.0,Chicago,High School,220.0
3,4,45.0,,Houston,PhD,300.0
4,5,35.0,62000.0,New York,Bachelor,
5,6,28.0,35000.0,Chicago,Bachelor,180.0
6,7,60.0,72000.0,Los Angeles,PhD,400.0
7,8,22.0,30000.0,Houston,High School,100.0
8,9,40.0,80000.0,Miami,Master,320.0
9,10,29.0,39000.0,New York,PhD,210.0


In [27]:
#Handle Missing Data with Predictive Modeling (Linear Regression)

from sklearn.linear_model import LinearRegression

# Step 1: Clean column names (important if imported from SQL)
df.columns = df.columns.str.strip()

# Step 2: Ensure required columns exist
required_cols = ['age', 'income', 'purchase_amount']
for col in required_cols:
    if col not in df.columns:
        raise KeyError(f"Missing required column: '{col}'")


def impute_with_regression(frame, target, predictors):
    """Fill missing `target` values of `frame` in place via linear regression.

    The model is trained on rows where `target` and every predictor are
    present, then used to predict `target` for rows where `target` is missing
    but every predictor is present. Rows that are missing a predictor are
    neither used for training nor filled.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to modify in place.
    target : str
        Column whose missing values should be filled.
    predictors : list of str
        Columns used as regression inputs, in the order passed to the model.

    Returns
    -------
    LinearRegression or None
        The fitted model, or None when nothing was imputed (no fillable rows
        or no complete rows to train on).
    """
    predictors = list(predictors)
    predictors_present = frame[predictors].notnull().all(axis=1)
    target_missing = frame[target].isnull()

    train_rows = predictors_present & ~target_missing
    fill_rows = predictors_present & target_missing
    if not fill_rows.any() or not train_rows.any():
        return None

    model = LinearRegression()
    model.fit(frame.loc[train_rows, predictors], frame.loc[train_rows, target])
    frame.loc[fill_rows, target] = model.predict(frame.loc[fill_rows, predictors])
    return model


# Step 3: Impute missing 'income' using 'age' and 'purchase_amount'
reg_income = impute_with_regression(df, 'income', ['age', 'purchase_amount'])

# Step 4: Impute missing 'age' using 'income' and 'purchase_amount'
# Order matters: this runs after Step 3, so rows whose income was just
# predicted count as complete rows for the age model.
reg_age = impute_with_regression(df, 'age', ['income', 'purchase_amount'])

# Step 5: Show remaining missing values
# NOTE: only 'age' and 'income' are reported (and imputed) here;
# 'purchase_amount' is used purely as a predictor, so any missing values in
# that column are still present after this cell.
print("Missing values after imputation:")
print(df[['age', 'income']].isnull().sum())


Missing values after imputation:
age       0
income    0
dtype: int64


In [31]:
#Outlier Detection and Removal (IQR method)

# Quartiles of income and the interquartile range between them.
Q1, Q3 = df['income'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Acceptance window: 1.5 * IQR beyond each quartile.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose income lies inside the window (both ends inclusive).
# Rebinding `df` means re-running this cell recomputes the bounds on the
# already-filtered data.
df = df[df['income'].between(lower_bound, upper_bound)]


In [33]:
# Inspect df after the regression imputation and the IQR income filter.
df

Unnamed: 0,id,age,income,city,education_level,purchase_amount
0,1,25.0,40000.0,New York,Bachelor,200.0
1,2,32.0,54000.0,Los Angeles,Master,250.0
2,3,31.661563,50000.0,Chicago,High School,220.0
3,4,45.0,15666.781636,Houston,PhD,300.0
4,5,35.0,62000.0,New York,Bachelor,
5,6,28.0,35000.0,Chicago,Bachelor,180.0
6,7,60.0,72000.0,Los Angeles,PhD,400.0
7,8,22.0,30000.0,Houston,High School,100.0
8,9,40.0,80000.0,Miami,Master,320.0
9,10,29.0,39000.0,New York,PhD,210.0


In [37]:
#Data Encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# Option 1: one-hot encode the nominal `city` column. All other columns are
# carried over unchanged; `city` is replaced by one boolean column per value.
df_encoded = pd.get_dummies(df, columns=['city'])

# Ranking used by the ordinal encoder, lowest to highest:
# High School < Diploma < Bachelor < Master < PhD
education_order = [['High School', 'Diploma', 'Bachelor', 'Master', 'PhD']]

# Option 2: label encoding (integer codes, no meaningful order).
label_encoder = LabelEncoder()
# Option 3: ordinal encoding (codes follow `education_order`).
ordinal_encoder = OrdinalEncoder(categories=education_order)

# Build both encodings of education_level, then attach them as new columns.
education_codes = {
    'education_level_label': label_encoder.fit_transform(df_encoded['education_level']),
    'education_level_ordinal': ordinal_encoder.fit_transform(df_encoded[['education_level']]),
}
for new_column, codes in education_codes.items():
    df_encoded[new_column] = codes

# Preview
df_encoded.head()


Unnamed: 0,id,age,income,education_level,purchase_amount,city_Chicago,city_Houston,city_Los Angeles,city_Miami,city_New York,education_level_label,education_level_ordinal
0,1,25.0,40000.0,Bachelor,200.0,False,False,False,False,True,0,2.0
1,2,32.0,54000.0,Master,250.0,False,False,True,False,False,2,3.0
2,3,31.661563,50000.0,High School,220.0,True,False,False,False,False,1,0.0
3,4,45.0,15666.781636,PhD,300.0,False,True,False,False,False,3,4.0
4,5,35.0,62000.0,Bachelor,,False,False,False,False,True,0,2.0


In [41]:
#Scaling and Normalization

from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()

# Standardise each numeric column and store the result next to the original
# under a `<column>_scaled` name, leaving the raw values in place.
columns_to_scale = ['age', 'income', 'purchase_amount']
scaled_names = [f'{name}_scaled' for name in columns_to_scale]
df_encoded[scaled_names] = scaler.fit_transform(df_encoded[columns_to_scale])

# Preview the result
df_encoded.head()


Unnamed: 0,id,age,income,education_level,purchase_amount,city_Chicago,city_Houston,city_Los Angeles,city_Miami,city_New York,education_level_label,education_level_ordinal,age_scaled,income_scaled,purchase_amount_scaled
0,1,25.0,40000.0,Bachelor,200.0,False,False,False,False,True,0,2.0,-0.921336,-0.41403,-0.509711
1,2,32.0,54000.0,Master,250.0,False,False,True,False,False,2,3.0,-0.260958,0.332289,0.093894
2,3,31.661563,50000.0,High School,220.0,True,False,False,False,False,1,0.0,-0.292886,0.119055,-0.268269
3,4,45.0,15666.781636,PhD,300.0,False,True,False,False,False,3,4.0,0.965458,-1.711199,0.6975
4,5,35.0,62000.0,Bachelor,,False,False,False,False,True,0,2.0,0.022061,0.758758,


In [45]:
#Use sklearn.pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Define features correctly
numeric_features = ['age', 'income', 'purchase_amount']
categorical_features = ['city', 'education_level']

# Define transformers
# The earlier imputation cell only fills 'age' and 'income', so
# 'purchase_amount' can still contain NaN when it reaches this pipeline.
# Without an imputer those NaNs pass straight through the scaler and end up in
# the transformed output. The median is used because it is not pulled around
# by extreme values.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categories not seen during fit are encoded as all-zero rows instead of
# raising an error.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Build ColumnTransformer
# The output columns follow the order of this list: numeric first, then
# the one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Use the actual columns
X = df[numeric_features + categorical_features]

# Transform
X_processed = pipeline.fit_transform(X)

# Preview shape or output
print(X_processed.shape)



(10, 12)


In [53]:
import pandas as pd

# Get the one-hot encoded feature names.
# The fitted categorical sub-pipeline is looked up by its registered name
# ('cat') rather than by position, so this keeps working if the transformers
# in the ColumnTransformer are reordered.
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_onehot = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
encoded_feature_names = fitted_onehot.get_feature_names_out(categorical_features)

# Combine all final column names: numeric columns first, then the one-hot
# columns, matching the order of the transformers in the ColumnTransformer.
final_columns = numeric_features + list(encoded_feature_names)

# Create DataFrame from transformed data.
# The transformer may return a sparse matrix; densify it when it does.
dense_values = X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed
df_transformed = pd.DataFrame(dense_values, columns=final_columns)


In [59]:
# Persist the transformed features to MySQL.
# if_exists='replace' drops and recreates the table on every run, and
# index=False keeps the DataFrame index out of the table.
df_transformed.to_sql(
    name='processed_customers',
    con=engine,
    index=False,
    if_exists='replace',
)

# Only reached when to_sql did not raise.
print("Data successfully saved to MySQL table 'processed_customers'")


Data successfully saved to MySQL table 'processed_customers'


In [57]:
# Display df itself (the imputed, outlier-filtered source frame), not the
# transformed feature matrix that was written to MySQL.
df

Unnamed: 0,id,age,income,city,education_level,purchase_amount
0,1,25.0,40000.0,New York,Bachelor,200.0
1,2,32.0,54000.0,Los Angeles,Master,250.0
2,3,31.661563,50000.0,Chicago,High School,220.0
3,4,45.0,15666.781636,Houston,PhD,300.0
4,5,35.0,62000.0,New York,Bachelor,
5,6,28.0,35000.0,Chicago,Bachelor,180.0
6,7,60.0,72000.0,Los Angeles,PhD,400.0
7,8,22.0,30000.0,Houston,High School,100.0
8,9,40.0,80000.0,Miami,Master,320.0
9,10,29.0,39000.0,New York,PhD,210.0
