In [None]:
"""This module performs exploratory data analysis (EDA) on the data. It focuses on records with 'yes' or 'no' in the 'y' column. 
It tests the data for multicollinearity."""

import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

try:
    database_path = "data/data.db"
    conn = sqlite3.connect(database_path)
    query = open('data/fetch_all.sql', 'r').read()
    df = pd.read_sql_query(query, conn)
except (sqlite3.Error, FileNotFoundError) as e:
    print(f"An error occurred: {e}")
finally:
    if 'conn' in locals():
        conn.close()

# Filter only rows with 'yes' or 'no' in the 'y' column
df = df[df['y'].isin(['no', 'yes'])]

# Drop specified columns
df = df.drop(columns=['client_id', 'account_id', 'campaign_id', 'outcome_id', 'previous', 'poutcome'])

# Binning day of the week into Early, Mid, and Late

bins = [0, 9, 19, 31]
labels = ['early', 'mid', 'late']
df['day_bin'] = pd.cut(df['day'], bins=bins, labels=labels, right=True)

In [None]:
# Select categorical columns
categorical_columns = ['job', 'marital', 'education', 'in_default', 'housing', 'loan', 'contact', 'month', 'y']

# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit and transform the categorical columns
encoded_features = encoder.fit_transform(df[categorical_columns])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))

# Drop the original categorical columns from df and concatenate the encoded features
df_encoded = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

print(df_encoded.head())

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the encoded dataframe
df_encoded_scaled = scaler.fit_transform(df_encoded)

# Convert the scaled data back to a DataFrame
df_encoded_scaled = pd.DataFrame(df_encoded_scaled, columns=df_encoded.columns)

print(df_encoded_scaled.head())

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
min_max_scaler = MinMaxScaler()

# Fit and transform the encoded dataframe
df_encoded_min_max_scaled = min_max_scaler.fit_transform(encoded_df)

# Convert the scaled data back to a DataFrame
df_encoded_min_max_scaled = pd.DataFrame(df_encoded_min_max_scaled, columns=encoded_df.columns)

print(df_encoded_min_max_scaled.head())

In [7]:
from sklearn.model_selection import train_test_split

# Assuming df_encoded_scaled is the dataframe to be split
X = df_encoded_scaled.drop(columns=['y_yes'])
y = df_encoded_scaled['y_yes']

# Split the data into train and test sets with 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (6868, 38)
X_test shape: (1718, 38)
y_train shape: (6868,)
y_test shape: (1718,)
