<a href="https://colab.research.google.com/github/rc1inger/CSC177-DataPreprocessing/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preprocessing Project**

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the heart-disease CSV, then overwrite its column labels with
# the 14 short attribute names used throughout the rest of the notebook
data = pd.read_csv('data/heart_disease.csv')
data.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'disease',
]

**Drop Duplicates, Handle Missing Values, and Remove Outliers**

In [None]:
# Drop the 'disease' column as it's the target variable for prediction.
# NOTE: this rebinds `data`, so re-running the cell without reloading the CSV
# raises KeyError because 'disease' is already gone.
data = data.drop(columns=['disease'])

# Report the dataset dimensions. Single braces are required here: doubled
# braces are an escape sequence and would print the literal text
# "{data.shape[0]}" instead of the number.
print(f'Number of instances = {data.shape[0]}')  # Number of rows
print(f'Number of attributes = {data.shape[1]}')  # Number of columns
print(data.head())  # Display the first few rows for a quick overview

In [None]:
# **Modularization: Created a function for duplicate removal**
def remove_duplicates(df):
    """Return ``df`` without duplicate rows, printing how many were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame produced by ``df.drop_duplicates()``.
    """
    dups = df.duplicated()
    # Single braces so the f-string interpolates the count; doubled braces
    # would print the expression text itself.
    print(f'Number of duplicate rows = {dups.sum()}')  # Count duplicates
    df_cleaned = df.drop_duplicates()
    print(f'Number of instances after dropping duplicates = {df_cleaned.shape[0]}')  # New row count
    return df_cleaned

In [None]:
# De-duplicate the loaded frame first
data2 = remove_duplicates(data)

# Turn every '?' placeholder into NaN so it is treated as a missing value
data3 = data2.replace(to_replace='?', value=np.nan)

In [None]:
# **Handle missing values**
def handle_missing_values(df):
    """Coerce every column to numeric and fill missing entries with the column median.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``)
    and are then filled together with any pre-existing NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New all-numeric frame with each column's NaN replaced by that
        column's median. A column with no parseable values has a NaN median
        and therefore stays NaN.
    """
    # Work on a copy: assigning columns on `df` itself would silently change
    # the caller's frame and make re-running the calling cell order-dependent.
    numeric = df.copy()
    for col in numeric.columns:
        numeric[col] = pd.to_numeric(numeric[col], errors='coerce')
    print('Replace missing values with median')
    # fillna with a Series matches each median to its column by label
    return numeric.fillna(numeric.median())

# Coerce every column to numeric and median-fill the gaps; the result is rebound to data3
data3 = handle_missing_values(data3)

In [None]:
# **Visualize missing values**
def visualize_missing_values(df):
    """Bar-plot the number of missing entries for each column that has any.

    When no column contains a missing value, a short message is printed
    instead and nothing is plotted.
    """
    na_counts = df.isna().sum()
    na_counts = na_counts[na_counts > 0]  # keep only the affected columns

    # Nothing to draw: report it and stop
    if na_counts.empty:
        print("No missing values to visualize.")
        return

    na_counts.plot(kind='bar', figsize=(10, 5))
    plt.title('Missing Values Count')
    plt.xlabel('Features')
    plt.ylabel('Count')
    plt.show()

# Sanity check after imputation: plots the per-column NaN counts, or prints a message if none remain
visualize_missing_values(data3)

In [None]:
# **Boxplot of every feature, drawn before any rows are removed, to eyeball potential outliers**
boxplot_ax = data3.boxplot(figsize=(20, 3))
boxplot_ax.set_title('Boxplot of Features Before Outlier Removal')
plt.show()

In [None]:
# **Remove outliers**
def remove_outliers(df, threshold=3):
    """Drop every row holding a value whose z-score falls outside ``(-threshold, threshold]``.

    Z-scores are computed per column from the column mean and sample
    standard deviation. A row is kept only if all of its z-scores lie in the
    interval; rows with a NaN z-score are dropped because the comparison
    is False for NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to filter. It is not modified.
    threshold : float, default 3
        Z-score bound; the default matches the previous hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the test, with their original index labels.
    """
    spread = df.std()
    # A constant column has std == 0, which would turn all of its z-scores into
    # NaN (0/0) and make every row fail the test below. Dividing by 1 instead
    # gives z == 0 for such a column, so it never marks a row as an outlier.
    Z = (df - df.mean()) / spread.where(spread != 0, 1)
    print(f'Number of rows before removing outliers = {Z.shape[0]}')  # Initial row count
    within_bounds = ((Z > -threshold) & (Z <= threshold)).all(axis=1)
    Z2 = df.loc[within_bounds, :]
    print(f'Number of rows after removing outliers = {Z2.shape[0]}')  # Count after outlier removal
    return Z2

# Keep only the rows that pass the z-score filter; data3 is rebound to the filtered frame
data3 = remove_outliers(data3)

**One-Hot Encoding and Feature Scaling on the Dataset**

In [None]:
# Using Label Encoding for binary columns and One-Hot Encoding for multiclass columns
# NOTE(review): this copies `data`, i.e. the frame as it was before de-duplication,
# '?' replacement, median filling and outlier removal, not the cleaned `data3`.
# Confirm that encoding the uncleaned frame is intended.
X = data.copy()

def encode_categorical(df):
    """Label-encode the binary columns and one-hot encode the multiclass columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'sex', 'fbs', 'exang' (label-encoded)
        and 'cp', 'restecg', 'slope', 'ca', 'thal' (one-hot encoded).
        It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame in which the three binary columns hold integer codes and
        each multiclass column is replaced by its ``pd.get_dummies``
        indicator columns.
    """
    from sklearn.preprocessing import LabelEncoder

    # Copy first so the column assignments below do not alter the caller's frame
    encoded = df.copy()

    # Label encode binary categorical columns; fit_transform refits the
    # encoder on each call, so one instance can be shared across columns
    le = LabelEncoder()
    for col in ('sex', 'fbs', 'exang'):
        encoded[col] = le.fit_transform(encoded[col])

    # One-Hot encode multiclass columns
    return pd.get_dummies(encoded, columns=['cp', 'restecg', 'slope', 'ca', 'thal'])

# Encode the copied frame: label codes for sex/fbs/exang, indicator columns for cp/restecg/slope/ca/thal
X_encoded = encode_categorical(X)

In [None]:
# **Feature scaling**
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the cleaned frame, then wrap the returned array in a
# DataFrame carrying the original column names (the index is not passed on)
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data3)
data3_scaled = pd.DataFrame(standardized_values, columns=data3.columns)

In [None]:
# **Visualize cleaned data**
def visualize_data(df):
    """Show a wide boxplot with one box per column of ``df``."""
    axes = df.boxplot(figsize=(20, 3))
    axes.set_title('Boxplot of Features After Cleaning and Scaling')
    plt.show()

In [None]:
# Boxplot of the standardized features, for comparison with the pre-cleaning boxplot
visualize_data(data3_scaled)

**Split the Dataset: 80% Train, 20% Test**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the cleaned features with Min-Max scaling, keeping the column labels
min_max_scaler = MinMaxScaler()
minmax_values = min_max_scaler.fit_transform(data3)
data3_minmax = pd.DataFrame(minmax_values, columns=data3.columns)

# Preview the first rows of the rescaled frame
print("Data after Min-Max Scaling:\n", data3_minmax.head())


In [None]:
# **Split dataset**
from sklearn.model_selection import train_test_split

# Hold out 20% of the standardized rows for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data3_scaled, random_state=42, test_size=0.2)

# **Per-feature mean and standard deviation of each partition**
train_mean, train_std = train_data.mean(), train_data.std()
test_mean, test_std = test_data.mean(), test_data.std()

In [None]:
# Show the per-feature statistics, training set first, one labelled section each
print(
    f'Train Mean:\n{train_mean}',
    f'Train Std:\n{train_std}',
    f'Test Mean:\n{test_mean}',
    f'Test Std:\n{test_std}',
    sep='\n',
)

**Comparing the training and test values**

In [None]:
# Absolute per-feature gap between the training and test statistics
mean_diff = (train_mean - test_mean).abs()
std_diff = (train_std - test_std).abs()

In [None]:
# Show both gap tables, each followed by a blank line
print(
    f'Mean Differences Between Train and Test:\n{mean_diff}\n',
    f'Standard Deviation Differences Between Train and Test:\n{std_diff}\n',
    sep='\n',
)

In [None]:
# Verdict: the split counts as balanced only when both average gaps are below 0.1
means_close = mean_diff.mean() < 0.1
stds_close = std_diff.mean() < 0.1
if not (means_close and stds_close):
    print("There are some differences between the training and test sets. Further analysis might be required.")
else:
    print("The training and test sets are well-balanced and represent similar distributions.")