## Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from scipy import stats

: 

## Data Load

In [None]:
# Read the raw CSV into the working DataFrame used by every later cell.
# NOTE(review): the path is absolute and machine-specific -- confirm it exists
# on the machine that runs this notebook.
dataset = pd.read_csv(
    filepath_or_buffer='C:/Users/Felix Ee Jian Hui/Desktop/Degree/Sem4/FYP/Dataset/test.csv'
)


In [None]:
# Quick sanity check: print a header, then the first five records.
print("üîπ Head of dataset:")
print(dataset.head(n=5))

In [None]:
# Show a concise summary of the dataset (columns, non-null counts, dtypes).
print("\nüîπ Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
dataset.info()

In [None]:
# 1. Missing Values
# Report the number of missing values per column, impute them, then report
# again so the effect of the imputation is visible.
print("\nüîπ Missing Values:")
print(dataset.isnull().sum())

# Numeric columns: replace missing values with the column mean.
numeric_cols = dataset.select_dtypes(include=[np.number]).columns
dataset[numeric_cols] = dataset[numeric_cols].fillna(dataset[numeric_cols].mean())

# Categorical (object) columns: replace missing values with the column mode.
categorical_cols = dataset.select_dtypes(include=['object']).columns
categorical_modes = dataset[categorical_cols].mode()
# mode() returns an empty frame when there are no object columns (or when no
# column has any non-null value); indexing row 0 would then raise IndexError,
# so the fill is skipped in that case.
if not categorical_modes.empty:
    # Row 0 holds the first (most frequent) mode of every column.
    dataset[categorical_cols] = dataset[categorical_cols].fillna(categorical_modes.iloc[0])

print("\n‚úÖ After Filling Missing Values:")
print(dataset.isnull().sum())

In [None]:
# 2. Outlier Detection and Removal (using Z-score method)
# Drops every row that has an absolute Z-score of 3 or more in at least one of
# the checked numeric columns.
print("\nüîπ Removing Outliers:")

# Numeric columns that must not take part in the Z-score outlier check.
# NOTE(review): presumably these are binary/ordinal codes -- confirm against
# the data dictionary.
excluded_outlier_cols = ['InternetAccess', 'Extracurricular', 'PartTimeJob',
                         'ParentSupport', 'Romantic', 'FreeTime', 'GoOut']

# Only the remaining numeric columns are checked.
numeric_cols = dataset.select_dtypes(include=[np.number]).columns
zscore_cols = [col for col in numeric_cols if col not in excluded_outlier_cols]

# Absolute Z-score of every checked value, as a plain ndarray.
z_scores = np.abs(np.asarray(stats.zscore(dataset[zscore_cols])))

# Keep a row unless one of its values is a confirmed outlier (|z| >= 3).
# A zero-variance column produces NaN Z-scores; testing "z < 3" would treat
# those NaNs as outliers and silently drop every row, whereas "z >= 3" is
# False for NaN and leaves such columns out of the decision.
filtered_entries = ~(z_scores >= 3).any(axis=1)

# .copy() detaches the result from the original frame so that later column
# assignments do not raise SettingWithCopyWarning.
dataset = dataset[filtered_entries].copy()

print("‚úÖ Dataset shape after outlier removal:", dataset.shape)


In [None]:
# 3. Data Transformation
# Applies a Yeo-Johnson power transform to the continuous numeric columns and
# prints their skewness before and after the transform.

# Re-select numeric columns
numeric_cols = dataset.select_dtypes(include=[np.number]).columns

# Exclude binary/ordinal features. 'Romantic' and 'PartTimeJob' are listed as
# binary/categorical by the standardization, normalization and plotting steps
# of this notebook, so they are excluded here as well; otherwise they would be
# power-transformed and no longer hold their original codes.
excluded_cols = ['Grade', 'SES_Quartile', 'InternetAccess', 'Extracurricular', 'Romantic', 'PartTimeJob']
continuous_cols = [col for col in numeric_cols if col not in excluded_cols]

# Step 0: Save original skewness
original_skewness = dataset[continuous_cols].skew()

# Step 1: Apply Yeo-Johnson transformation (overwrites the columns in place)
pt = PowerTransformer(method='yeo-johnson')
dataset[continuous_cols] = pt.fit_transform(dataset[continuous_cols])

# Step 2: Skewness comparison, one row per transformed column
transformed_skewness = dataset[continuous_cols].skew()
skew_df = pd.DataFrame({
    'Before': original_skewness,
    'After': transformed_skewness
})
print("\n‚úÖ Skewness Comparison (Before vs After Transformation):")
print(skew_df)

In [None]:
# 4. Data Standardization
# Numeric columns that hold binary/categorical codes and must keep their
# original values.
excluded_cols = ['Grade', 'SES_Quartile', 'InternetAccess', 'Extracurricular', 'Romantic', 'PartTimeJob']

# Every other numeric column is treated as a continuous feature.
numeric_cols = dataset.select_dtypes(include=[np.number]).columns
continuous_cols = [name for name in numeric_cols if name not in excluded_cols]

# Fit the scaler on the continuous features, then write the scaled values
# back into the same columns.
scaler = StandardScaler().fit(dataset[continuous_cols])
dataset[continuous_cols] = scaler.transform(dataset[continuous_cols])

# Print the per-column mean and standard deviation as a sanity check.
print("\nüîπ Summary After Standardization (Mean ‚âà 0, Std ‚âà 1):")
print(dataset[continuous_cols].describe().loc[['mean', 'std']])


In [None]:
# 5. Data Normalization
# NOTE(review): the standardization step earlier in this notebook scales the
# same column selection; min-max scaling here overrides that scale --
# confirm both steps are intended.

# Numeric columns that hold categorical/binary/ordinal codes and are left
# untouched.
excluded_cols = ['Grade', 'SES_Quartile', 'InternetAccess', 'Extracurricular', 'Romantic', 'PartTimeJob']

# All other numeric columns are rescaled.
numeric_cols = dataset.select_dtypes(include=[np.number]).columns
continuous_cols = [name for name in numeric_cols if name not in excluded_cols]

# Fit the min-max scaler, then overwrite the columns with the scaled values.
scaler_norm = MinMaxScaler().fit(dataset[continuous_cols])
dataset[continuous_cols] = scaler_norm.transform(dataset[continuous_cols])

# Show a sample of the rescaled rows.
print("\n‚úÖ Normalized Data (First 5 Rows):")
print(dataset[continuous_cols].head(n=5))

# Show the minimum and maximum of every rescaled column.
print("\nüîç Normalized Feature Ranges:")
print(dataset[continuous_cols].agg(['min', 'max']))

In [None]:
# Step 6: One-hot encode the categorical variables (if any)
print("\nüîπ Encoding Categorical Variables:")

# Collect the names of all object- or category-typed columns.
cat_cols = list(dataset.select_dtypes(include=['object', 'category']).columns)
print(f"üßæ Categorical columns to encode: {cat_cols}")

# Replace those columns with indicator columns; the first level of each
# variable is dropped so the indicators are not perfectly collinear.
dataset = pd.get_dummies(data=dataset, columns=cat_cols, drop_first=True)

# Report the resulting (rows, columns) shape.
print("‚úÖ Final Dataset Shape after encoding:", dataset.shape)

In [None]:
# Before/after comparison: for every continuous column draw a histogram, a
# KDE plot and a boxplot that overlay the raw values with the processed ones.

# Re-read the raw CSV so the untouched values are available for comparison.
original_df = pd.read_csv(
    filepath_or_buffer='C:/Users/Felix Ee Jian Hui/Desktop/Degree/Sem4/FYP/Dataset/test.csv'
)

# Numeric columns that are skipped because they are not analysed as
# continuous data.
excluded_cols = ['Grade', 'SES_Quartile', 'InternetAccess', 'Extracurricular', 'Romantic', 'PartTimeJob']

# Remaining numeric columns of the raw data are the ones that get plotted.
numeric_cols = original_df.select_dtypes(include=[np.number]).columns
continuous_cols = [name for name in numeric_cols if name not in excluded_cols]

# Global seaborn look for all figures below.
sns.set(style="whitegrid")

# One figure (three panels side by side) per continuous column.
for col in continuous_cols:
    fig, axes = plt.subplots(1, 3, figsize=(18, 4))
    hist_ax, kde_ax, box_ax = axes
    fig.suptitle(f'Distribution of {col}: Before vs After Transformation', fontsize=16, fontweight='bold')

    # Raw values (missing entries removed) versus processed values.
    before_values = original_df[col].dropna()
    after_values = dataset[col]

    # Panel 1: overlaid histograms.
    for values, label, color in ((before_values, 'Before', 'skyblue'),
                                 (after_values, 'After', 'lightgreen')):
        hist_ax.hist(values, bins=30, alpha=0.6, label=label, color=color, edgecolor='black')
    hist_ax.set_title("Histogram")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Frequency")
    hist_ax.legend()
    hist_ax.grid(True)

    # Panel 2: overlaid kernel density estimates.
    for values, label, color in ((before_values, 'Before', 'blue'),
                                 (after_values, 'After', 'green')):
        sns.kdeplot(values, ax=kde_ax, label=label, color=color, fill=True)
    kde_ax.set_title("KDE Plot")
    kde_ax.set_xlabel(col)
    kde_ax.legend()

    # Panel 3: side-by-side boxplots.
    sns.boxplot(data=[before_values, after_values], ax=box_ax, palette=["skyblue", "lightgreen"])
    box_ax.set_xticklabels(["Before", "After"])
    box_ax.set_title("Boxplot")
    box_ax.set_ylabel(col)

    fig.tight_layout()
    fig.subplots_adjust(top=0.85)  # leave room for the figure title
    plt.show()

In [None]:
# Data Visualization
# Draws one grid of histograms: one row per continuous column, raw values on
# the left and processed values on the right.

# Load original dataset again (before transformation)
original_df = pd.read_csv('C:/Users/Felix Ee Jian Hui/Desktop/Degree/Sem4/FYP/Dataset/test.csv')

# Select continuous numeric columns (exclude ordinal/binary)
numeric_cols = original_df.select_dtypes(include=[np.number]).columns
excluded_cols = ['Grade', 'SES_Quartile', 'InternetAccess', 'Extracurricular', 'Romantic', 'PartTimeJob']
continuous_cols = [col for col in numeric_cols if col not in excluded_cols]

if continuous_cols:
    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row; without it a one-column dataset yields a 1-D array and the
    # axes[idx, 0] lookups below fail.
    fig, axes = plt.subplots(
        nrows=len(continuous_cols),
        ncols=2,
        figsize=(12, 4 * len(continuous_cols)),
        squeeze=False,
    )

    # Plot original vs transformed for each feature
    for idx, col in enumerate(continuous_cols):
        # Left column: raw values, missing entries removed
        axes[idx, 0].hist(original_df[col].dropna(), bins=30, alpha=0.7, edgecolor='black', color='skyblue')
        axes[idx, 0].set_title(f'{col} - Before Transformation', fontsize=12)
        axes[idx, 0].set_xlabel(col)
        axes[idx, 0].set_ylabel('Frequency')
        axes[idx, 0].grid(True)

        # Right column: values after the preprocessing steps
        axes[idx, 1].hist(dataset[col], bins=30, alpha=0.7, edgecolor='black', color='lightgreen')
        axes[idx, 1].set_title(f'{col} - After Transformation', fontsize=12)
        axes[idx, 1].set_xlabel(col)
        axes[idx, 1].set_ylabel('Frequency')
        axes[idx, 1].grid(True)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() rejects a grid with zero rows, so there is nothing to draw.
    print("No continuous numeric columns to plot.")