# Data Prep and Inspection

Importing Packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" theme to every plot drawn from here on
sns.set_theme(style="whitegrid")
# Shared "Set2" colour list; the pie-chart cell slices it per category count
palette = sns.color_palette("Set2")

# Simple confirmation that the import/setup cell ran to completion
print("Packages successfully imported!")

Importing the data

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory
data = pd.read_csv("data/data_raw.csv")

# Only reached when read_csv did not raise
print("Data successfully imported!")

### Inspecting Data

In [None]:
# Preview the first rows to check column names and value formats
data.head()

In [None]:
# Report the number of rows in the raw dataset
row_count = len(data)
print(f"The dataset has {row_count} rows")

In [None]:
# Count how many distinct Customer IDs appear on more than one row
id_counts = data['Customer ID'].value_counts()
repeated_ids = id_counts[id_counts > 1]
dupl = repeated_ids.size
print(f"Duplicates: {dupl}")
# Author's observation for this dataset: no customer duplicates

Checking missing values

In [None]:
# Number of missing (NA) entries in each column
data.isna().sum()

We will drop the rows that contain missing values, since there are only two of them.

In [None]:
# Keep only the rows that have a value in every column
data_full = data.dropna(axis=0, how="any")

remaining_rows = len(data_full)
print(f"The new dataset has {remaining_rows} rows")

Checking for missing data

Inspecting counts and summary stats for cat and numeric vars, respectively

In [None]:
# Descriptive statistics; describe() is used here for the numeric columns
numeric_stats = data_full.describe()

# Frequency table for every text/categorical column
categorical_columns = data_full.select_dtypes(include=['object', 'category']).columns
categorical_values = {}
for name in categorical_columns:
    categorical_values[name] = data_full[name].value_counts()

# Print the numeric summary first, then one frequency table per column
print("Numeric Stats:\n", numeric_stats)
print("\nCategorical Values:")
for name, counts in categorical_values.items():
    print(f"\n{name}:\n", counts)


Plotting pie charts for the categorical vars

In [None]:
# One pie chart per categorical column, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(9, 8))

# Work with a 1-D array of axes so a single index addresses each subplot
axes = axes.ravel()

for i, col in enumerate(categorical_columns):
    ax = axes[i]

    # Share of each category within this column
    counts = data_full[col].value_counts()

    ax.pie(
        counts,
        labels=counts.index,
        autopct='%1.1f%%',
        startangle=140,
        colors=palette[:len(counts)],  # one palette colour per category
    )
    ax.set_title(f'Distribution of {col}', fontsize=14, fontweight='bold')

    # Equal aspect ratio keeps the pie circular
    ax.axis('equal')

# Stop titles and labels of neighbouring subplots from overlapping
plt.tight_layout()

plt.show()

The categorical variables appear to be very evenly distributed. Next, we will inspect the numeric variables.

In [None]:
# All numeric columns (at this point this still includes "Customer ID")
numeric_columns = data_full.select_dtypes(include=['number']).columns

# 2x3 grid of box plots, flattened so each subplot is addressed by one index
fig, axes = plt.subplots(2, 3, figsize=(12, 10))
axes = axes.ravel()

# Look the colour list up once; column i is drawn in colour i
box_colours = sns.color_palette("Set2")

for i, col in enumerate(numeric_columns):
    ax = axes[i]
    sns.boxplot(y=data_full[col], ax=ax, color=box_colours[i])
    ax.set_title(f'Box Plot of {col}', fontsize=14, fontweight='bold')

# Stop titles and labels of neighbouring subplots from overlapping
plt.tight_layout()

plt.show()

In [None]:
# Drop "Customer ID" (presumably just an identifier) from the numeric columns.
# Later cells that use numeric_columns therefore skip it as well.
numeric_columns = [col for col in numeric_columns if col != "Customer ID"]

# 2x3 grid of histograms, flattened so each subplot is addressed by one index
fig, axes = plt.subplots(2, 3, figsize=(9, 8))
axes = axes.flatten()

# Look the colour list up once instead of on every loop iteration
hist_colours = sns.color_palette("Set2")

for i, col in enumerate(numeric_columns):
    ax = axes[i]

    # Histogram of the column's values, 20 bins
    ax.hist(data_full[col], bins=20, color=hist_colours[i], edgecolor='black')

    ax.set_title(f'Distribution of {col}', fontsize=11, fontweight='bold')
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)

# The grid has six cells; hide any that received no histogram so the figure
# does not show an empty framed subplot
for unused_ax in axes[len(numeric_columns):]:
    unused_ax.set_visible(False)

# Stop titles and labels of neighbouring subplots from overlapping
plt.tight_layout()

plt.show()

# Cleaning the data

While the data is already clean (in the proper structure), we will need to label-encode the categorical variables (map each category to an integer code) for modelling.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode a copy so data_full keeps the original text labels
data_encoded = data_full.copy()

# A single encoder object, re-fitted for every column in the loop below
label_encoder = LabelEncoder()

# Per-column lookup of {original label: integer code}
label_encoders = {}

text_columns = data_encoded.select_dtypes(include=['object', 'category']).columns
for col in text_columns:
    # Replace the labels of this column by integer codes
    data_encoded[col] = label_encoder.fit_transform(data_encoded[col])

    # Record the mapping now, because the next fit overwrites classes_
    codes = label_encoder.transform(label_encoder.classes_)
    mapping = {}
    for class_label, code in zip(label_encoder.classes_, codes):
        mapping[class_label] = int(code)
    label_encoders[col] = mapping

    print(f"Encoded {col}: {label_encoders[col]}")

# To decode a column later, invert its mapping, e.g.
#   {v: k for k, v in label_encoders[<column name>].items()}
data_encoded.head()

# Persist the encoded data (the path is relative to the working directory)
data_encoded.to_csv('data/data_encoded.csv')


Now, we will standardise the numerical values (using `StandardScaler`) so that each variable contributes equally.

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
colnames = data_encoded.columns

# Scale a copy so data_encoded keeps the unscaled values
data_normal = data_encoded.copy()

# Fit the scaler on the numeric columns and overwrite them with the scaled
# values; numeric_columns is whatever the earlier cells left it as
scaled_values = scaler.fit_transform(data_normal[numeric_columns])
data_normal[numeric_columns] = scaled_values

data_normal.head()

In [None]:
# For each numeric column, draw the histogram before and after scaling side by
# side; the shape of the two distributions should be identical
before_colour, after_colour = sns.color_palette("Set2")[:2]

for col in numeric_columns:
    fig, axes = plt.subplots(1, 2, figsize=(9, 8))
    ax_before, ax_after = axes

    sns.histplot(data_encoded[col], ax=ax_before, color=before_colour)
    ax_before.set_title(f'Histogram of {col} before normalization', fontsize=8, fontweight='bold')

    sns.histplot(data_normal[col], ax=ax_after, color=after_colour)
    ax_after.set_title(f'Histogram of {col} After normalization', fontsize=8, fontweight='bold')

    plt.show()

In [None]:
# Saving normalised data
# NOTE(review): to_csv is called without index=False, so the DataFrame index is
# presumably written as an extra first column — confirm readers expect that
data_normal.to_csv('data/data_normal.csv')

### Comparing Variables

Using Pairplot to compare variables

In [None]:
# Pairwise plots for every pair of columns in the encoded data
sns.pairplot(data_encoded)
plt.show()


This is a bit crowded, let's try something else!

We'll compare the correlations of each variable with each other! Using this, we can discern which variables we want to take a closer look at.

In [None]:
# Pairwise correlations between all columns of the scaled data
corr = data_normal.corr()

# True on and above the diagonal: those cells are hidden so that every pair is
# shown only once
mask = np.triu(np.ones(corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))

# Diverging colour map centred on zero
cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    annot=True,  # print the coefficient inside each cell
)

plt.title("Correlation Heatmap", fontsize=18)
plt.show()

First things first, obviously customer ID isn't related to any variable, and items purchased is highly correlated with items bought. One notable insight from this plot is Total Spend being so highly correlated with City, we'll investigate this further!

Overall, we will investigate further every relationship whose absolute correlation is at least 0.7 (|r| ≥ 0.7).

In [None]:
# Absolute pairwise correlations of the encoded data
abs_corr = data_encoded.corr().abs()

# Keep only the cells strictly above the diagonal and set everything else to 0,
# so self-correlations are dropped and each pair is counted once.
# DataFrame.where builds a new frame rather than writing through `.values`,
# which fails when pandas hands back a read-only array (Copy-on-Write).
upper_only = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
corr_matrix = abs_corr.where(upper_only, 0)

# Stack first and filter afterwards, so the result does not depend on
# whether stack() drops NaN entries
stacked = corr_matrix.stack()
high_corr_pairs = stacked[stacked >= 0.7].sort_values(ascending=False)

# One line per pair: (row variable, column variable) and their |r|
print("Highly correlated pairs with r >= ±0.7:\n")
for index, value in high_corr_pairs.items():
    print(f"{index[0]:<25}~ {index[1]:<30}--> r = {value:.2f}")

In [None]:
for index, value in high_corr_pairs.items():
    var1 = index[0]
    var2 = index[1]
    