In [None]:
# !pip install ezodf
# !pip install fancyimpute
# ! pip install kmodes
# !pip install gower
# !pip install pyod

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import ezodf
import gower

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer
from kmodes.kmodes import KModes
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
def read_ods_file(file_path, sheet_name=0):
    ezodf.config.set_table_expand_strategy('all')  # Expand all rows and columns
    spreadsheet = ezodf.opendoc(file_path)
    sheet = spreadsheet.sheets[sheet_name]

    data = []
    for row in sheet.rows():
        row_data = [cell.value for cell in row]
        data.append(row_data)
    df = pd.DataFrame(data[1:], columns=data[0])
    
    return df

file_path = 'Machine Learning Challenge.ods'
raw_data = read_ods_file(file_path, sheet_name=0)
raw_data.head()

Unnamed: 0,consumer_id,gender,has_gender,has_first_name,has_last_name,has_email,has_dob,customer_age,account_age,account_last_updated,account_status,app_downloads,unique_offer_clicked,total_offer_clicks,unique_offer_impressions,total_offer_impressions,avg_redemptions,min_redemptions,max_redemptions,total_offers_redeemed
0,1284b75c-ecae-4015-8e3d-359c0347ede8,,0.0,1.0,1.0,1.0,0.0,,188.0,174.0,0.0,1.0,3.0,3.0,8.0,52.0,1.0,1.0,1.0,1.0
1,128af162-d2c3-4fe4-986c-359c8bdc6c04,,0.0,1.0,1.0,1.0,0.0,,185.0,174.0,0.0,1.0,3.0,8.0,5.0,58.0,1.0,1.0,1.0,1.0
2,12aada5e-36eb-4e9e-8d62-359c076c1b40,,0.0,1.0,1.0,1.0,0.0,,188.0,174.0,0.0,1.0,3.0,4.0,9.0,183.0,1.0,1.0,1.0,3.0
3,12c2e02f-bc79-4048-83ba-359cd3280dcf,,0.0,1.0,1.0,1.0,0.0,,184.0,174.0,0.0,1.0,3.0,7.0,4.0,56.0,1.0,1.0,1.0,1.0
4,12fabdf0-0582-489e-a6d3-35509ab8ae6f,f,1.0,1.0,1.0,1.0,1.0,22.0,572.0,173.0,0.0,1.0,4.0,6.0,7.0,83.0,1.0,1.0,1.0,3.0


In [3]:
df = raw_data.copy()
df = df.drop(['consumer_id', 'account_status'], axis=1) 

In [4]:
len(raw_data["consumer_id"].value_counts()) == raw_data.shape[0] # to indicate if consumer id repeats
# since they don't match, it could have the possibility of duplicate records

False

In [5]:
# duplicate = raw_data[raw_data.duplicated()]
# duplicate.shape

In [7]:
# distinct_consumers_count = df["consumer_id"].value_counts()
# repeated_consumers = distinct_consumers_count[distinct_consumers_count > 1].index.tolist()
# # repeated_consumers

In [8]:
# raw_data[raw_data["consumer_id"] == '91532ab0-a763-464b-a293-359cdbd0d058']

In [9]:
df.columns

Index(['gender', 'has_gender', 'has_first_name', 'has_last_name', 'has_email',
       'has_dob', 'customer_age', 'account_age', 'account_last_updated',
       'app_downloads', 'unique_offer_clicked', 'total_offer_clicks',
       'unique_offer_impressions', 'total_offer_impressions',
       'avg_redemptions', 'min_redemptions', 'max_redemptions',
       'total_offers_redeemed'],
      dtype='object')

In [10]:
# for i in df.columns:
#     if i != "consumer_id":
#         print(df[i].value_counts())
#         print()

In [11]:
df.isnull().sum()

gender                      4522
has_gender                     0
has_first_name                 0
has_last_name                  0
has_email                      0
has_dob                        0
customer_age                5936
account_age                    0
account_last_updated           0
app_downloads                  0
unique_offer_clicked           0
total_offer_clicks             0
unique_offer_impressions       0
total_offer_impressions        0
avg_redemptions                0
min_redemptions                0
max_redemptions                0
total_offers_redeemed          0
dtype: int64

## Data Pre-processing

In [14]:
le = LabelEncoder()

df['gender'] = le.fit_transform(df['gender'])
df['gender'] = df['gender'].replace(2, np.nan)

In [25]:
df["gender"].value_counts()

gender
1.0    2864
0.0    2614
Name: count, dtype: int64

In [27]:
# fig, axes = plt.subplots(2, 2, figsize=(8, 6))

# # Account Age Histogram
# axes[0, 0].hist(df['account_age'], bins=30)
# axes[0, 0].set_title('Account Age Histogram')
# axes[0, 0].set_xlabel('Account Age')
# axes[0, 0].set_ylabel('Frequency')

# # Customer Age Histogram
# axes[0, 1].hist(df['customer_age'], bins=30)
# axes[0, 1].set_title('Customer Age Histogram')
# axes[0, 1].set_xlabel('Customer Age')
# axes[0, 1].set_ylabel('Frequency')

# # Log Transformation of Account Age Histogram
## df['account_age_log'] = np.log(df['account_age'] + 1)  # Add 1 to avoid log(0)
# axes[1, 0].hist(df['account_age_log'], bins=30)
# axes[1, 0].set_title('Log Transformed Account Age Histogram')
# axes[1, 0].set_xlabel('Log(Account Age)')
# axes[1, 0].set_ylabel('Frequency')

# # Log Transformation of Customer Age Histogram
## df['customer_age_log'] = np.log(df['customer_age'] + 1)  # Add 1 to avoid log(0)
# axes[1, 1].hist(df['customer_age_log'], bins=30)
# axes[1, 1].set_title('Log Transformed Customer Age Histogram')
# axes[1, 1].set_xlabel('Log(Customer Age)')
# axes[1, 1].set_ylabel('Frequency')

# plt.tight_layout()
# plt.show()


In [36]:
# 1. KNN Imputation
knn_imputer = KNNImputer(n_neighbors=5)
df_knn = df.copy()
df_knn[['gender', 'customer_age']] = knn_imputer.fit_transform(df_knn[['gender', 'customer_age']])
df_knn['gender'] = df_knn['gender'].apply(round)

# 2. MICE Imputation
mice_imputer = IterativeImputer(max_iter=10, random_state=42)
df_mice = df.copy()
df_mice[['gender', 'customer_age']] = mice_imputer.fit_transform(df_mice[['gender', 'customer_age']])
df_mice['gender'] = df_mice['gender'].apply(round)

# Handling non-negative values for log transformation (for customer_age and account_age)
df_knn['log_customer_age'] = np.log(df_knn['customer_age'] + 1) #np.log1p(df_knn['customer_age'])
df_knn['log_account_age'] = np.log(df_knn['account_age'] + 1) #np.log1p(df_knn['account_age']

df_mice['log_customer_age'] = np.log(df_mice['customer_age'] + 1)
df_mice['log_account_age'] = np.log(df_mice['account_age'] + 1)

In [30]:
# # Set up the 2x2 grid for KNN imputation
# fig, axs = plt.subplots(2, 2, figsize=(8,6))

# # Original customer_age (KNN imputed)
# sns.histplot(df_knn['customer_age'], kde=True, ax=axs[0, 0])
# axs[0, 0].set_title('KNN Imputed Customer Age')

# # Log-transformed customer_age
# sns.histplot(df_knn['customer_age_log'], kde=True, ax=axs[0, 1])
# axs[0, 1].set_title('Log Transformed Customer Age (KNN)')

# # Original account_age (KNN imputed)
# sns.histplot(df_knn['account_age'], kde=True, ax=axs[1, 0])
# axs[1, 0].set_title('KNN Imputed Account Age')

# # Log-transformed account_age
# sns.histplot(df_knn['account_age_log'], kde=True, ax=axs[1, 1])
# axs[1, 1].set_title('Log Transformed Account Age (KNN)')

# plt.tight_layout()
# plt.show()

In [38]:
# # Set up the 2x2 grid for MICE imputation
# fig, axs = plt.subplots(2, 2, figsize=(8,6))

# # Original customer_age (MICE imputed)
# sns.histplot(df_mice['customer_age'], kde=True, ax=axs[0, 0])
# axs[0, 0].set_title('MICE Imputed Customer Age')

# # Log-transformed customer_age
# sns.histplot(df_mice['customer_age_log'], kde=True, ax=axs[0, 1])
# axs[0, 1].set_title('Log Transformed Customer Age (MICE)')

# # Original account_age (MICE imputed)
# sns.histplot(df_mice['account_age'], kde=True, ax=axs[1, 0])
# axs[1, 0].set_title('MICE Imputed Account Age')

# # Log-transformed account_age
# sns.histplot(df_mice['account_age_log'], kde=True, ax=axs[1, 1])
# axs[1, 1].set_title('Log Transformed Account Age (MICE)')

# plt.tight_layout()
# plt.show()

In [40]:
#  MICE Imputation
mice_imputer = IterativeImputer(max_iter=10, random_state=42)
df_mice = df.copy()
df_mice[['gender', 'customer_age']] = mice_imputer.fit_transform(df_mice[['gender', 'customer_age']])
df_mice['gender'] = df_mice['gender'].apply(round)

df_mice['log_customer_age'] = np.log(df_mice['customer_age'] + 1)
df_mice['log_account_age'] = np.log(df_mice['account_age'] + 1)

#  Feature Engineering
df_mice['ctr'] = df_mice['total_offer_clicks'] / df_mice['total_offer_impressions']
df_mice['redemption_rate'] = df_mice['total_offers_redeemed'] / df_mice['total_offer_clicks']
df_mice['log_account_last_updated'] = np.log(df_mice['account_last_updated'])
df_mice['log_total_offer_impressions'] = np.log(df_mice['total_offer_impressions'])
df_mice['log_total_offer_clicks'] = np.log(df_mice['total_offer_clicks'])
df_mice['log_total_offers_redeemed'] = np.log(df_mice['total_offers_redeemed'])
df_mice['log_unique_offer_clicked'] = np.log(df_mice['unique_offer_clicked'])
df_mice['log_unique_offer_impressions'] = np.log(df_mice['unique_offer_impressions'])

df_mice = df_mice.drop(['customer_age',	'account_age', 'account_last_updated', 'total_offer_impressions', 
                        'total_offer_clicks', 'total_offers_redeemed', 'unique_offer_clicked', 'unique_offer_impressions'], axis=1)
df_mice.head()

Unnamed: 0,gender,has_gender,has_first_name,has_last_name,has_email,has_dob,app_downloads,avg_redemptions,min_redemptions,max_redemptions,log_customer_age,log_account_age,ctr,redemption_rate,log_account_last_updated,log_total_offer_impressions,log_total_offer_clicks,log_total_offers_redeemed,log_unique_offer_clicked,log_unique_offer_impressions
0,1,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,3.177367,5.241747,0.057692,0.333333,5.159055,3.951244,1.098612,0.0,1.098612,2.079442
1,1,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,3.177367,5.225747,0.137931,0.125,5.159055,4.060443,2.079442,0.0,1.098612,1.609438
2,1,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,3.177367,5.241747,0.021858,0.75,5.159055,5.209486,1.386294,1.098612,1.098612,2.197225
3,1,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,3.177367,5.220356,0.125,0.142857,5.159055,4.025352,1.94591,0.0,1.098612,1.386294
4,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.135494,6.350886,0.072289,0.5,5.153292,4.418841,1.791759,1.098612,1.386294,1.94591


In [42]:
df_mice['app_downloads'] = df_mice['app_downloads'].apply(lambda x: 1 if x == 1 else 0)

### Step 2: Clustering with K-Modes
has_columns = [col for col in df_mice.columns if col.startswith('has_')]
redemption_columns = [col for col in df_mice.columns if 'redemptions' in col]

# K-Modes Clustering for 'has_' columns
kmodes_has = KModes(n_clusters=2, init='Huang', n_init=5, verbose=1)
df_mice['has_cluster'] = kmodes_has.fit_predict(df_mice[has_columns])

# K-Modes Clustering for 'redemption' columns
kmodes_redemption = KModes(n_clusters=2, init='Huang', n_init=5, verbose=1)
df_mice['redemption_cluster'] = kmodes_redemption.fit_predict(df_mice[redemption_columns])

processed_data = df_mice.drop(has_columns + redemption_columns,axis=1)
processed_data.shape

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 1600, cost: 1855.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 8283.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 8283.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 1855.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 8283.0
Best run was number 1
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 37, cost: 52.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 37, cost: 52.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
R

(10000, 14)

In [43]:
processed_data.columns

Index(['gender', 'app_downloads', 'log_customer_age', 'log_account_age', 'ctr',
       'redemption_rate', 'log_account_last_updated',
       'log_total_offer_impressions', 'log_total_offer_clicks',
       'log_total_offers_redeemed', 'log_unique_offer_clicked',
       'log_unique_offer_impressions', 'has_cluster', 'redemption_cluster'],
      dtype='object')

In [48]:

# --- Isolation Forest --- #
isf_clf = IsolationForest() 
isf_clf.fit(processed_data)

isf_predictions = isf_clf.predict(processed_data)
isf_anomalies_indices = np.where(isf_predictions == -1)[0]

# --- DBSCAN --- #
dbscan = DBSCAN(eps=0.2, min_samples=5)
dbscan.fit(processed_data)

dbscan_predictions = dbscan.labels_
dbscan_anomalies_indices = np.where(dbscan_predictions == -1)[0]

# --- One-Class SVM --- #
ocsvm = OneClassSVM(nu=0.05, kernel="rbf")
ocsvm.fit(processed_data)
ocsvm_predictions = ocsvm.predict(processed_data)
ocsvm_anomalies_indices = np.where(ocsvm_predictions == -1)[0]

# --- Local Outlier Factor (LOF) --- #
lof = LocalOutlierFactor()
lof_predictions = lof.fit_predict(processed_data)
lof_anomalies_indices = np.where(lof_predictions == -1)[0]


print(f"Isolation Forest: {len(isf_anomalies_indices)} anomalies detected")
print(f"DBSCAN: {len(dbscan_anomalies_indices)} anomalies detected")
print(f"One-Class SVM: {len(ocsvm_anomalies_indices)} anomalies detected")
print(f"Local Outlier Factor (LOF): {len(lof_anomalies_indices)} anomalies detected")

Isolation Forest: 1915 anomalies detected
DBSCAN: 7085 anomalies detected
One-Class SVM: 499 anomalies detected
Local Outlier Factor (LOF): 585 anomalies detected


In [54]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Define Network parameters
input_dim = processed_data.shape[1]
encoding_dim = 6  
autoencoder = Autoencoder(input_dim, encoding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.01)

data_tensor = torch.Tensor(processed_data.values) # Convert to PyTorch tensors

# Train the autoencoder
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = autoencoder(data_tensor)
    loss = criterion(outputs, data_tensor)
    loss.backward()
    optimizer.step()

# Calculate reconstruction errors
reconstructed_data = autoencoder(data_tensor).detach().numpy()
reconstruction_errors = np.mean(np.square(processed_data.values - reconstructed_data), axis=1)

# Identify anomalies
threshold = np.percentile(reconstruction_errors, 95)
autoencoders_anomalies_indices = np.where(reconstruction_errors > threshold)[0]
# print("Detected anomalies:", anomalies_indices)
print(f"Autoencoders: {len(autoencoders_anomalies_indices)} anomalies detected")

Autoencoders: 500 anomalies detected
