# Importing libraries and loading the data

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
import pandas as pd
print(pd.__version__)

import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

2.0.3


In [None]:
### Importing the initial file if it's in the same directory as in the Github
# When the file is not present (e.g. when running on Google Colab) this cell is
# skipped with a message instead of aborting "Run All", so the Google Colab
# import cell can load the same data from Drive.
try:
    df = pd.read_csv('data/PSV_merchandise_shared.csv')
except FileNotFoundError:
    print("Local file 'data/PSV_merchandise_shared.csv' not found - "
          "load the data through the Google Colab import cell instead.")

In [None]:
### Importing through Google Colab
# The google.colab package only exists inside a Colab runtime. Guarding the import
# lets the notebook run top-to-bottom locally (where the data comes from the
# local-file cell) as well as on Colab.
try:
    from google.colab import drive
except ImportError:
    print("Not running on Google Colab - skipping the Google Drive import.")
else:
    drive.mount('/content/gdrive/', force_remount=True)
    file_path = '/content/gdrive/MyDrive/DeepLearning/Data/PSV_merchandise_shared.csv'   #add file path
    df = pd.read_csv(file_path)

# Data cleaning before feature engineering

In [None]:
# Drop columns that are redundant for the model.
# Rationale per column:
# - 'Unnamed: 0': presumably an index column saved by an earlier export - TODO confirm.
# - 'DOB', 'year_of_birth': the exact birth date is not needed; 'age' is kept.
# - 'category_age': can be re-derived from 'age' if required.
# - 'category_distance_from_club': the absolute 'distance_from_club' is kept.
# - 'first_source_name', 'year_month_arrival', 'year_arrival': considered redundant.
# - 'merchandise_purchase_channel', 'merchandise_product_description2',
#   'merchandise_product_category': considered redundant.
columns_to_drop = [
    'Unnamed: 0',
    'DOB', 'year_of_birth',
    'category_age',
    'category_distance_from_club',
    'first_source_name', 'year_month_arrival', 'year_arrival',
    'merchandise_purchase_channel',
    'merchandise_product_description2',
    'merchandise_product_category',
]

df = df.drop(labels=columns_to_drop, axis='columns')

# Show the remaining columns to confirm the drop
print(df.columns)

Index(['fan_id', 'age', 'gender', 'distance_from_club', 'is_fanclub_member',
       'is_clubcard_member', 'is_supver_member', 'is_scc_holder',
       'total_spend_merchandise', 'total_spend_ticket', 'total_spend_other',
       'total_spend_all', 'merchandise_transaction_datetime',
       'merchandise_product_description1', 'merchandise_product_name',
       'merchandise_product_size', 'merchandise_transaction_price',
       'merchandise_order_value', 'merchandise_product_price',
       'merchandise_product_units', 'merchandise_order_id'],
      dtype='object')


In [None]:
### Removing instances that miss age or gender
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the previous DataFrame (SettingWithCopyWarning).
df = df.dropna(subset=["age", "gender"]).copy()

# Changing unknown distances to the average distance.
# pd.to_numeric(errors='coerce') turns 'Unknown' (and any other non-numeric entry,
# including missing values) into NaN; those are then filled with the integer mean
# of the known distances. Assigning the converted series makes the whole column
# numeric instead of a mix of numeric strings and one integer.
valid_distances = pd.to_numeric(df['distance_from_club'], errors='coerce')
average_distance = int(valid_distances.mean())
df['distance_from_club'] = valid_distances.fillna(average_distance)

# Getting rid of shipping costs and prints as they do not add anything to the model
non_product_descriptions = [
    'VERZENDKOSTEN',
    'BEDRUKKING TEAMSPORT / OVERIG',
    'BEDRUKKING PSV',
]
df = df[~df['merchandise_product_description1'].isin(non_product_descriptions)].copy()

# Changing gender to a binary value (M -> 0, F -> 1); any other value is left as is.
# Plain column assignment (rather than df.loc[:, 'gender'] = ...) lets the column
# take the dtype of the replaced values.
df['gender'] = df['gender'].replace({'M': 0, 'F': 1})

# Feature engineering

### Removing returned products and corresponding purchases

In [None]:
# Work on an independent copy so adding helper columns does not write into a
# slice of a previously filtered frame.
df = df.copy()

# Add match_key in the original dataframe for all transactions.
# Two transactions share a key when fan, product name, size and absolute
# transaction price are identical, i.e. a return lines up with a purchase.
df['match_key'] = (df['fan_id'].astype(str) + '-' +
                   df['merchandise_product_name'] + '-' +
                   df['merchandise_product_size'] + '-' +
                   df['merchandise_transaction_price'].abs().astype(str))

# Separate returns (negative units) and purchases (positive units)
is_return = df['merchandise_product_units'] < 0
is_purchase = df['merchandise_product_units'] > 0
returns = df[is_return]
purchases = df[is_purchase]

# Count occurrences of each match_key in purchases and returns
purchase_counts = purchases['match_key'].value_counts()
return_counts = returns['match_key'].value_counts()

# Create a DataFrame from purchase_counts and return_counts to handle multiple returns/purchases
counts_df = pd.DataFrame({
    'purchases': purchase_counts,
    'returns': return_counts
}).fillna(0)  # Fill NaN with 0 where there are no purchases or returns

# The minimum of purchase and return counts is how many can be matched
counts_df['matched'] = counts_df[['purchases', 'returns']].min(axis=1)

# Number of matchable transactions for the key of every row (0 when the key is
# missing or has no counterpart).
matched_quota = df['match_key'].map(counts_df['matched']).fillna(0)

# 0-based position of every row among the rows with the same key on the same
# side (return / purchase), in row order.
transaction_side = pd.Series(
    np.select([is_return, is_purchase], ['return', 'purchase'], default='other'),
    index=df.index,
    name='transaction_side',
)
rank_in_side = df.groupby(['match_key', transaction_side]).cumcount()

# Flag the first `matched` returns AND the first `matched` purchases per key.
# Previously only the return rows were flagged, so the purchases that had been
# returned stayed in the data as if the fan had kept the product.
has_key = df['match_key'].notna()
df['is_returned'] = (
    has_key
    & (is_return | is_purchase)
    & (rank_in_side < matched_quota)
)

# Amount of returns that could be matched, and of purchases removed with them
print("Flagged returns:", (df['is_returned'] & is_return).sum())
print("Flagged matching purchases:", (df['is_returned'] & is_purchase).sum())

# Removing all flagged returns together with their matching purchases
df = df[~df['is_returned']]

# Removing all remaining return transactions (returns without a matching purchase)
df = df[df['merchandise_product_units'] > 0]

# Optional: print the shape of the dataframe to see the number of rows after cleaning
print("DataFrame shape after removing flagged and all returns:", df.shape)

Flagged returns: 9635
DataFrame shape after removing flagged and all returns: (186156, 23)


### Adding a kid size column

In [None]:
### Kid Size check
# Define the list of child sizes
child_sizes = [62, 68, 74, 80, 88, 92, 98, 104, 110, 128, 140, 152, 164, 176]


def _parse_numeric_size(raw_size):
    """Return ``raw_size`` as an int, or None when it is not a usable number.

    Anything ``int()`` accepts is converted exactly as before. In addition,
    whole-number strings such as "128.0" are accepted, and missing values
    (None, NaN), infinities and non-numeric labels such as "XL" yield None
    instead of raising.
    """
    try:
        return int(raw_size)
    except (TypeError, ValueError, OverflowError):
        pass

    try:
        as_float = float(raw_size)
    except (TypeError, ValueError):
        return None
    # NaN and infinity are not integers, so they fall through to None
    return int(as_float) if as_float.is_integer() else None


# Function to check if the size or product category indicates a kid size
def check_if_child_size(row):
    """Return 1 when the row describes a kid-size product, otherwise 0.

    A row counts as kid size when its 'merchandise_product_size' is one of the
    numeric ``child_sizes``, or when its 'merchandise_product_description1'
    contains 'BABY'.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the keys 'merchandise_product_size' and
        'merchandise_product_description1'.

    Returns
    -------
    int
        1 for kid size, 0 otherwise. Missing size or description never raises.
    """
    size = _parse_numeric_size(row['merchandise_product_size'])
    if size is not None and size in child_sizes:
        return 1

    # Check the merchandise product category; a missing description (NaN/None)
    # is not a string and therefore cannot indicate a baby product.
    description = row['merchandise_product_description1']
    if isinstance(description, str) and 'BABY' in description:
        return 1
    return 0

# Evaluate the kid-size check row by row and store the 0/1 result as a new column
kid_size_flags = df.apply(check_if_child_size, axis=1)
df['is_kid_size'] = kid_size_flags


### Remove additional columns that were necessary for Feature Engineering

In [None]:
# Helper columns that were kept only for the feature-engineering steps
# (return matching and kid-size detection) and are no longer needed afterwards.
columns_to_drop = [
    'merchandise_order_id',
    'merchandise_product_size',
    'merchandise_transaction_datetime',
    'match_key', 'is_returned',
    'merchandise_transaction_price',
    'merchandise_order_value',
    'merchandise_product_units',
]

df = df.drop(labels=columns_to_drop, axis='columns')

In [None]:
### Removing duplicate rows: repeated identical rows would weigh on the model without adding information.
# Row counts observed on the original data:
#   before removing duplicates: 186156
#   after removing duplicates:  180661

# Keep the first occurrence of every fully identical row
is_repeated_row = df.duplicated(keep='first')
df = df[~is_repeated_row]


# Adding negative instances

In [None]:
# Step 0: Configuration
# Product-level columns: in a negative sample these come from the non-purchased
# product, every other column comes from the user.
# NOTE(review): `product_features` was referenced here but never defined in this
# notebook; the list below is reconstructed from the columns that remain at this
# point - confirm it matches the intended feature split.
product_features = [
    column for column in (
        'merchandise_product_description1',
        'merchandise_product_name',
        'merchandise_product_price',
        'is_kid_size',
    )
    if column in df.columns
]
NEGATIVES_PER_POSITIVE = 3       # negative samples drawn per distinct purchased product
NEGATIVE_SAMPLING_SEED = 42      # fixed seed so the sampled negatives are reproducible
rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

# Step 1: Generate negative (fan, product) pairs
# Get all unique products and, per user, the products they purchased
all_products = df['merchandise_product_name'].dropna().unique()
product_index = pd.Index(all_products)
purchased_by_user = df.groupby('fan_id', sort=False)['merchandise_product_name'].unique()

negative_pairs = []
for fan_id, purchased_products in purchased_by_user.items():
    # Identify products not purchased by this user
    non_purchased_products = product_index[~product_index.isin(purchased_products)]

    # Draw NEGATIVES_PER_POSITIVE negatives per distinct purchased product, capped
    # at the number of products available: sampling without replacement raises
    # when more items are requested than exist.
    num_positive = int(pd.notna(purchased_products).sum())
    num_neg_samples = min(NEGATIVES_PER_POSITIVE * num_positive, len(non_purchased_products))
    if num_neg_samples == 0:
        continue

    sampled_non_purchased_products = rng.choice(
        non_purchased_products.to_numpy(), size=num_neg_samples, replace=False
    )
    negative_pairs.extend((fan_id, product_name) for product_name in sampled_non_purchased_products)

# Cast the key columns to the dtypes used in df so the merges below also work
# when no negative pair was generated.
negative_pairs_df = pd.DataFrame(
    negative_pairs, columns=['fan_id', 'merchandise_product_name']
).astype({
    'fan_id': df['fan_id'].dtype,
    'merchandise_product_name': df['merchandise_product_name'].dtype,
})

# One representative row per user / per product (the first one encountered),
# looked up once instead of re-filtering the full frame for every sample.
user_columns = [
    column for column in df.columns
    if column not in product_features and column != 'interaction'
]
user_rows = df.drop_duplicates(subset='fan_id')[user_columns]
product_rows = df.drop_duplicates(subset='merchandise_product_name')[product_features]

# Attach user data and product data to every negative pair; interaction = 0
negative_samples_df = (
    negative_pairs_df
    .merge(user_rows, on='fan_id', how='left', validate='many_to_one')
    .merge(product_rows, on='merchandise_product_name', how='left', validate='many_to_one')
    .assign(interaction=0)
)

# Step 2: Combine the original data with the negative samples.
# The observed purchases are the positive class and must be labelled explicitly,
# otherwise their 'interaction' value is NaN after the concat.
positive_samples_df = df.assign(interaction=1)
df_full = pd.concat(
    [positive_samples_df, negative_samples_df[positive_samples_df.columns]],
    ignore_index=True,
)

# Step 3: Shuffle the DataFrame to mix positive and negative samples
df_full = df_full.sample(frac=1, random_state=NEGATIVE_SAMPLING_SEED).reset_index(drop=True)

# Exporting

In [None]:
### Write the combined positive/negative sample set to CSV (without the index column)
output_path = '/content/gdrive/MyDrive/DeepLearning/Data/psv_processed.csv'
df_full.to_csv(output_path, index=False)

In [None]:
### Temp
# Quick visual check of the cleaned positive-only frame (before negative sampling)
display(df)