In [1]:
import pandas as pd
from google.cloud import storage
from io import BytesIO

In [2]:
storage1 = storage.Client()
bucket = storage1.bucket("final_demo2_blackfriday")
blob = bucket.blob("train.csv")
black_friday_data = blob.download_as_bytes()

df = pd.read_csv(BytesIO(black_friday_data))    

In [3]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [5]:
# Handle missing values before encoding
df['Product_Category_2'] = df['Product_Category_2'].fillna(0)
df['Product_Category_3'] = df['Product_Category_3'].fillna(0)
       
# Drop columns not needed for demographic analysis
df.drop(['User_ID', 'Product_ID'], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,F,0-17,10,A,2,0,3,0.0,0.0,8370
1,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,F,0-17,10,A,2,0,12,0.0,0.0,1422
3,F,0-17,10,A,2,0,12,14.0,0.0,1057
4,M,55+,16,C,4+,0,8,0.0,0.0,7969


In [7]:
df['Gender'] = df['Gender'].replace({'F': 0, 'M': 1})
age_mapping = {'0-17': 0, '18-25': 1, '26-35': 2, '36-45': 3, '46-50': 4, '51-55': 5, '55+': 6}
df['Age'] = df['Age'].map(age_mapping)
df['City_Category'] = df['City_Category'].replace({'A': 0, 'B': 1, 'C': 2})
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].str.replace('+','')

  df['Gender'] = df['Gender'].replace({'F': 0, 'M': 1})
  df['City_Category'] = df['City_Category'].replace({'A': 0, 'B': 1, 'C': 2})


In [8]:
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,0,10,0,2,0,3,0.0,0.0,8370
1,0,0,10,0,2,0,1,6.0,14.0,15200
2,0,0,10,0,2,0,12,0.0,0.0,1422
3,0,0,10,0,2,0,12,14.0,0.0,1057
4,1,6,16,2,4,0,8,0.0,0.0,7969


In [9]:
df[['Occupation','Stay_In_Current_City_Years', 'Marital_Status','Product_Category_1','Product_Category_2']]=df[['Occupation','Stay_In_Current_City_Years', 'Marital_Status','Product_Category_1','Product_Category_2']].astype('int64')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      550068 non-null  int64  
 1   Age                         550068 non-null  int64  
 2   Occupation                  550068 non-null  int64  
 3   City_Category               550068 non-null  int64  
 4   Stay_In_Current_City_Years  550068 non-null  int64  
 5   Marital_Status              550068 non-null  int64  
 6   Product_Category_1          550068 non-null  int64  
 7   Product_Category_2          550068 non-null  int64  
 8   Product_Category_3          550068 non-null  float64
 9   Purchase                    550068 non-null  int64  
dtypes: float64(1), int64(9)
memory usage: 42.0 MB


In [11]:
selected_features_blob_name = 'selected_features_names.json'
local_selected_features_path = '/tmp/selected_features_names.json'
# Download the selected features file from GCS
selected_features_blob = bucket.blob(selected_features_blob_name)
selected_features_blob.download_to_filename(local_selected_features_path)

In [15]:
model_blob_name = 'model_.joblib'
local_model_path = '/tmp/model.joblib'
# Download the model from GCS
model_blob = bucket.blob(model_blob_name)
model_blob.download_to_filename(local_model_path)
# Load the model
import joblib
model = joblib.load(local_model_path)

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Function to split the dataset by unique values in a column
def split_by_column(df, column):
    unique_values = df[column].unique()
    subsets = {value: df[df[column] == value] for value in unique_values}
    return subsets

# Function to train and evaluate model for a given DataFrame
def train_and_evaluate(df, features, target, model):
    if len(df) < 2:
        # Not enough data to split into train and test
        return None, None, None, None
    
    X = df[features]
    y = df[target]
    
    # Splitting into training and testing datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Making predictions
    y_pred = model.predict(X_test)
    
    # Calculating metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    
    return rmse, mae, y_test, y_pred

# Function to calculate fairness and bias metrics
def calculate_fairness_metrics(group_a, group_b):
    metrics = {}
    
    # RMSE Ratio
    metrics['RMSE Ratio'] = group_a['RMSE'] / group_b['RMSE'] if group_b['RMSE'] != 0 else np.inf
    
    # MAE Ratio
    metrics['MAE Ratio'] = group_a['MAE'] / group_b['MAE'] if group_b['MAE'] != 0 else np.inf
    
    
    return metrics

# Function to evaluate and compare performance for each unique value in a column
def evaluate_column(df, column, features, target, model):
    subsets = split_by_column(df, column)
    performance_metrics = []

    for value, subset in subsets.items():
        rmse, mae, y_test, y_pred = train_and_evaluate(subset, features, target, model)
        if rmse is not None and mae is not None:
            performance_metrics.append({
                column: value,
                'RMSE': rmse,
                'MAE': mae,
                'y_test': y_test,
                'y_pred': y_pred
            })
            print(f"Subset for {column} = {value}")
            print(f"RMSE: {rmse}")
            print(f"MAE: {mae}")
            print("\n")
    
    # Calculate fairness and bias metrics between the first two unique groups
    if len(performance_metrics) >= 2:
        group_a = performance_metrics[0]
        group_b = performance_metrics[1]
        fairness_metrics = calculate_fairness_metrics(group_a, group_b)
        print("Fairness and Bias Metrics:")
        for metric, value in fairness_metrics.items():
            print(f"{metric}: {value}")

    return pd.DataFrame(performance_metrics)

In [17]:
# Define the features and target column
features = ['Gender','Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']
target = 'Purchase'

# Evaluate performance for 'Gender' column
gender_performance = evaluate_column(df, 'Gender', features, target, model)
print("Performance for 'Gender':")
print(gender_performance)

Subset for Gender = 0
RMSE: 3070.776020978545
MAE: 2320.9866660227435


Subset for Gender = 1
RMSE: 3341.6564438941323
MAE: 2547.8547595577584


Fairness and Bias Metrics:
RMSE Ratio: 0.9189382788255989
MAE Ratio: 0.910957211087498
Performance for 'Gender':
   Gender         RMSE          MAE  \
0       0  3070.776021  2320.986666   
1       1  3341.656444  2547.854760   

                                              y_test  \
0  170480     7553
280707     6933
480003     460...   
1  383524    16549
499313     9741
256187    1193...   

                                              y_pred  
0  [4080.2046, 5495.811, 4080.2046, 6731.9233, 11...  
1  [13615.688, 6821.2266, 12005.329, 4080.2046, 5...  
