In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

In [4]:
# Load the CSV file from Google Drive
file_path = '/content/drive/My Drive/unnested.csv'
copurchases = pd.read_csv(file_path)

In [5]:
copurchases.head(10)

Unnamed: 0,product_id,category_code,co_purchase_id,co_purchase_category,price,brand,hour,day_of_week,event_type
0,6200687,environment.air_heater,6200259,environment.air_heater,28.03,oasis,0,1,purchase
1,6200687,environment.air_heater,6200724,environment.air_heater,28.03,oasis,0,1,purchase
2,6200687,environment.air_heater,3800985,iron,28.03,oasis,0,1,purchase
3,6200687,environment.air_heater,6200704,environment.air_heater,28.03,oasis,0,1,purchase
4,6200687,environment.air_heater,6200806,environment.air_heater,28.03,oasis,0,1,purchase
5,3200361,kitchen.meat_grinder,3200532,kitchen.meat_grinder,120.95,philips,2,1,purchase
6,3200361,kitchen.meat_grinder,10900210,kitchen.mixer,120.95,philips,2,1,purchase
7,3200361,kitchen.meat_grinder,2501889,kitchen.oven,120.95,philips,2,1,purchase
8,3200361,kitchen.meat_grinder,2800007,kitchen.refrigerators,120.95,philips,2,1,purchase
9,3200361,kitchen.meat_grinder,3200305,kitchen.meat_grinder,120.95,philips,2,1,purchase


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Display the first few rows of the dataframe to ensure it loaded correctly
copurchases.head()

# Filter the dataset for purchase events
purchase_data = copurchases[copurchases['event_type'] == 'purchase']

# Create a user-item matrix where rows represent 'product_id' and columns represent 'co_purchase_id'
product_co_matrix = purchase_data.pivot_table(index='product_id', columns='co_purchase_id', aggfunc='size', fill_value=0)

# Compute the cosine similarity between products
product_similarity = cosine_similarity(product_co_matrix)

# Convert the similarity matrix to a DataFrame for easy handling
product_similarity_df = pd.DataFrame(product_similarity, index=product_co_matrix.index, columns=product_co_matrix.index)

# Function to get the top N co-purchases for a given product_id
def get_top_n_co_purchases(product_id, n=5):
    if product_id in product_similarity_df:
        similar_products = product_similarity_df[product_id].sort_values(ascending=False)
        top_n_products = similar_products.head(n + 1).index.tolist()
        top_n_products.remove(product_id)
        return top_n_products[:n]
    else:
        return []

# Example usage:
product_id = 6200687
top_n_co_purchases = get_top_n_co_purchases(product_id, n=5)
print(f"Top 5 co-purchases for product {product_id}: {top_n_co_purchases}")

Top 5 co-purchases for product 6200687: [4400371, 4900429, 2800387, 3901010, 3801057]


In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

# Assuming 'data' is your dataframe and 'co_purchase' is the target variable
X = copurchases.drop('co_purchase_id', axis=1)
y =copurchases['co_purchase_id']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    mse = mean_squared_error(y_test, y_pred)
    auroc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
    cm = confusion_matrix(y_test, y_pred)

    print(f"{model_name} Evaluation:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Mean Squared Error:", mse)
    print("AUROC:", auroc)
    print("Confusion Matrix:\n", cm)

    # Feature Importance for applicable models
    if hasattr(model, 'feature_importances_'):
        feature_importances = model.feature_importances_
        features = X.columns
        plt.figure(figsize=(10, 6))
        plt.barh(features, feature_importances, align='center')
        plt.xlabel('Feature Importance')
        plt.title(f'Feature Importance for {model_name}')
        plt.show()

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'mse': mse,
        'auroc': auroc,
        'confusion_matrix': cm
    }


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate the features and target
X = co_purchase_id.drop('copurchases_id', axis=1)
y = co_purchase_id['copurchases_id']

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Create a column transformer with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # Leave other columns unchanged
)

# Create a pipeline to apply the preprocessor and the model
def create_pipeline(model):
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


NameError: name 'co_purchase_id' is not defined