In [4]:
import pandas as pd

# Load the data from the provided CSV files
purchase_history_path = 'data/raw/purchase_history.csv'
customer_interactions_path = 'data/raw/customer_interactions.csv'
product_details_path = 'data/raw/product_details.csv'

# Reading the CSV files
purchase_history = pd.read_csv(purchase_history_path, delimiter=";")
customer_interactions = pd.read_csv(customer_interactions_path)
product_details = pd.read_csv(product_details_path, delimiter=";")

purchase_history.dropna(axis=1, how='all', inplace=True)
product_details.dropna(axis=1, how='all', inplace=True)

# Display the first few rows of each dataframe
purchase_history_head = purchase_history.head()
customer_interactions_head = customer_interactions.head()
product_details_head = product_details.head()

(purchase_history_head, customer_interactions_head, product_details_head)


(   customer_id  product_id purchase_date
 0            1         101    2023-01-01
 1            1         105    2023-01-05
 2            2         102    2023-01-02
 3            3         103    2023-01-03
 4            4         104    2023-01-04,
    customer_id  page_views  time_spent
 0            1          25         120
 1            2          20          90
 2            3          30         150
 3            4          15          80
 4            5          22         110,
    product_id        category  price  ratings
 0         101     Electronics    500      4.5
 1         102        Clothing     50      3.8
 2         103  Home & Kitchen    200      4.2
 3         104          Beauty     30      4.0
 4         105     Electronics    800      4.8)

In [5]:
# Merging purchase_history with customer_interactions on 'customer_id'
merged_data = pd.merge(purchase_history, customer_interactions, on='customer_id', how='left')

# Merging the result with product_details on 'product_id'
final_data = pd.merge(merged_data, product_details, on='product_id', how='left')

# Feature Engineering
# Convert 'purchase_date' to datetime format
final_data['purchase_date'] = pd.to_datetime(final_data['purchase_date'])

# Create features
final_data['total_purchases'] = final_data.groupby('customer_id')['product_id'].transform('count')
final_data['total_spend'] = final_data.groupby('customer_id')['price'].transform('sum')
final_data['avg_product_rating'] = final_data.groupby('customer_id')['ratings'].transform('mean')
final_data['days_since_last_purchase'] = (final_data['purchase_date'].max() - final_data['purchase_date']).dt.days

# Display the merged and enriched dataset
final_data.head()

Unnamed: 0,customer_id,product_id,purchase_date,page_views,time_spent,category,price,ratings,total_purchases,total_spend,avg_product_rating,days_since_last_purchase
0,1,101,2023-01-01,25,120,Electronics,500,4.5,2,1300,4.65,4
1,1,105,2023-01-05,25,120,Electronics,800,4.8,2,1300,4.65,0
2,2,102,2023-01-02,20,90,Clothing,50,3.8,1,50,3.8,3
3,3,103,2023-01-03,30,150,Home & Kitchen,200,4.2,1,200,4.2,2
4,4,104,2023-01-04,15,80,Beauty,30,4.0,1,30,4.0,1


In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Reinitializing OneHotEncoder without the 'sparse' parameter
encoder = OneHotEncoder(drop='first')  # Drop first to avoid multicollinearity

# Encoding the categorical feature
encoded_categories = encoder.fit_transform(final_data[['category']]).toarray()
encoded_categories_df = pd.DataFrame(encoded_categories, 
                                     columns=[f"category_{x}" for x in encoder.categories_[0][1:]])

# Concatenating the encoded categorical features back to the main dataframe
final_data_encoded = pd.concat([final_data.reset_index(drop=True), encoded_categories_df.reset_index(drop=True)], axis=1)

# Dropping the original 'category' column as it's now encoded
final_data_encoded.drop('category', axis=1, inplace=True)

# Display the transformed dataframe
final_data_encoded.head()


Unnamed: 0,customer_id,product_id,purchase_date,page_views,time_spent,price,ratings,total_purchases,total_spend,avg_product_rating,days_since_last_purchase,category_Clothing,category_Electronics,category_Home & Kitchen
0,1,101,2023-01-01,0.666667,0.571429,0.61039,4.5,2,1.0,4.65,1.0,0.0,1.0,0.0
1,1,105,2023-01-05,0.666667,0.571429,1.0,4.8,2,1.0,4.65,0.0,0.0,1.0,0.0
2,2,102,2023-01-02,0.333333,0.142857,0.025974,3.8,1,0.015748,3.8,0.75,1.0,0.0,0.0
3,3,103,2023-01-03,1.0,1.0,0.220779,4.2,1,0.133858,4.2,0.5,0.0,0.0,1.0
4,4,104,2023-01-04,0.0,0.0,0.0,4.0,1,0.0,4.0,0.25,0.0,0.0,0.0


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = final_data_encoded.drop(['customer_id', 'product_id', 'purchase_date', 'category_Electronics'], axis=1)
y = final_data_encoded['category_Electronics']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
logreg = LogisticRegression()

# Train the model
logreg.fit(X_train, y_train)

# Predict on the testing set
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.0,
 '              precision    recall  f1-score   support\n\n         0.0       0.00      0.00      0.00       0.0\n         1.0       0.00      0.00      0.00       2.0\n\n    accuracy                           0.00       2.0\n   macro avg       0.00      0.00      0.00       2.0\nweighted avg       0.00      0.00      0.00       2.0\n')

In [9]:
accuracy

0.0

In [10]:
 classification_rep

'              precision    recall  f1-score   support\n\n         0.0       0.00      0.00      0.00       0.0\n         1.0       0.00      0.00      0.00       2.0\n\n    accuracy                           0.00       2.0\n   macro avg       0.00      0.00      0.00       2.0\nweighted avg       0.00      0.00      0.00       2.0\n'