In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pickle

# Read the five source tables from the pickles stored on Google Drive.
cards, order_details, orders, profiles, users = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '/content/drive/MyDrive/Colab Notebooks/data_loaded/cards.pkl',
        '/content/drive/MyDrive/Colab Notebooks/data_loaded/order_details.pkl',
        '/content/drive/MyDrive/Colab Notebooks/data_loaded/orders.pkl',
        '/content/drive/MyDrive/Colab Notebooks/data_loaded/profiles.pkl',
        '/content/drive/MyDrive/Colab Notebooks/data_loaded/users.pkl',
    )
]

# Preview the first rows of every table (same order as loaded) to inspect
# its columns before merging.
for table in (cards, order_details, orders, profiles, users):
    print(table.head())

# Build one wide table starting from profiles. Every join is a left join, so
# each profile row is kept even when the other tables have no match.
# users, orders and cards are joined on 'user.uid'; order_details is joined
# on 'order.uid' (available once orders has been merged in).
# NOTE(review): if orders / order_details / cards hold several rows per key,
# each left join repeats the rows already present — keep this in mind for any
# sum or count computed on `data` afterwards.
data = (
    profiles
    .merge(users, on='user.uid', how='left')
    .merge(orders, on='user.uid', how='left')
    .merge(order_details, on='order.uid', how='left')
    .merge(cards, on='user.uid', how='left')
)

# Report the number of missing values per column of the merged table.
print(data.isnull().sum())

# For the flag columns, treat a missing value as False and store the result
# with pandas' nullable 'boolean' dtype.
# NOTE(review): this presumes the *Id columns and 'item.discount' already hold
# bool-like values; identifiers or amounts may not convert — confirm.
for flag_column in ('isAnonymous', 'googleId', 'appleId', 'facebookId', 'item.discount'):
    data[flag_column] = data[flag_column].fillna(False).astype('boolean')

# Feature engineering: per-user aggregates, broadcast back onto every row of
# `data` so each row carries its user's totals.

# Number of distinct orders per user. nunique ignores missing 'order.uid'
# values, so users without any order get 0.
data['total_orders'] = data.groupby('user.uid')['order.uid'].transform('nunique')

# Total amount spent per user. The merged table repeats an order line once
# for every matching row of the tables joined after it (e.g. one copy per
# card of the user), so summing 'item.amount' directly over `data` would
# count the same line several times. De-duplicate on (user, item) first.
# Assumes 'item.amount' exists in order_details and that 'item.uid'
# identifies a single order line — TODO confirm.
unique_items = data.drop_duplicates(subset=['user.uid', 'item.uid'])
spent_per_user = unique_items.groupby('user.uid')['item.amount'].sum()
data['total_spent'] = data['user.uid'].map(spent_per_user)

# Average order value. Users without orders would yield 0 / 0 = NaN, which
# then leaks into the model features; define their average as 0 instead.
has_orders = data['total_orders'] > 0
data['avg_order_value'] = (
    data['total_spent'] / data['total_orders']
).where(has_orders, 0.0)

# Create churn column (you may want to define your own churn criteria).
# Example criterion: a customer whose total_spent is 0 or NaN is churned.
# fillna(0) makes the NaN case match that rule (NaN == 0 is False otherwise).
# NOTE(review): this label is a deterministic function of 'total_spent',
# which is also used as a feature, so any model trained on it will look
# near-perfect. Replace with a real churn definition before trusting metrics.
data['churn'] = (data['total_spent'].fillna(0) == 0).astype(int)

# Build the modelling table with ONE row per user. The features below are
# per-user aggregates repeated on every merged row, so without this step the
# same user appears many times and identical rows end up in both the training
# and the test set, inflating the evaluation.
user_rows = data.drop_duplicates(subset='user.uid')

# Feature selection (add more features as needed) and target.
X = user_rows[['total_orders', 'total_spent', 'avg_order_value']]
y = user_rows['churn']  # 1 = churned, 0 = retained (as built above)

# Split into training and testing sets. Stratify on the label so both sets
# keep the same churn ratio; fall back to a plain split when a class is too
# small for stratification (train_test_split needs >= 2 samples per class).
class_counts = y.value_counts()
can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Scale the features: fit on the training set only, then apply the same
# transformation to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train the model (RandomForest in this case).
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out set.
y_pred = model.predict(X_test)

# Evaluate the model.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save the model for future use.
with open('churn_prediction_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# The model was trained on scaled inputs, so the fitted scaler is needed to
# prepare new data at prediction time; persist it next to the model.
with open('churn_prediction_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


   card.uid assignedAt   birthday   status                    user.uid  \
1  00000011 2023-03-13 1985-02-21  pending  ogv7xt7089clwmbgmtam35zvjq   
2  00000022 2022-01-14 1985-09-28  pending  naxfhkqjhcqrx3nv6ge9kxenth   
4  00000044 2018-01-03        NaT    valid  70uymyvsxtusxwrrgva0bfen8q   
5  00000055 2018-01-03        NaT    valid  9w4drnej3ixwqtwkixk94r7rdn   
9  00000099 2019-02-08        NaT  pending  uhxdpccr8npu3lljw9osf0vlwy   

   is_fully_empty_not_assigned  
1                        False  
2                        False  
4                        False  
5                        False  
9                        False  
               item.uid           order.uid  item.date           product.uid  \
0  rwzvrd0hxoshtjtzx5ca  4y9zqqvldfqr9n2xnu 2024-07-05  owhlijmmescilimfiktp   
1  mmwaugpnnpvhrcepwiih  termljwsr2gecnzwks 2024-12-27   form:po9lflx23j2024   
2  bshibhxedpkdtmzpmczi  ko8axscf5grobnz9in 2024-07-02  9onsf7m6j7bnwakw2otu   
3  axrcvxzh7evw2ebakhho  ko8axscf5gro

In [None]:
# Serialize the trained classifier to disk in the working directory.
with open('churn_prediction_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [None]:
# Restore the classifier from the pickle written by this notebook.
# Only unpickle files you created yourself: pickle.load can execute
# arbitrary code from a tampered file.
with open('churn_prediction_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
