In [None]:
import os

import joblib
import numpy as np
import pandas as pd
import psycopg2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sqlalchemy import create_engine
from xgboost import XGBClassifier

# ✅ DATABASE CONNECTION
# The connection string is read from the DATABASE_URL environment variable when
# it is set (and non-empty), so credentials can be supplied without editing or
# committing this file.
# SECURITY NOTE(review): the fallback literal below embeds a database password
# in source. It is kept only so existing runs keep working; rotate the
# credential and delete the literal once every environment sets DATABASE_URL.
DATABASE_URL = os.environ.get("DATABASE_URL") or (
    "postgresql://neondb_owner:npg_GUJjKT8F7Hts@ep-crimson-voice-a52x64c8.us-east-2.aws.neon.tech/neondb?sslmode=require"
)
engine = create_engine(DATABASE_URL)

# ✅ Load dataset from PostgreSQL
def load_data():
    """Read every row of the ``user_full_dataset`` table into a DataFrame.

    The query is executed through the module-level ``engine``.

    Returns:
        The DataFrame produced by ``pd.read_sql`` for the full-table query.
    """
    return pd.read_sql("SELECT * FROM user_full_dataset;", engine)

# ✅ Data Preprocessing
def preprocess_data(df):
    """Clean, label-encode and standardise the dataset for model training.

    The frame is modified in place and the same object is returned.

    Steps, in order:
      1. Drop credential / contact columns if they are present.
      2. Replace every missing value with 0.
      3. Label-encode the known categorical columns that are present.
      4. Standardise the known numeric columns with a single StandardScaler.

    Args:
        df: DataFrame to preprocess. It must contain all of the numeric
            columns listed in ``numeric_cols`` below.

    Returns:
        A tuple ``(df, label_encoders, scaler)`` where ``label_encoders`` maps
        each encoded column name to its fitted ``LabelEncoder`` and ``scaler``
        is the ``StandardScaler`` fitted on the numeric columns.

    Raises:
        KeyError: if any of the numeric columns is missing from ``df``.
    """
    # Drop unnecessary columns; errors='ignore' already skips absent ones, so
    # no manual membership filter is needed.
    drop_columns = ['user_password', 'user_email', 'order_customer_email', 'order_customer_name']
    df.drop(columns=drop_columns, inplace=True, errors='ignore')

    # Handling missing values
    df.fillna(0, inplace=True)

    # Encoding categorical features
    categorical_cols = ['cart_product_type', 'order_product_type', 'order_payment_method', 'order_status']
    label_encoders = {}

    for col in categorical_cols:
        if col not in df.columns:
            continue
        values = df[col]
        if values.dtype == object:
            # fillna(0) above leaves integer 0 next to string labels wherever a
            # value was missing; LabelEncoder rejects mixed str/int input, so
            # normalise to strings (a missing value becomes the class "0").
            values = values.astype(str)
        le = LabelEncoder()
        df[col] = le.fit_transform(values)
        label_encoders[col] = le

    # Feature scaling
    # NOTE(review): the scaler is fitted on whatever rows are passed in; when
    # this runs before a train/test split the test rows influence the scaling.
    scaler = StandardScaler()
    numeric_cols = ['cart_quantity', 'order_quantity', 'order_total_price', 'login_count', 'total_time_spent']
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df, label_encoders, scaler

# ✅ Feature Engineering: Creating Churn Label
def create_churn_label(df):
    """Add a binary ``churn`` column to ``df`` and return the same frame.

    A row gets ``churn = 1`` when its ``order_id`` is not null and its
    ``login_count`` equals 0; every other row gets ``churn = 0``.

    Args:
        df: DataFrame with ``order_id`` and ``login_count`` columns. It is
            modified in place.

    Returns:
        The input DataFrame, now carrying the ``churn`` column.
    """
    has_order = df['order_id'].notna()
    no_logins = df['login_count'].eq(0)
    df['churn'] = np.where(has_order & no_logins, 1, 0)
    return df

# ✅ Load, label, then preprocess data
df = load_data()
# The churn label must be derived from the RAW values: preprocess_data()
# replaces missing values with 0 (so order_id would never be null afterwards)
# and standardises login_count (so a raw count of 0 would no longer equal 0).
# preprocess_data() does not drop, encode or scale the 'churn' column.
df = create_churn_label(df)
df, label_encoders, scaler = preprocess_data(df)

# ✅ Define features and target
X = df.drop(columns=['user_id', 'churn'], errors='ignore')  # Exclude the identifier and the target
y = df['churn']

# ✅ Split dataset (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Train Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg_preds = log_reg.predict(X_test)
log_reg_acc = accuracy_score(y_test, log_reg_preds)

# ✅ Train Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_preds)

# ✅ Train XGBoost Model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_preds)

# ✅ Select the Best Model
# Pick the model, its accuracy and its test predictions together so they can
# never disagree; on ties max() keeps the earliest candidate in the list.
candidates = [
    (log_reg, log_reg_acc, log_reg_preds),
    (rf, rf_acc, rf_preds),
    (xgb, xgb_acc, xgb_preds),
]
best_model, best_model_acc, best_preds = max(candidates, key=lambda candidate: candidate[1])
print(f"Best Model: {best_model.__class__.__name__} with Accuracy: {best_model_acc}")

# ✅ Evaluation Metrics (reuse the predictions already computed on X_test)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, best_preds))
print("\nClassification Report:\n", classification_report(y_test, best_preds))

# ✅ Save the best model together with the fitted encoders and scaler, which
# are needed to preprocess new data the same way
joblib.dump(best_model, "customer_churn_model.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(scaler, "scaler.pkl")

print("✅ Model saved successfully!")
