In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load dataset
df = pd.read_csv("email_data_scaled_standard.csv")

# Define target and features
target = "clicked"
drop_cols = ["clicked", "opened", "email_id", "user_past_purchases"]
X = df.drop(columns=drop_cols)
y = df[target]

# Feature groups
numeric_features = ["hour", "user_past_purchases_scaled"]
categorical_features = ["email_text", "email_version", "weekday", "user_country"]
# Time of day features already in binary format
passthrough_features = ["is_morning", "is_afternoon", "is_evening", "is_night"]

# Column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
    ],
    remainder="passthrough"  # Keeps time features as-is
)

# Pipeline with KNN
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("knn", KNeighborsClassifier())
])

# Hyperparameter grid for optimization
param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9],
    "knn__weights": ["uniform", "distance"]
}

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Grid search with 5-fold CV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Results
print("✅ Best Parameters:", grid_search.best_params_)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))


✅ Best Parameters: {'knn__n_neighbors': 9, 'knn__weights': 'uniform'}
✅ Accuracy: 0.97735

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     19547
           1       0.00      0.00      0.00       453

    accuracy                           0.98     20000
   macro avg       0.49      0.50      0.49     20000
weighted avg       0.96      0.98      0.97     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
def predict_user_click(email_text, email_version, hour, weekday, user_country,
                       user_past_purchases_scaled, is_morning, is_afternoon, is_evening, is_night):
    """
    Predicts whether a user will click the email link based on input features.
    """
    # Create input DataFrame
    user_input = pd.DataFrame([{
        "email_text": email_text,
        "email_version": email_version,
        "hour": hour,
        "weekday": weekday,
        "user_country": user_country,
        "user_past_purchases_scaled": user_past_purchases_scaled,
        "is_morning": is_morning,
        "is_afternoon": is_afternoon,
        "is_evening": is_evening,
        "is_night": is_night
    }])
    
    # Predict using the trained model
    prediction = best_model.predict(user_input)[0]
    
    # Output result
    return "✅ Click" if prediction == 1 else "❌ No Click"


In [15]:
predict_user_click(
    email_text="long_email",
    email_version="personalized",
    hour=23,
    weekday="Tuesday",
    user_country="US",
    user_past_purchases_scaled=0.7,
    is_morning=1,
    is_afternoon=0,
    is_evening=0,
    is_night=0
)


'❌ No Click'

In [24]:
predict_user_click(
    email_text="short_email",              # better engagement from short content
    email_version="personalized",         # personalized worked well
    hour=9,                                # strong morning performance
    weekday="Tuesday",                    # acceptable day
    user_country="US",                    # high engagement segment
    user_past_purchases_scaled=0.8,       # strong past purchase history
    is_morning=1,
    is_afternoon=0,
    is_evening=0,
    is_night=0
)


'❌ No Click'

In [25]:
predict_user_click(
    email_text="long_email",              # longer content performs better here
    email_version="personalized",         # personalized still optimal
    hour=10,                               # best hour in the morning range
    weekday="Tuesday",                    # strong midweek performance
    user_country="US",                    # high engagement segment
    user_past_purchases_scaled=0.8,       # strong past purchase history
    is_morning=1,
    is_afternoon=0,
    is_evening=0,
    is_night=0
)


'❌ No Click'