In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
import time
import pickle


In [3]:
# Display the installed scikit-learn version alongside the results.
import sklearn
sklearn.__version__

'1.4.2'

In [4]:
# Display the installed pandas version.
pd.__version__

'2.2.2'

In [5]:
# Display the installed numpy version.
np.__version__

'1.26.4'

In [2]:
# Load the dataset from a CSV file (relative path) into a DataFrame.
final_df=pd.read_csv('../project_1_dataset/filename.csv')

In [3]:
# Number of rows in the loaded frame.
len(final_df)

9730639

In [4]:
# Preview the first rows of the loaded frame.
final_df.head()

Unnamed: 0,product_id,product_name,order_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,days_since_last_purchase,user_order_count,total_items_ordered,product_popularity,avg_order_hour,order_frequency,department_affinity,preferred_order_dow,avg_reorder_rate,user_product_reorder_rate
0,25773,#2 Coffee Filters,2434116,2,0,64923,6,2,13,7.0,0.0,174,45,777,13.5,3,0.114943,3.522989,0.706897,0.666667
1,25773,#2 Coffee Filters,2314839,1,1,195887,22,6,10,6.0,7.0,373,105,777,12.801609,3,0.058981,2.640751,0.710456,0.666667
2,25773,#2 Coffee Filters,2632180,17,0,138985,3,3,14,7.0,0.0,414,300,777,11.838164,2,0.024155,2.717391,0.623188,0.5
3,25773,#2 Coffee Filters,541991,13,0,95635,2,6,7,18.0,0.0,378,351,777,11.828042,1,0.018519,2.399471,0.484127,0.0
4,25773,#2 Coffee Filters,2520810,4,0,180592,9,2,14,15.0,0.0,88,10,777,9.965909,1,0.079545,2.204545,0.602273,0.0


### Label encode

In [5]:
# Encoder used to turn the product_name strings into integer codes.
le=LabelEncoder()

In [6]:
# Feature matrix: every column of final_df except the 'reordered' target.
# NOTE(review): this keeps 'Unnamed: 0' (looks like a saved row index) and the
# identifier columns (product_id, order_id, user_id) as model features —
# confirm that is intended before relying on the fitted models.
x=final_df.drop(columns=['reordered'])

In [7]:
# Target vector: the 'reordered' column.
y=final_df['reordered']

In [8]:
# Replace the product_name strings with the integer codes produced by the encoder.
# NOTE(review): the codes are treated as an ordinary numeric feature by the
# models below, and product_id is also kept in x — confirm both are needed.
x['product_name']=le.fit_transform(x['product_name'])

In [9]:
# Preview the feature frame after encoding product_name.
x.head()

Unnamed: 0,product_id,product_name,order_id,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,days_since_last_purchase,user_order_count,total_items_ordered,product_popularity,avg_order_hour,order_frequency,department_affinity,preferred_order_dow,avg_reorder_rate,user_product_reorder_rate
0,25773,0,2434116,2,64923,6,2,13,7.0,0.0,174,45,777,13.5,3,0.114943,3.522989,0.706897,0.666667
1,25773,0,2314839,1,195887,22,6,10,6.0,7.0,373,105,777,12.801609,3,0.058981,2.640751,0.710456,0.666667
2,25773,0,2632180,17,138985,3,3,14,7.0,0.0,414,300,777,11.838164,2,0.024155,2.717391,0.623188,0.5
3,25773,0,541991,13,95635,2,6,7,18.0,0.0,378,351,777,11.828042,1,0.018519,2.399471,0.484127,0.0
4,25773,0,2520810,4,180592,9,2,14,15.0,0.0,88,10,777,9.965909,1,0.079545,2.204545,0.602273,0.0


In [None]:
def model_selection(X, y):
    """
    Fit a fixed set of baseline classifiers and pick the best one by ROC-AUC.

    The data is split 80/20 (random_state=42). Every candidate is fitted on the
    training part, its hold-out metrics are printed, and the candidate with the
    highest hold-out ROC-AUC is returned.

    Parameters:
    X (array-like): Feature matrix.
    y (array-like): Binary target vector (precision/recall/F1 use their default
                    binary averaging, and column 1 of predict_proba is taken as
                    the positive-class probability).

    Returns:
    best_model (model): The fitted estimator with the highest hold-out ROC-AUC.
    best_model_name (str): Its display name from the candidate dictionary.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'Logistic Regression (L2 Regularization)': LogisticRegression(max_iter=1000, solver='liblinear', penalty='l2'),
        'Logistic Regression (L1 Regularization)': LogisticRegression(max_iter=1000, solver='liblinear', penalty='l1'),
        'Logistic Regression (ElasticNet Regularization)': LogisticRegression(max_iter=1000, solver='saga', penalty='elasticnet', l1_ratio=0.5),
        'Naive Bayes': GaussianNB(),
        'Decision Stump': DecisionTreeClassifier(max_depth=1),
        'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
        'SGD Classifier (Log Loss)': SGDClassifier(loss='log_loss', max_iter=1000),
        'SGD Classifier (Hinge Loss)': SGDClassifier(loss='hinge', max_iter=1000),
        'Random Forest (Shallow Depth)': RandomForestClassifier(n_estimators=10, max_depth=3),
        'Ridge Classifier': RidgeClassifier(),
        'Linear SVC': LinearSVC(max_iter=1000),
        'QDA': QuadraticDiscriminantAnalysis(),
    }

    # Track the best candidate; start below any reachable ROC-AUC so that a
    # model is always selected.
    best_model = None
    best_model_name = ""
    best_roc_auc = -np.inf

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Margin-based models (hinge-loss SGD, RidgeClassifier, LinearSVC) do
        # not expose predict_proba. ROC-AUC only needs a ranking score, so use
        # decision_function for them instead of raising AttributeError.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = model.decision_function(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_score)
        f1_score_value = f1_score(y_test, y_pred)
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"ROC-AUC: {roc_auc:.4f}")
        print(f"f1_score_value: {f1_score_value:.4f}")
        print("-" * 40)

        # Select the best model based on ROC-AUC
        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            best_model = model
            best_model_name = model_name

    print(f"\nBest Model: {best_model_name} with ROC-AUC of {best_roc_auc:.4f}")

    return best_model, best_model_name


# Run the hold-out comparison on the full feature matrix and target.
best_model, best_model_name = model_selection(x, y)


In [None]:

def model_selection(X, y, models, scoring='accuracy', k=10, sample_fraction=0.3):
    """
    Perform model selection on a stratified sample of the dataset using k-fold cross-validation.

    Parameters:
    X (array-like): Feature matrix.
    y (array-like): Target vector.
    models (dict): Dictionary of models to evaluate, keyed by display name.
                   Example: {'Logistic Regression': LogisticRegression(), 'Random Forest': RandomForestClassifier()}
    scoring (str): Scoring metric for evaluation. Default is 'accuracy'.
    k (int): Number of folds for stratified cross-validation. Default is 10.
    sample_fraction (float): Fraction of the dataset to sample, in (0, 1].
                             Default is 0.3 (30% sample); 1.0 evaluates on the
                             whole dataset without sampling.

    Returns:
    best_model (model): The estimator object from `models` with the highest
                        average k-fold score. cross_val_score fits clones, so
                        this object itself is returned unfitted.
    best_model_name (str): The name of the best performing model.

    Raises:
    ValueError: If sample_fraction is outside (0, 1].
    """
    if not 0 < sample_fraction <= 1:
        raise ValueError(f"sample_fraction must be in (0, 1], got {sample_fraction!r}")

    if sample_fraction < 1:
        # Keep only the "train" side of a stratified split as the working sample.
        X_sample, _, y_sample, _ = train_test_split(
            X, y, test_size=1 - sample_fraction, stratify=y, random_state=42
        )
    else:
        # train_test_split rejects test_size=0, so use the full data directly.
        X_sample, y_sample = X, y

    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    best_model = None
    best_model_name = None
    best_score = -np.inf

    for model_name, model in models.items():
        print(f"Evaluating {model_name}...")

        # perf_counter is monotonic, so the elapsed time cannot be distorted by
        # wall-clock adjustments during long fits.
        start_time = time.perf_counter()
        cv_scores = cross_val_score(model, X_sample, y_sample, cv=cv, scoring=scoring)
        avg_score = np.mean(cv_scores)  # Average score across all folds
        elapsed = time.perf_counter() - start_time

        print(f"{model_name}: Average {scoring} = {avg_score:.4f} (Time taken: {elapsed:.2f} seconds)")
        if avg_score > best_score:
            best_score = avg_score
            best_model = model
            best_model_name = model_name

    print(f"\nBest Model: {best_model_name} with Average {scoring}: {best_score:.4f}")

    return best_model, best_model_name


# Candidate estimators to compare, keyed by the display name used in the printed report.
models = {
    'Logistic Regression (L2 Regularization)': LogisticRegression(max_iter=1000, solver='liblinear', penalty='l2'),
    'Logistic Regression (L1 Regularization)': LogisticRegression(max_iter=1000, solver='liblinear', penalty='l1'),
    'Logistic Regression (ElasticNet Regularization)': LogisticRegression(max_iter=1000, solver='saga', penalty='elasticnet', l1_ratio=0.5),
    'Naive Bayes': GaussianNB(),
    'Decision Stump': DecisionTreeClassifier(max_depth=1),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'SGD Classifier (Log Loss)': SGDClassifier(loss='log_loss', max_iter=1000),
    'SGD Classifier (Hinge Loss)': SGDClassifier(loss='hinge', max_iter=1000),
    'Random Forest (Shallow Depth)': RandomForestClassifier(n_estimators=10, max_depth=3),
    'Ridge Classifier': RidgeClassifier(),
    'Linear SVC': LinearSVC(max_iter=1000),
    'QDA': QuadraticDiscriminantAnalysis(),
}
# Compare the candidates by accuracy with 5 folds; sample_fraction is left at its default.
best_model, best_model_name = model_selection(x, y, models, scoring='accuracy', k=5)



Evaluating Logistic Regression (L2 Regularization)...
Logistic Regression (L2 Regularization): Average accuracy = 0.6568 (Time taken: 57.69 seconds)
Evaluating Logistic Regression (L1 Regularization)...
Logistic Regression (L1 Regularization): Average accuracy = 0.8591 (Time taken: 154.58 seconds)
Evaluating Logistic Regression (ElasticNet Regularization)...




## Taking the models with threshold value 0.8 and finalising the model based on 40% of the data

In [14]:
def model_final(X, y, sample_fraction=0.4):
    """
    Compare the shortlisted classifiers on a stratified sample and pick the best by F1.

    A stratified `sample_fraction` share of the data is drawn, split 80/20 into
    train and hold-out parts, and every shortlisted model is fitted and scored
    on it. Hold-out metrics are printed for each model.

    Parameters:
    X (array-like): Feature matrix.
    y (array-like): Binary target vector.
    sample_fraction (float): Fraction of the dataset to sample. Default is 0.4.

    Returns:
    best_model (model): The fitted estimator with the highest hold-out F1 score.
    best_model_name (str): Its display name.
    """
    X_sample, _, y_sample, _ = train_test_split(X, y, test_size=1-sample_fraction, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    models = {
        'Logistic Regression (L1 Regularization)': LogisticRegression(max_iter=1000, solver='liblinear', penalty='l1'),
        'Decision Stump': DecisionTreeClassifier(max_depth=1),
        'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
        # Seeded so the comparison (and therefore the winner) is reproducible
        # between runs; the splits above are already seeded.
        'Random Forest (Shallow Depth)': RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42),
        'QDA': QuadraticDiscriminantAnalysis(),
    }

    # Track the best candidate; start below any reachable F1 so that a model is
    # always selected, even if every F1 score is 0.
    best_model = None
    best_model_name = ""
    best_f1_score_value = -np.inf

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)
        f1_score_value = f1_score(y_test, y_pred)
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"ROC-AUC: {roc_auc:.4f}")
        print(f"f1_score_value: {f1_score_value:.4f}")
        print("-" * 40)

        # Select the best model based on F1 score (ROC-AUC is reported only).
        if f1_score_value > best_f1_score_value:
            best_f1_score_value = f1_score_value
            best_model = model
            best_model_name = model_name

    print(f"\nBest Model: {best_model_name} with F1 score of {best_f1_score_value:.4f}")

    return best_model, best_model_name


# Compare the shortlisted models using the default sample_fraction.
best_model, best_model_name = model_final(x, y)


Model: Logistic Regression (L1 Regularization)
Accuracy: 0.8594
Precision: 0.8539
Recall: 0.9188
ROC-AUC: 0.9364
f1_score_value: 0.8852
----------------------------------------
Model: Decision Stump
Accuracy: 0.8358
Precision: 0.7822
Recall: 1.0000
ROC-AUC: 0.7999
f1_score_value: 0.8778
----------------------------------------
Model: Linear Discriminant Analysis
Accuracy: 0.8526
Precision: 0.8257
Recall: 0.9507
ROC-AUC: 0.9328
f1_score_value: 0.8838
----------------------------------------
Model: Random Forest (Shallow Depth)
Accuracy: 0.8726
Precision: 0.8252
Recall: 0.9946
ROC-AUC: 0.9239
f1_score_value: 0.9020
----------------------------------------
Model: QDA
Accuracy: 0.8242
Precision: 0.8823
Recall: 0.8100
ROC-AUC: 0.9135
f1_score_value: 0.8446
----------------------------------------

Best Model: Random Forest (Shallow Depth) with F1 score of 0.9020


In [16]:


# Split first so the held-out rows never influence preprocessing. The split
# arguments are unchanged, so the same rows land in each part as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on the full dataset would leak test-set statistics
# into the features the model is trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

rf_model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
rf_model.fit(X_train, y_train)

# Hold-out predictions and metrics.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])

print("Random Forest (Shallow Depth) Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Importances are reported by position; index i corresponds to x.columns[i]
# because the scaled arrays keep the column order of x.
feature_importance = rf_model.feature_importances_
print("\nFeature Importance:")
for i, v in enumerate(feature_importance):
    print(f"Feature {i}: {v:.4f}")


Random Forest (Shallow Depth) Model Performance:
Accuracy: 0.8749
Precision: 0.8333
Recall: 0.9848
F1-Score: 0.9028
ROC-AUC Score: 0.9262

Confusion Matrix:
[[ 572848  225955]
 [  17468 1129857]]

Feature Importance:
Feature 0: 0.0000
Feature 1: 0.0001
Feature 2: 0.0000
Feature 3: 0.0009
Feature 4: 0.0000
Feature 5: 0.1141
Feature 6: 0.0000
Feature 7: 0.0000
Feature 8: 0.0811
Feature 9: 0.1529
Feature 10: 0.0272
Feature 11: 0.0000
Feature 12: 0.0183
Feature 13: 0.0000
Feature 14: 0.3254
Feature 15: 0.0004
Feature 16: 0.0005
Feature 17: 0.0105
Feature 18: 0.2686


In [20]:


def model_final(X, y, sample_fraction=0.4):
    """
    Fit the final shortlist on an 80/20 split of the given data, pick the best by F1 and save it.

    Every shortlisted model is fitted on the training part, its hold-out
    metrics are printed, and the model with the highest hold-out F1 score is
    pickled to 'best_model.pkl' in the current working directory.

    Parameters:
    X (array-like): Feature matrix.
    y (array-like): Binary target vector.
    sample_fraction (float): Accepted for signature compatibility but not used;
                             the split is taken over all of X and y.
                             NOTE(review): confirm whether sub-sampling was
                             intended here.

    Returns:
    best_model (model): The fitted estimator with the highest hold-out F1 score.
    best_model_name (str): Its display name.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'Logistic Regression (L1 Regularization)': LogisticRegression(max_iter=1000, solver='liblinear', penalty='l1'),
        'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }

    # Start below any reachable F1 so that a model is always selected and the
    # pickle below never ends up containing None.
    best_model = None
    best_model_name = ""
    best_f1_score_value = -np.inf

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)
        f1_score_value = f1_score(y_test, y_pred)

        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"ROC-AUC: {roc_auc:.4f}")
        print(f"F1-Score: {f1_score_value:.4f}")
        print("-" * 40)

        # Selection is by F1; the other metrics are reported only.
        if f1_score_value > best_f1_score_value:
            best_f1_score_value = f1_score_value
            best_model = model
            best_model_name = model_name

    print(f"\nBest Model: {best_model_name} with F1 score of {best_f1_score_value:.4f}")

    with open('best_model.pkl', 'wb') as file:
        pickle.dump(best_model, file)
    # Report success only after the file has been written and closed.
    print("The best model has been saved as 'best_model.pkl'.")

    return best_model, best_model_name

# Fit the final shortlist and save the winner to 'best_model.pkl'.
best_model, best_model_name = model_final(x, y)


Model: Logistic Regression (L1 Regularization)
Accuracy: 0.8594
Precision: 0.8537
Recall: 0.9189
ROC-AUC: 0.9365
F1-Score: 0.8851
----------------------------------------
Model: Linear Discriminant Analysis
Accuracy: 0.8522
Precision: 0.8253
Recall: 0.9506
ROC-AUC: 0.9328
F1-Score: 0.8835
----------------------------------------
Model: QDA
Accuracy: 0.8242
Precision: 0.8824
Recall: 0.8097
ROC-AUC: 0.9135
F1-Score: 0.8445
----------------------------------------

Best Model: Logistic Regression (L1 Regularization) with F1 score of 0.8851
The best model has been saved as 'best_model.pkl'.
