In [1]:
import pandas as pd
import numpy as np

In [2]:
# here df we will be using which is already encoded during explaining EDA and feature engineering part , in EDA file
df=pd.read_csv('train_df.csv')
df.head(1)

Unnamed: 0,item_no,category,main_promotion,color,stars,success_indicator
0,739157,5,0,3,1.0,0


In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # as distance based algorithem is there
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier

In [4]:
import warnings
warnings.filterwarnings('ignore')  #to do away with unwanted or unuseful warnings which makes code lengthy

#Model_selection_Pipeline

In [5]:
data = df
x=df.drop('success_indicator',axis=1)
y=df['success_indicator']


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=77)

pipeline = Pipeline([('scaler', StandardScaler()),('classifier', None) ])

# Adding models to the param_grids list so to compare their accuracy with same data but different hyperparamere
param_grids = [
    {'classifier': [LogisticRegression()],
     'classifier__C': [0.1, 1, 10]},
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [100, 200, 300]},  #number of base model used to predict output
    {'classifier': [MLPClassifier()],
     'classifier__hidden_layer_sizes': [(100,), (50, 50), (25, 25, 25)],
     'classifier__activation': ['relu', 'tanh'],  # these will be our activation function
     'classifier__solver': ['adam']}              # adam will be optimiazer for ANN
]

best_model = None
best_score = 0

for entity in param_grids:
    gs = GridSearchCV(pipeline, entity, cv=5, scoring='accuracy')  # cross validation in 5 , while measure of comparison
    gs.fit(x_train, y_train)                                       # is acccuracy , we may use random search cv also if
                                                                    # data is more

    y_pred = gs.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Best Parameters:", gs.best_params_)
    print("Best Score:", gs.best_score_)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("***************************************************")

    if gs.best_score_ > best_score:
        best_model = gs.best_estimator_
        best_score = gs.best_score_

print("Best Model:", best_model)
print("Best Score:", best_score)

Best Parameters: {'classifier': LogisticRegression(C=1), 'classifier__C': 1}
Best Score: 0.7753150758549568
Accuracy: 0.7733847637415622
Precision: 0.7406113537117904
Recall: 0.8305582761998042
F1 Score: 0.7830101569713759
***************************************************
Best Parameters: {'classifier': RandomForestClassifier(n_estimators=200), 'classifier__n_estimators': 200}
Best Score: 0.8040008860033261
Accuracy: 0.8032786885245902
Precision: 0.7996089931573802
Recall: 0.801175318315377
F1 Score: 0.8003913894324852
***************************************************
Best Parameters: {'classifier': MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50)), 'classifier__activation': 'tanh', 'classifier__hidden_layer_sizes': (50, 50), 'classifier__solver': 'adam'}
Best Score: 0.8279900796676761
Accuracy: 0.8283510125361621
Precision: 0.7878787878787878
Recall: 0.89128305582762
F1 Score: 0.8363970588235293
***************************************************
Best Model: Pipeline(s

#ANN will be the model which we will be choosing

##Reasons

Accuracy perspective : Among majority of the models which we have used for modelling the data we getting high accuracy on ANN.

The MLP Classifier (Neural Network) outperformed both Logistic Regression and Random Forest classifiers in terms of accuracy, precision, recall, and F1 score.

The MLP Classifier achieved the highest F1 score (0.8364) and recall (0.8913), indicating a good balance between precision and recall.

Random Forest Classifier performed well but slightly lower than the MLP Classifier in terms of accuracy and F1 score.
Logistic Regression Classifier had the lowest performance among the three models, with the lowest accuracy, precision, recall, and F1 score.

Overall, the MLP Classifier with a tanh activation function and hidden layer sizes (50, 50) is recommended as the best model for the classification task.
