In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline
df=pd.read_csv("Travel.csv")
df.head()
#DATA CLEANING
''' 
Handling Missing Values
Handling Duplicates
Checking Datatype
Understand the Dataset
'''
df.isnull().sum()
#Check all the categories
df["Gender"].value_counts()
df["MaritalStatus"].value_counts()
#Replace fe male with Feamale
df["Gender"]=df["Gender"].replace("Fe Male","Female")
df["MaritalStatus"]=df["MaritalStatus"].replace("Single","Unmarried")
df.head()
df["Gender"].value_counts()

df["MaritalStatus"].value_counts()
#Checking the features with nan Values
features_with_na=[features for features in df.columns if df[features].isnull().sum()>=1]
features_with_na
df["Age"].isnull().sum()
df["TypeofContact"].isnull().sum()
df["Age"].isnull().mean()
#checking what percentage of null values in each column
for feature in features_with_na:
    print(feature,np.round(df[feature].isnull().mean()*100,5),"% missing values ")
#Statistics on numerical columns (Null cols)
#df[features_with_na].describe()
df[features_with_na].select_dtypes(exclude='object').describe()
#IMPUTUTING NULL VALUES
# we use median and mmode imputation techniques to fill the null values
features_with_na
df["TypeofContact"].mode()[0]
#AGE median
df["Age"].fillna(df["Age"].median(),inplace=True)

#TYPE OF CONTRACT mode 
df["TypeofContact"].fillna(df["TypeofContact"].mode()[0],inplace=True)

#DurationOfPitch median
df["DurationOfPitch"].fillna(df["DurationOfPitch"].median(),inplace=True)

#NumberOfFollowups mode 
df["NumberOfFollowups"].fillna(df["NumberOfFollowups"].mode()[0],inplace=True)

#Preferred Property Star mode
df["PreferredPropertyStar"].fillna(df["PreferredPropertyStar"].mode()[0],inplace=True)

#Number of Trips median
df["NumberOfTrips"].fillna(df["NumberOfTrips"].median(),inplace=True)

#Number of childredn Visiting mode
df["NumberOfChildrenVisiting"].fillna(df["NumberOfChildrenVisiting"].mode()[0],inplace=True)

#monthly income median
df["MonthlyIncome"].fillna(df["MonthlyIncome"].median(),inplace=True)




df.head()
#HERE we can see completely filled the null values
df.isnull().sum()
#CustomerId is not important,so we are going to remove that feature
df.drop('CustomerID',inplace=True,axis=1)
#One column is deleted
len(df.columns)
#FEATURE ENGINEERING
#Creating one new feature with total visiting by combining numberOfpersonsVisiting+numberOfChildrens Visiting
df["totalVisitings"]=df["NumberOfPersonVisiting"]+df["NumberOfChildrenVisiting"]
df["totalVisitings"]
df.drop(columns=["NumberOfPersonVisiting",'NumberOfChildrenVisiting'],inplace=True,axis=1)
len(df.columns)
df.head()
df.info()
#Get all the numeric Features
num_features=[feature for feature in df.columns if df[feature].dtype!="O"]
print(f"Number of Numerical features:{len(num_features)}")
#Get all categorical Features
categorical_features = [feature for feature in df.columns if df[feature].dtype=="O"]
print(f"Number of Categorical features are:{len(categorical_features)}")
#Discreate Features
discreate_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print(f"Number of discreate features are:{len(discreate_features)}")
#Continous features
continous_features=[feature for feature in num_features if feature not in discreate_features]
print(f"Number of Continous features are:{len(continous_features)}")
#from numerical features we get both discreate and continous features
#Train Test Split and Model Training
from sklearn.model_selection import train_test_split
X=df.drop(["ProdTaken"],axis=1)
y=df["ProdTaken"]
X.head()
y.head()
y.value_counts()
#train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape
X.info()
#Create Column transformers with 3 types of Transformers
cat_features=X.select_dtypes(include="object").columns 
num_features=X.select_dtypes(exclude="object").columns


from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder(drop="first")

preprocessor=ColumnTransformer(
    transformers=[
        ("oneHotEncoder",oh_transformer,categorical_features),
        ("StandardScaler",numeric_transformer,num_features)
    ]
)
#For example(WHY drop=TRUE):

#If a feature has three categories: A, B, and C, instead of creating three columns (A, B, C), the encoder will create two columns (B, C) and drop the first (A).
#If the row corresponds to category A, both B and C columns will be 0.
'''Without drop="first" (one-hot encoding):
The encoder creates one column for each category.

Red	  Green	Blue
1	  0	    0
0	  1	    0
0	  0	    1
1	  0	    0
With drop="first":
The encoder drops the first category (Red), leaving just two columns (Green and Blue).

Green	Blue
0	    0
1	    0
0	    1
0	    0
By dropping the first column, 
the information about the first category (Red) 
is still retained—when both Green and Blue are 0, 
the category must be Red. This helps reduce the number 
of columns and avoids redundancy.
'''






preprocessor
#AApplying transformation on training dataset
#Here we can see one hot encoding on categorical fetures and standara scaling in all numerical features(to scale down the values)
X_train=preprocessor.fit_transform(X_train)
X_train
pd.DataFrame(X_train)
#applying transformation on test data(transform)
X_test=preprocessor.transform(X_test)
X_test
#Random Forest Classifier Training
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,roc_auc_score,roc_curve
#Efficient way of training the model
models={
    "Random forest":RandomForestClassifier()
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train) #Model train


    #Make Predictions    
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    #Training set Perfomance
    train_accuracy=accuracy_score(y_train,y_train_pred)
    train_f1score=f1_score(y_train,y_train_pred)
    train_precision=precision_score(y_train,y_train_pred)
    train_recall=recall_score(y_train,y_train_pred)
    train_roc_auc_score=roc_auc_score(y_train,y_train_pred)


    #Test set Perfomance
    test_accuracy=accuracy_score(y_test,y_test_pred)
    test_f1score=f1_score(y_test,y_test_pred)
    test_precision=precision_score(y_test,y_test_pred)
    test_recall=recall_score(y_test,y_test_pred)
    test_roc_auc_score=roc_auc_score(y_test,y_test_pred)

    print(list(models.keys())[i])
    print("**********")
    print("Model Perfomance for Training Set")
    print("Accuracy:{:.4f}".format(train_accuracy))
    print("F1 Score:{:.4f}".format(train_f1score))
    print("Precision:{:.4f}".format(train_precision))
    print("Recall:{:.4f}".format(train_recall))
    print("ROC AUC Score:{:.4f}".format(train_roc_auc_score))



    print("----------------------------------------")
    
    
    print("Model Perfomance for Test Set")
    print("Accuracy:{:.4f}".format(test_accuracy))
    print("F1 Score:{:.4f}".format(test_f1score))
    print("Precision:{:.4f}".format(test_precision))
    print("Recall:{:.4f}".format(test_recall))
    print("ROC AUC Score:{:.4f}".format(test_roc_auc_score))


#Here efficiently we can add many algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,roc_auc_score,roc_curve
models={
    "Random forest":RandomForestClassifier(),
    "decision tree":DecisionTreeClassifier(),
    "logistic regression":LogisticRegression(),
    "Gradient Boosting":GradientBoostingClassifier(),
    "AdaBoost":AdaBoostClassifier(),
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train) #Model train


    #Make Predictions    
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    #Training set Perfomance
    train_accuracy=accuracy_score(y_train,y_train_pred)
    train_f1score=f1_score(y_train,y_train_pred)
    train_precision=precision_score(y_train,y_train_pred)
    train_recall=recall_score(y_train,y_train_pred)
    train_roc_auc_score=roc_auc_score(y_train,y_train_pred)


    #Test set Perfomance
    test_accuracy=accuracy_score(y_test,y_test_pred)
    test_f1score=f1_score(y_test,y_test_pred)
    test_precision=precision_score(y_test,y_test_pred)
    test_recall=recall_score(y_test,y_test_pred)
    test_roc_auc_score=roc_auc_score(y_test,y_test_pred)

    print(list(models.keys())[i])
    print("**********")
    print("Model Perfomance for Training Set")
    print("Accuracy:{:.4f}".format(train_accuracy))
    print("F1 Score:{:.4f}".format(train_f1score))
    print("Precision:{:.4f}".format(train_precision))
    print("Recall:{:.4f}".format(train_recall))
    print("ROC AUC Score:{:.4f}".format(train_roc_auc_score))



    print("----------------------------------------")
    
    
    print("Model Perfomance for Test Set")
    print("Accuracy:{:.4f}".format(test_accuracy))
    print("F1 Score:{:.4f}".format(test_f1score))
    print("Precision:{:.4f}".format(test_precision))
    print("Recall:{:.4f}".format(test_recall))
    print("ROC AUC Score:{:.4f}".format(test_roc_auc_score))
    
    print("================================================")



Age 4.62357 % missing values 
TypeofContact 0.51146 % missing values 
DurationOfPitch 5.13502 % missing values 
NumberOfFollowups 0.92062 % missing values 
PreferredPropertyStar 0.53191 % missing values 
NumberOfTrips 2.86416 % missing values 
NumberOfChildrenVisiting 1.35025 % missing values 
MonthlyIncome 4.76678 % missing values 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ProdTaken               4888 non-null   int64  
 1   Age                     4888 non-null   float64
 2   TypeofContact           4888 non-null   object 
 3   CityTier                4888 non-null   int64  
 4   DurationOfPitch         4888 non-null   float64
 5   Occupation              4888 non-null   object 
 6   Gender                  4888 non-null   object 
 7   NumberOfFollowups       4888 non-null   float64
 8   ProductPitched         

In [5]:
adaboost_params={
    "n_estimators":[50,60,70,80,90],
    "algorithm":["SAMME","SAMME.R"]
}
adaboost_params

{'n_estimators': [50, 60, 70, 80, 90], 'algorithm': ['SAMME', 'SAMME.R']}

In [8]:
randomcv_model=[
    ("AB",AdaBoostClassifier(),adaboost_params)
]
randomcv_model

[('AB',
  AdaBoostClassifier(),
  {'n_estimators': [50, 60, 70, 80, 90], 'algorithm': ['SAMME', 'SAMME.R']})]

In [9]:
from sklearn.model_selection import RandomizedSearchCV
model_param={}
for name,model,params in randomcv_model:
    random=RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=100,cv=3,verbose=2,n_jobs=-1)

    random.fit(X_train,y_train)
    model_param[name]=random.best_params_

for model_name in model_param:
    print(f"-----Best Params for {model_name}--------")
    print(model_param[model_name]) 

Fitting 3 folds for each of 10 candidates, totalling 30 fits
-----Best Params for AB--------
{'n_estimators': 80, 'algorithm': 'SAMME'}


In [10]:
models={
    "Random forest":RandomForestClassifier(n_estimators=1000,min_samples_split= 2,max_features= 8, max_depth=None),
    "adaboost":AdaBoostClassifier(n_estimators=80,algorithm="SAMME")
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train) #Model train


    #Make Predictions    
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    #Training set Perfomance
    train_accuracy=accuracy_score(y_train,y_train_pred)
    train_f1score=f1_score(y_train,y_train_pred)
    train_precision=precision_score(y_train,y_train_pred)
    train_recall=recall_score(y_train,y_train_pred)
    train_roc_auc_score=roc_auc_score(y_train,y_train_pred)


    #Test set Perfomance
    test_accuracy=accuracy_score(y_test,y_test_pred)
    test_f1score=f1_score(y_test,y_test_pred)
    test_precision=precision_score(y_test,y_test_pred)
    test_recall=recall_score(y_test,y_test_pred)
    test_roc_auc_score=roc_auc_score(y_test,y_test_pred)

    print(list(models.keys())[i])
    print("**********")
    print("Model Perfomance for Training Set")
    print("Accuracy:{:.4f}".format(train_accuracy))
    print("F1 Score:{:.4f}".format(train_f1score))
    print("Precision:{:.4f}".format(train_precision))
    print("Recall:{:.4f}".format(train_recall))
    print("ROC AUC Score:{:.4f}".format(train_roc_auc_score))



    print("----------------------------------------")
    
    
    print("Model Perfomance for Test Set")
    print("Accuracy:{:.4f}".format(test_accuracy))
    print("F1 Score:{:.4f}".format(test_f1score))
    print("Precision:{:.4f}".format(test_precision))
    print("Recall:{:.4f}".format(test_recall))
    print("ROC AUC Score:{:.4f}".format(test_roc_auc_score))
    
    print("================================================")

#Here we can see the increase in the recall compare to before
#Plotting Roc AUC CURVE

Random forest
**********
Model Perfomance for Training Set
Accuracy:1.0000
F1 Score:1.0000
Precision:1.0000
Recall:1.0000
ROC AUC Score:1.0000
----------------------------------------
Model Perfomance for Test Set
Accuracy:0.9294
F1 Score:0.7864
Precision:0.9621
Recall:0.6649
ROC AUC Score:0.8293
adaboost
**********
Model Perfomance for Training Set
Accuracy:0.8465
F1 Score:0.3802
Precision:0.7699
Recall:0.2524
ROC AUC Score:0.6176
----------------------------------------
Model Perfomance for Test Set
Accuracy:0.8364
F1 Score:0.3496
Precision:0.7818
Recall:0.2251
ROC AUC Score:0.6049
