## Data Importing

In [1]:
# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
df = pd.read_csv("Travel.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4662 non-null   float64
 3   TypeofContact             4863 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport

In [8]:
## Data Cleaning
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [3]:
df.isna().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [18]:
df['TypeofContact'].unique()

array(['Self Enquiry', 'Company Invited', nan], dtype=object)

In [11]:
df['ProdTaken'].value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [15]:
df['Gender'].value_counts()
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [16]:
df['Gender'] = df['Gender'].replace('Fe Male', 'Female')
df['MaritalStatus'] = df['MaritalStatus'].replace('Single','Unmarried')

In [19]:
features_with_na = [feature for feature in df.columns if df[feature].isnull().sum()>=1]

In [21]:
for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean()*100,5),'% missing')

Age 4.62357 % missing
TypeofContact 0.51146 % missing
DurationOfPitch 5.13502 % missing
NumberOfFollowups 0.92062 % missing
PreferredPropertyStar 0.53191 % missing
NumberOfTrips 2.86416 % missing
NumberOfChildrenVisiting 1.35025 % missing
MonthlyIncome 4.76678 % missing


In [23]:
df[features_with_na].select_dtypes(exclude='object').describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


In [26]:
df.Age.fillna(df.Age.median(), inplace=True)
df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)
df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0], inplace=True)
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0], inplace=True)
df.NumberOfTrips.fillna(0, inplace=True)
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0], inplace=True)
df.MonthlyIncome.fillna(df.MonthlyIncome.median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Age.fillna(df.Age.median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

In [27]:
df.drop('CustomerID', axis=1, inplace=True)

### Feature Engineering

In [30]:
df["TotalNumberOfPeopleVisiting"] = df['NumberOfChildrenVisiting']+df['NumberOfPersonVisiting']
df.drop(['NumberOfPersonVisiting','NumberOfChildrenVisiting'], axis=1,inplace=True)

In [35]:
# numeric features
num_features = [feature for feature in df.columns if df[feature].dtype!='object']
# categorial features
cat_features = [feature for feature in df.columns if df[feature].dtype=='object']
# discrete features
dis_features = [feature for feature in num_features if len(df[feature].unique())<=25]
# continous features
con_features = [feature for feature in num_features if feature not in dis_features]

In [36]:
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalNumberOfPeopleVisiting
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,Manager,20993.0,3.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Unmarried,7.0,1,3,0,Executive,17090.0,3.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


### Train Test Split 

In [38]:
from sklearn.model_selection import train_test_split
X = df.drop(['ProdTaken'], axis=1)
y = df['ProdTaken']

In [41]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)
X_train.shape,X_test.shape

((3910, 17), (978, 17))

In [42]:
# create column transformer with 3 types of transformers
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
one_Hot_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",one_Hot_transformer,cat_features),
        ("StandardScaler",numeric_transformer,num_features)
    ]
)

In [45]:
preprocessor

In [44]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score, recall_score,f1_score

In [48]:
models = {
    "Random Forest":RandomForestClassifier()
}

In [49]:
# Model Training
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train) 

In [50]:
# Model predictions
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

In [55]:
# Model Performance Train
accuracy_score_train = accuracy_score(y_train_predict,y_train)
confusion_matrix_train = confusion_matrix(y_train_predict,y_train)
precision_score_train = precision_score(y_train_predict,y_train)
recall_score_train = recall_score(y_train_predict,y_train)
f1_score_train = f1_score(y_train_predict,y_train)
print('accuracy_score_train: ',accuracy_score_train)
print('confusion_matrix_train: ',confusion_matrix_train)
print('precision_score_train: ',precision_score_train)
print('recall_score_train: ',recall_score_train)
print('f1_score_train: ',f1_score_train)

accuracy_score_train:  1.0
confusion_matrix_train:  [[3181    0]
 [   0  729]]
precision_score_train:  1.0
recall_score_train:  1.0
f1_score_train:  1.0


In [56]:
# Model Performance Test
accuracy_score_test = accuracy_score(y_test_predict,y_test)
confusion_matrix_test = confusion_matrix(y_test_predict,y_test)
precision_score_test = precision_score(y_test_predict,y_test)
recall_score_test = recall_score(y_test_predict,y_test)
f1_score_test = f1_score(y_test_predict,y_test)
print('accuracy_score_test: ',accuracy_score_test)
print('confusion_matrix_test: ',confusion_matrix_test)
print('precision_score_test: ',precision_score_test)
print('recall_score_test: ',recall_score_test)
print('f1_score_test: ',f1_score_test)

accuracy_score_test:  0.9274028629856851
confusion_matrix_test:  [[783  67]
 [  4 124]]
precision_score_test:  0.6492146596858639
recall_score_test:  0.96875
f1_score_test:  0.7774294670846394
