In [None]:
# Problem statement
# The Tour & Travels Customer Churn Prediction dataset helps a travel company predict customer churn. It includes indicators such as age, frequent-flyer status, annual income class, how frequently services were opted for, social media account synchronization, and hotel bookings.
# The goal is to build predictive models that save company resources. The dataset, used for practice and in a hackathon, is freely available. Analysts can perform exploratory data analysis to reveal insights for effective churn prediction. The binary target variable distinguishes customers who churn (1) from those who don't (0), and it guides the modeling process.

In [1]:
# Silence every warning category for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')

In [9]:
# Importing all the necessary libraries

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

%matplotlib inline


# Library for modelling
import statsmodels.api as sm

# Library for train test split
from sklearn.model_selection import train_test_split

# Library for Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_selection import RFE

# Library for scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Library for Variance Inflation Factor
from statsmodels.stats.outliers_influence import variance_inflation_factor


#Library for analyzing the model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve

In [10]:
# Load the customer travel data into a DataFrame
df = pd.read_csv('Customertravel.csv')

# Preview the first 5 rows of the data set
df.head(n=5)

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target
0,34,No,Middle Income,6,No,Yes,0
1,34,Yes,Low Income,5,Yes,No,1
2,37,No,Middle Income,3,Yes,No,0
3,30,No,Middle Income,2,No,No,0
4,30,No,Low Income,1,No,No,0


In [11]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(954, 7)

In [12]:
# Summarise the column names, non-null counts and dtypes of the dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Age                         954 non-null    int64 
 1   FrequentFlyer               954 non-null    object
 2   AnnualIncomeClass           954 non-null    object
 3   ServicesOpted               954 non-null    int64 
 4   AccountSyncedToSocialMedia  954 non-null    object
 5   BookedHotelOrNot            954 non-null    object
 6   Target                      954 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 37.3+ KB


In [13]:
# Number of missing values in each column
df.isna().sum()

Age                           0
FrequentFlyer                 0
AnnualIncomeClass             0
ServicesOpted                 0
AccountSyncedToSocialMedia    0
BookedHotelOrNot              0
Target                        0
dtype: int64

In [19]:
#EDA

In [20]:
#'Target' column:

In [21]:
# Share of each Target class, as a percentage rounded to 2 decimals

(df['Target'].value_counts(normalize=True) * 100).round(2)

0    76.52
1    23.48
Name: Target, dtype: float64

In [22]:
#'FrequentFlyer' column:

In [23]:

# Frequency of each category in the 'FrequentFlyer' column

df['FrequentFlyer'].value_counts()

No           608
Yes          286
No Record     60
Name: FrequentFlyer, dtype: int64

In [24]:
# Fold the 'No Record' category into 'No'
df['FrequentFlyer'] = df['FrequentFlyer'].replace({'No Record': 'No'})

In [25]:
# Category frequencies of 'FrequentFlyer' after the replacement
df['FrequentFlyer'].value_counts()

No     668
Yes    286
Name: FrequentFlyer, dtype: int64

In [26]:
#'AnnualIncomeClass' column:

In [27]:
# Frequency of each category in the 'AnnualIncomeClass' column

df['AnnualIncomeClass'].value_counts()

Middle Income    409
Low Income       386
High Income      159
Name: AnnualIncomeClass, dtype: int64

In [28]:
#'ServicesOpted' column:

In [29]:
# Frequency of each value in the 'ServicesOpted' column

df['ServicesOpted'].value_counts()

1    404
2    176
3    124
4    117
5     69
6     64
Name: ServicesOpted, dtype: int64

In [None]:
#'AccountSyncedToSocialMedia' column:

In [30]:
# Frequency of each category in the 'AccountSyncedToSocialMedia' column

df['AccountSyncedToSocialMedia'].value_counts()

No     594
Yes    360
Name: AccountSyncedToSocialMedia, dtype: int64

In [None]:
#'BookedHotelOrNot' column:

In [31]:
# Frequency of each category in the 'BookedHotelOrNot' column

df['BookedHotelOrNot'].value_counts()

No     576
Yes    378
Name: BookedHotelOrNot, dtype: int64

In [None]:
# Data preparation: converting binary variables (Yes/No) to 1/0

In [32]:
# Binary (Yes/No) columns that are to be converted
var = [
    'FrequentFlyer',
    'AccountSyncedToSocialMedia',
    'BookedHotelOrNot',
]

def binary_map(x):
    """Encode a Yes/No column as 1/0.

    Parameters
    ----------
    x : pandas.Series
        Column holding 'Yes'/'No' labels, or 1/0 codes left by a previous
        call.

    Returns
    -------
    pandas.Series
        1 for 'Yes' (or 1) and 0 for 'No' (or 0); any other value maps to NaN.
    """
    # 1 and 0 map to themselves so that re-running the encoding cell on an
    # already-encoded column is a no-op instead of turning it into all-NaN.
    return x.map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

# Encode each listed column and write the results back into the frame
df[var] = pd.DataFrame({name: binary_map(df[name]) for name in var})

df.head(n=5)

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target
0,34,0,Middle Income,6,0,1,0
1,34,1,Low Income,5,1,0,1
2,37,0,Middle Income,3,1,0,0
3,30,0,Middle Income,2,0,0,0
4,30,0,Low Income,1,0,0,0


In [None]:
#For categorical variables with multiple levels, we will create dummy variables:

In [33]:
# One-hot encode 'AnnualIncomeClass', dropping the first level as the baseline.
# The dtype is pinned to uint8 because pandas >= 2.0 would otherwise emit bool
# dummies; pinning it keeps the columns numeric on every pandas version.
dummy1 = pd.get_dummies(df[['AnnualIncomeClass']], drop_first=True, dtype='uint8')

# Adding the results to the master dataframe
df = pd.concat([df, dummy1], axis=1)

In [34]:
# Preview the frame after the dummy columns were appended
df.head()

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target,AnnualIncomeClass_Low Income,AnnualIncomeClass_Middle Income
0,34,0,Middle Income,6,0,1,0,0,1
1,34,1,Low Income,5,1,0,1,1,0
2,37,0,Middle Income,3,1,0,0,0,1
3,30,0,Middle Income,2,0,0,0,0,1
4,30,0,Low Income,1,0,0,0,1,0


In [35]:
# Remove the original 'AnnualIncomeClass' column from the frame, in place
df.drop(columns='AnnualIncomeClass', inplace=True)
df.head(n=5)

Unnamed: 0,Age,FrequentFlyer,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target,AnnualIncomeClass_Low Income,AnnualIncomeClass_Middle Income
0,34,0,6,0,1,0,0,1
1,34,1,5,1,0,1,1,0
2,37,0,3,1,0,0,0,1
3,30,0,2,0,0,0,0,1
4,30,0,1,0,0,0,1,0


In [36]:
# Re-check column dtypes and non-null counts of the prepared frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   Age                              954 non-null    int64
 1   FrequentFlyer                    954 non-null    int64
 2   ServicesOpted                    954 non-null    int64
 3   AccountSyncedToSocialMedia       954 non-null    int64
 4   BookedHotelOrNot                 954 non-null    int64
 5   Target                           954 non-null    int64
 6   AnnualIncomeClass_Low Income     954 non-null    uint8
 7   AnnualIncomeClass_Middle Income  954 non-null    uint8
dtypes: int64(6), uint8(2)
memory usage: 46.7 KB


In [38]:
# Train/test split preparation
# Predictors: every column except 'Target'
X = df.drop(columns=['Target'])

# Response: the 'Target' column
y = df['Target']

In [39]:
# 70/30 train/test partition with a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [40]:
# Shapes of the train and test data sets after the split, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(667, 7)
(287, 7)
(667,)
(287,)


In [41]:
# Feature scaling: standardise the numeric predictors
scaler = StandardScaler()

# Numeric columns to be scaled
num_cols = ['Age', 'ServicesOpted']

# X_train was produced from X by train_test_split; take an explicit copy so
# the column assignment below writes to an independent frame instead of
# triggering pandas' SettingWithCopyWarning (which the blanket
# warnings.filterwarnings('ignore') at the top of the notebook would hide).
X_train = X_train.copy()

# Fit the scaler on the training rows and replace the columns with the
# standardised values.
# NOTE(review): X_test should later go through scaler.transform (not
# fit_transform) — confirm this happens further down the notebook.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

# X_train dataframe after standard scaling
X_train.head()

Unnamed: 0,Age,FrequentFlyer,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,AnnualIncomeClass_Low Income,AnnualIncomeClass_Middle Income
105,-0.651985,1,2.202621,0,1,0,0
953,-0.351865,1,-0.867058,0,0,0,0
299,-0.952106,0,-0.253122,0,0,0,1
69,0.548496,1,0.97475,0,0,0,0
85,-1.552346,1,-0.867058,0,1,1,0


In [42]:
# Overall churn rate: rows with Target == 1 as a percentage of all rows
churn = (df['Target'].sum() / len(df['Target'])) * 100
churn

23.48008385744235