<a href="https://colab.research.google.com/github/rafael-ariascalles/MachineLearningExamples/blob/main/Travel_Package_Purchase_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install pandas-profiling==2.7.1

# Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
# Bind `plt` to the pyplot module itself. Importing the `plot` function under
# that name would break every conventional `plt.<something>` call.
import matplotlib.pyplot as plt
import pandas_profiling

# Business Objective

A company is planning to launch a new product, the Wellness Tourism Package. Wellness tourism is defined as travel that allows the traveler to maintain, enhance, or kick-start a healthy lifestyle, and to support or increase their sense of well-being.

The goals are to:

* Give recommendations to the Policy Maker and the Marketing Team for the new package

* Build a model that predicts which potential customers will purchase the newly introduced package

# Explore and Process Data

### Data Source and Data Understanding

In [2]:
# Load the worksheet at position 1 (sheet positions are zero-based) of the
# Tourism workbook. The relative path is resolved from the working directory.
data = pd.read_excel(
    io="drive/MyDrive/PGP-AIML/Tourism.xlsx",
    sheet_name=1,
)

In [3]:
def data_understanding(df):
    """Print and display a quick structural overview of a DataFrame.

    Emits, in order: a small random sample of rows, the column and row
    counts, the ``df.info()`` report, ``df.describe()`` and
    ``df.describe(include=["O"])``, each ``describe`` table being followed
    by the names of the columns it covers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize. It is only read, never modified.

    Returns
    -------
    None
        All results are emitted through ``print`` and ``display``.

    Notes
    -----
    ``display`` is not imported here; the function relies on the name that
    IPython/Jupyter makes available inside notebook cells.
    """
    n_rows, n_columns = df.shape
    print("------------ SAMPLE DATA ----------------------")
    # Sample the frame that was passed in (not the notebook-level `data`),
    # and cap the sample size so frames with fewer than 5 rows do not raise.
    display(df.sample(n=min(5, n_rows)))
    print("Number of Columns: {} and Rows: {}".format(n_columns, n_rows))
    print("------------ INFORMATION ----------------------")
    # DataFrame.info() writes its own report and returns None, so it must not
    # be wrapped in print() (that appended a stray "None" line).
    df.info()
    print("------------ NUMERIC TYPE ---------------------")
    dm = df.describe()
    display(dm)
    print(dm.columns)
    print("------------ OBJECT TYPE ----------------------")
    dc = df.describe(include=["O"])
    display(dc)
    print(dc.columns)

In [4]:
# Print the sample, info and describe() overview for the loaded dataset.
data_understanding(data)

------------ SAMPLE DATA ----------------------


Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisited,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisited,Designation,MonthlyIncome
445,200445,0,55.0,Self Enquiry,3,24.0,Salaried,Female,2,3.0,Super Deluxe,4.0,Single,4.0,0,2,0,1.0,AVP,31835.0
1402,201402,0,31.0,Self Enquiry,1,8.0,Small Business,Female,2,5.0,Basic,3.0,Married,2.0,1,5,1,0.0,Executive,16129.0
781,200781,0,,Self Enquiry,1,6.0,Small Business,Male,2,4.0,Basic,5.0,Divorced,2.0,0,3,0,1.0,Executive,
1476,201476,0,59.0,Self Enquiry,1,9.0,Small Business,Female,2,1.0,Basic,5.0,Married,5.0,1,1,0,1.0,Executive,17670.0
822,200822,0,,Company Invited,1,8.0,Salaried,Male,3,3.0,Deluxe,3.0,Single,3.0,0,4,1,2.0,Manager,


Number of Columns: 20 and Rows: 4888
------------ INFORMATION ----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CustomerID               4888 non-null   int64  
 1   ProdTaken                4888 non-null   int64  
 2   Age                      4662 non-null   float64
 3   TypeofContact            4863 non-null   object 
 4   CityTier                 4888 non-null   int64  
 5   DurationOfPitch          4637 non-null   float64
 6   Occupation               4888 non-null   object 
 7   Gender                   4888 non-null   object 
 8   NumberOfPersonVisited    4888 non-null   int64  
 9   NumberOfFollowups        4843 non-null   float64
 10  ProductPitched           4888 non-null   object 
 11  PreferredPropertyStar    4862 non-null   float64
 12  MaritalStatus            4888 non-null   object

Unnamed: 0,CustomerID,ProdTaken,Age,CityTier,DurationOfPitch,NumberOfPersonVisited,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisited,MonthlyIncome
count,4888.0,4888.0,4662.0,4888.0,4637.0,4888.0,4843.0,4862.0,4748.0,4888.0,4888.0,4888.0,4822.0,4655.0
mean,202443.5,0.188216,37.622265,1.654255,15.490835,2.905074,3.708445,3.581037,3.236521,0.290917,3.078151,0.620295,1.187267,23619.853491
std,1411.188388,0.390925,9.316387,0.916583,8.519643,0.724891,1.002509,0.798009,1.849019,0.454232,1.365792,0.485363,0.857861,5380.698361
min,200000.0,0.0,18.0,1.0,5.0,1.0,1.0,3.0,1.0,0.0,1.0,0.0,0.0,1000.0
25%,201221.75,0.0,31.0,1.0,9.0,2.0,3.0,3.0,2.0,0.0,2.0,0.0,1.0,20346.0
50%,202443.5,0.0,36.0,1.0,13.0,3.0,4.0,3.0,3.0,0.0,3.0,1.0,1.0,22347.0
75%,203665.25,0.0,44.0,3.0,20.0,3.0,4.0,4.0,4.0,1.0,4.0,1.0,2.0,25571.0
max,204887.0,1.0,61.0,3.0,127.0,5.0,6.0,5.0,22.0,1.0,5.0,1.0,3.0,98678.0


Index(['CustomerID', 'ProdTaken', 'Age', 'CityTier', 'DurationOfPitch',
       'NumberOfPersonVisited', 'NumberOfFollowups', 'PreferredPropertyStar',
       'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar',
       'NumberOfChildrenVisited', 'MonthlyIncome'],
      dtype='object')
------------ OBJECT TYPE ----------------------


Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
count,4863,4888,4888,4888,4888,4888
unique,2,4,3,5,4,5
top,Self Enquiry,Salaried,Male,Basic,Married,Executive
freq,3444,2368,2916,1842,2340,1842


Index(['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'MaritalStatus', 'Designation'],
      dtype='object')


### Exploratory Data Analysis

In [8]:
# Build a pandas-profiling report object from the loaded dataset; kept in its
# own cell so the report can be rendered separately below.
baseline_report = pandas_profiling.ProfileReport(data)

In [10]:
# Show the profiling report inside the notebook (embedded as an iframe, per the method name).
baseline_report.to_notebook_iframe()

### Data Preprocessing

# Modeling

### Model Training

### Model Evaluation

# Business Insight