In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Read the Travel.csv file from the Kaggle input directory into a DataFrame.
csv_path = '../input/holiday-package-purchase-prediction/Travel.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
data.info()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics of the numeric columns, rounded to whole numbers.
data.describe().round()

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Heatmap of the boolean null mask, drawn before any imputation.
plt.figure(figsize=(10, 7))
sns.set_style('darkgrid')
null_mask = data.isnull()
null_ax = sns.heatmap(null_mask, cmap='hot')
null_ax.set_title('Null Values Heatmap')
plt.show()

## Data Cleaning and Imputing missing values

In [None]:
# Remove the CustomerID column from the frame.
data = data.drop('CustomerID', axis=1)

In [None]:
# Mean-based fill values are computed up front and rounded to whole numbers.
Rounded_duration = round(data['DurationOfPitch'].mean())
Rounded_monthly_income = round(data['MonthlyIncome'].mean())

# Columns whose gaps are filled with their most frequent value.
mode_filled = (
    'Age',
    'TypeofContact',
    'NumberOfFollowups',
    'PreferredPropertyStar',
    'NumberOfTrips',
    'NumberOfChildrenVisiting',
)
for name in mode_filled:
    most_common = data[name].mode().values[0]
    data[name] = data[name].fillna(most_common)

# Columns whose gaps are filled with the rounded column mean.
mean_filled = {
    'DurationOfPitch': Rounded_duration,
    'MonthlyIncome': Rounded_monthly_income,
}
for name, fill_value in mean_filled.items():
    data[name] = data[name].fillna(fill_value)

In [None]:
# Redraw the null-mask heatmap after imputation to check what is still missing.
plt.figure(figsize=(10, 7))
remaining_nulls = data.isnull()
check_ax = sns.heatmap(remaining_nulls, cmap='hot')
check_ax.set_title('Null Values Heatmap')
plt.show()

## No null values remain. Let's jump into EDA

In [None]:
# Correlation heatmap over the numeric columns only. The frame still holds
# string columns at this point (e.g. Gender is encoded later), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
plt.figure(figsize=(15, 7))
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='hot')
plt.title('Data Correlation')
plt.show()

In [None]:
# Count plot of the ProdTaken label.
label_ax = sns.countplot(data=data, x='ProdTaken', palette='Oranges')
label_ax.set_title('Distribution of our label.')
plt.show()

## This is an imbalanced dataset, so we have to perform oversampling on it. We will do that further below.

In [None]:
# Columns plotted as categorical features. Each name appears exactly once so
# the plotting loop does not draw the same feature twice.
Categorical_Features = [
    'TypeofContact', 'CityTier', 'Occupation', 'Gender', 'NumberOfPersonVisiting',
    'ProductPitched', 'MaritalStatus', 'Passport', 'PitchSatisfactionScore', 'OwnCar',
    'NumberOfChildrenVisiting', 'Designation', 'PreferredPropertyStar', 'NumberOfFollowups',
]
# Columns plotted as continuous features (histograms).
Continous_Features = ['Age', 'DurationOfPitch', 'NumberOfTrips', 'MonthlyIncome']

## Distribution of Categorical Values

In [None]:
# For every categorical feature draw two count plots side by side:
# the overall distribution, and the same counts split by the ProdTaken label.
for columns in Categorical_Features:
    fig, (ax_overall, ax_by_label) = plt.subplots(1, 2, figsize=(15, 7))

    sns.countplot(x=data[columns], palette='Set2', ax=ax_overall)
    ax_overall.set_title('Distribution of {}'.format(columns))

    sns.countplot(x=data[columns], hue=data['ProdTaken'], palette='dark', ax=ax_by_label)
    ax_by_label.set_title('Distribution of {} with respect to Product taken'.format(columns))

    plt.show()
    # Release the figure so open figures do not accumulate across iterations.
    plt.close(fig)

## Distribution of Continuous Values

In [None]:
# One black histogram per continuous feature, each on its own figure.
for feature in Continous_Features:
    plt.figure(figsize=(10, 7))
    data[feature].plot.hist(color='Black')
    plt.title(f'Distribution of {feature}')
    plt.show()

## Let's Encode the Categorical Data

In [None]:
# Gender: 'Fe Male' is treated as a misspelling of 'Female'; encode Male -> 0, Female -> 1.
gender_codes = {'Male': 0, 'Female': 1, 'Fe Male': 1}
data['Gender'] = data['Gender'].replace(gender_codes)

# ProductPitched: encode the product names as the integers 0..4.
product_codes = {'Basic': 0, 'Standard': 1, 'Deluxe': 2, 'Super Deluxe': 3, 'King': 4}
data['ProductPitched'] = data['ProductPitched'].replace(product_codes)

# One-hot encode the remaining string columns. Each column is listed once:
# repeating a name in `columns` makes get_dummies emit its dummy columns twice,
# leaving duplicate column labels in the frame.
one_hot_columns = ['MaritalStatus', 'TypeofContact', 'Designation', 'Occupation']
data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)

## Performing Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Keep a single copy of every column label rather than dropping
# 'TypeofContact_Self Enquiry' by name: drop(columns=...) removes *all*
# columns carrying that label, which would discard the feature entirely.
data = data.loc[:, ~data.columns.duplicated()]

# Feature matrix and target label.
x = data.drop(columns='ProdTaken')
y = data['ProdTaken']

# Named `oversampler` rather than `os` so the standard-library `os` module
# imported in the first cell is not shadowed; seeded so resampling is reproducible.
oversampler = RandomOverSampler(random_state=10)
x_oversample, y_oversample = oversampler.fit_resample(x, y)
print(x_oversample.shape)
print(y_oversample.shape)

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Split the original (non-oversampled) rows first, stratified on the label, so
# that copies of the same minority row cannot land on both sides of the split.
# Only the training portion is then oversampled; the test set keeps the
# original class balance and contains no duplicated rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=10, stratify=y
)
x_train, y_train = RandomOverSampler(random_state=10).fit_resample(x_train, y_train)

## Train Our Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Seed the forest so the reported score and confusion matrix are reproducible
# across Restart & Run All.
rfc = RandomForestClassifier(random_state=10)
rfc.fit(x_train, y_train)

# Mean accuracy on the held-out split, plus the per-row predictions.
score = rfc.score(x_test, y_test)
y_predict = rfc.predict(x_test)

# Last expression of the cell, so the matrix is displayed as the cell output.
confusion_matrix(y_test, y_predict)

In [None]:
# Display the value returned by rfc.score(x_test, y_test) in the training cell.
score