We work at Delhi Delights!, a food delivery company in Delhi. It offers a premium membership called ‘Delighted Members’, which waives the delivery cost on your orders. Lately, purchases of this premium membership have been going down. Based on past data, Delhi Delights! now wants to predict which customers will buy the 'Delighted Members' membership and which will not.


![image1](https://miro.medium.com/max/875/0*8CoIJ0viQsn4UH4G.jpg)

In [None]:
# Importing the required libraries
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [None]:
# Reading the csv file and putting it into 'df' object.
# The path is an absolute /kaggle/input location, so this cell only runs where that dataset is mounted.
df = pd.read_csv(r"/kaggle/input/delhi-delights-data/DelhiDelightsData.csv")

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Requests every row (n = len(df)) rather than the default short preview.
# NOTE(review): df.head() would be enough if only a preview is wanted.
df.head(len(df))

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics of the frame's columns.
df.describe()

# EDA 

In [None]:
# Distribution of the average delivery rating.
# The series is passed as `x=` explicitly: current seaborn releases treat the
# first positional argument as `data`, so a positional series is ambiguous.
plt.figure(figsize=(10, 5))
ax = sns.violinplot(x=df['Average Delivery Rating (a1)'])
plt.show()

No outliers exist for this field.

In [None]:
# Class counts of the target column, with the count written above each bar.
plt.figure(figsize=(15, 5))
# Pass the series as `x=`: current seaborn treats the first positional
# argument as `data`, not as the variable to count.
ax = sns.countplot(x=df['"Delighted Members" Purchase'])
for bar in ax.patches:
    # Anchor the label at the horizontal centre of the bar, just above its top;
    # a multiplicative offset on get_x() would drift with the bar's coordinate.
    ax.annotate(
        f"{bar.get_height():.0f}",
        (bar.get_x() + bar.get_width() / 2, bar.get_height()),
        ha="center",
        va="bottom",
    )
plt.xticks(rotation=45)
plt.show()

**Balanced dataset** with respect to the '"Delighted Members" Purchase' field

In [None]:
# Percentage share of each target value, most frequent first.
purchase_counts = df['"Delighted Members" Purchase'].value_counts(ascending=False)
purchase_counts * 100 / len(df)

In [None]:
# Distribution of the average number of orders per month.
# The series is passed as `x=` explicitly: current seaborn releases treat the
# first positional argument as `data`, so a positional series is ambiguous.
plt.figure(figsize=(10, 5))
ax = sns.violinplot(x=df['Average Orders per month (a2)'])
plt.show()

No outliers exist for this field.

In [None]:
# Delivery rating distribution, one violin per value of the purchase column.
plt.figure(figsize = (10,5))
sns.violinplot(y = 'Average Delivery Rating (a1)', x = '"Delighted Members" Purchase', data = df)
plt.show()

In [None]:
# Orders per month for each delivery-rating value; each violin is split by the
# purchase column (hue + split=True) and shows quartile lines inside.
# NOTE(review): split=True presumably relies on the hue column having exactly two levels — confirm.
plt.figure(figsize = (10,5))
sns.violinplot(y = 'Average Orders per month (a2)',x='Average Delivery Rating (a1)', 
               hue = '"Delighted Members" Purchase', split=True,data = df,inner="quartile")
plt.show()

In [None]:
# Orders-per-month distribution, one violin per value of the purchase column.
plt.figure(figsize = (10,5))
sns.violinplot(y = 'Average Orders per month (a2)', x = '"Delighted Members" Purchase', data = df)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between columns.
plt.figure(figsize=(10, 5))
# Correlate numeric/boolean columns only: pandas 2.x raises on non-numeric
# columns instead of silently dropping them, and select_dtypes also works on
# older pandas versions that lack the `numeric_only` argument of corr().
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), annot=True, cmap="rainbow")
plt.show()

In [None]:
# Name of the response column, kept in one place so X and y stay in sync.
target_col = '"Delighted Members" Purchase'

# Response variable
y = df[target_col]

# Feature matrix: every column except the response
X = df.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing (train_size=0.7); random_state is fixed
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=10)
# Shapes of the two feature partitions.
X_train.shape, X_test.shape

In [None]:
from sklearn.tree import DecisionTreeClassifier

### Decision Tree over whole dataset

In [None]:
# Tree with default hyper-parameters fitted on every row (no hold-out).
# NOTE(review): no random_state is set here, unlike the later trees, so the
# fitted tree may differ between runs.
dt = DecisionTreeClassifier()
dt.fit(X, y)

In [None]:
from sklearn import tree

# Draw the fitted tree, colouring each node by its majority class.
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(dt,
                   # A plain list: newer scikit-learn versions reject a pandas Index here.
                   feature_names=list(X.columns),
                   # plot_tree maps names to classes in ascending class order, so take
                   # the labels from the fitted model instead of hard-coding an order.
                   class_names=[str(label) for label in dt.classes_],
                   filled=True)

### Decision Tree over train dataset

In [None]:
# Tree with default hyper-parameters, fitted on the training split only;
# random_state is fixed so the fitted tree is repeatable.
dt = DecisionTreeClassifier(random_state=7)
dt.fit(X_train, y_train)

In [None]:
from sklearn import tree

# Draw the tree fitted on the training split, colouring each node by its majority class.
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(dt,
                   # A plain list: newer scikit-learn versions reject a pandas Index here.
                   feature_names=list(X.columns),
                   # plot_tree maps names to classes in ascending class order, so take
                   # the labels from the fitted model instead of hard-coding an order.
                   class_names=[str(label) for label in dt.classes_],
                   filled=True)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
def evaluate_model(dt_classifier):
    """Print accuracy and confusion matrix of a fitted classifier on both splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and prints the results; nothing is returned.

    Args:
        dt_classifier: A fitted estimator exposing ``predict``.
    """
    # Predict once per split and reuse the result for both metrics.
    train_pred = dt_classifier.predict(X_train)
    print("Train Accuracy :", round(accuracy_score(y_train, train_pred), 2))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, train_pred))
    print("-" * 50)
    test_pred = dt_classifier.predict(X_test)
    print("Test Accuracy :", round(accuracy_score(y_test, test_pred), 2))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, test_pred))

In [None]:
# Report train and test metrics for the tree fitted on the training split.
evaluate_model(dt)

**A train accuracy of 1 suggests that the model is overfitting the training data, so we tune its hyper-parameters next.**

In [None]:
# Fresh, unfitted tree used as the base estimator for the grid search.
dt = DecisionTreeClassifier(random_state=50)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Parameter grid to search exhaustively:
# 5 depths x 5 leaf sizes x 2 split criteria = 50 candidate settings.
params = {
    'max_depth': [2, 3, 4,5, 6],
    'min_samples_leaf': [1,2,3,4,5],
    'criterion': ["gini", "entropy"]
}

In [None]:
# Instantiate the grid search model:
# 4-fold cross-validation scored on accuracy, with n_jobs=-1 for parallel fitting.
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [None]:
%%time
# Cross-validate every parameter combination on the training split.
# (%%time must stay the first line of the cell.)
grid_search.fit(X_train, y_train)

In [None]:
# Per-candidate cross-validation results as a DataFrame.
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

In [None]:
# The five candidates with the highest mean cross-validated score.
score_df.nlargest(5,"mean_test_score")

In [None]:
# Best estimator selected by the search.
grid_search.best_estimator_

In [None]:
# Keep a handle on the tuned tree for plotting and evaluation below.
dt_best = grid_search.best_estimator_

In [None]:
# Draw the tuned tree, colouring each node by its majority class.
fig = plt.figure(figsize=(25, 10))
_ = tree.plot_tree(dt_best,
                   # A plain list: newer scikit-learn versions reject a pandas Index here.
                   feature_names=list(X.columns),
                   # plot_tree maps names to classes in ascending class order, so take
                   # the labels from the fitted model instead of hard-coding an order.
                   class_names=[str(label) for label in dt_best.classes_],
                   filled=True)

In [None]:
# Report train and test metrics for the tuned tree.
evaluate_model(dt_best)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class report for the tuned tree on the test split.
print(classification_report(y_test, dt_best.predict(X_test)))

In [None]:
# Same report on the training split, for comparison with the test figures.
print(classification_report(y_train, dt_best.predict(X_train)))