In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, plot_roc_curve, classification_report
from sklearn import tree
# `plt` must alias the pyplot submodule; the top-level `matplotlib` package
# does not expose plotting functions such as plot()/subplots().
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import os
# Recursively list every file available under the Kaggle input directory,
# printing one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

In [None]:
# Load the Social Network Ads CSV from the Kaggle input directory into a DataFrame.
ads = pd.read_csv('/kaggle/input/social-network-ads/Social_Network_Ads.csv')

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
ads.isna().sum()

In [None]:
# Mean age and mean estimated salary for each value of 'Purchased',
# rounded to whole numbers. Named aggregation sets the output column names
# directly, which also fixes the previously misspelled 'mean_slary' column.
(
    ads.groupby('Purchased', as_index=False)
       .agg(mean_age=('Age', 'mean'), mean_salary=('EstimatedSalary', 'mean'))
       .round()
)

In [None]:
# Target is the 'Purchased' column; the feature matrix is every other column.
Y = ads['Purchased']
X = ads.drop(columns='Purchased')

In [None]:
# Hold out 33% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Base estimator for the grid search. A fixed random_state makes tree
# construction (feature permutation / tie-breaking between equally good
# splits) deterministic, so the selected hyper-parameters and scores are
# reproducible across runs. The seed matches the one used for the split.
clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Hyper-parameter grid for the decision tree:
#   criterion          - split quality measure
#   max_depth          - 1..29
#   min_samples_split  - 2..9
#   min_samples_leaf   - 1..9
params = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(1, 30),
    min_samples_split=range(2, 10),
    min_samples_leaf=range(1, 10),
)

In [None]:
# Exhaustive search over every combination in `params`, each evaluated
# with 5-fold cross-validation.
gd_cv = GridSearchCV(estimator=clf, param_grid=params, cv=5)

In [None]:
# Run the grid search on the training split only; the test split stays unseen.
gd_cv.fit(X=X_train, y=y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
gd_cv.best_params_

In [None]:
# Cross-validated score of the selected combination (measured on the training split).
gd_cv.best_score_

In [None]:
# Score of the selected model on the held-out test split.
gd_cv.score(X=x_test, y=y_test)

In [None]:
# Keep a direct handle on the estimator chosen by the grid search.
best_clf = gd_cv.best_estimator_

In [None]:
# Predicted class probabilities for the test split (one column per class).
proba = best_clf.predict_proba(x_test)

In [None]:
# Distribution of the second probability column.
# NOTE(review): presumably this is P(Purchased == 1) -- confirm against best_clf.classes_.
pd.Series(proba[:, 1]).hist()

In [None]:
# Hard 0/1 labels from a stricter cut-off: 1 only when the second
# probability column is strictly above 0.8, otherwise 0.
thr = (proba[:, 1] > 0.8).astype(int)

In [None]:
# Class labels predicted for the test split by the fitted grid search object.
y_pred = gd_cv.predict(x_test)

In [None]:
# Per-class precision / recall / F1 of the default predictions against the true labels.
print(classification_report(y_test, y_pred))

In [None]:
# Per-class precision / recall / F1 of the 0.8-threshold predictions.
# classification_report expects (y_true, y_pred): the ground truth is y_test
# and the thresholded labels `thr` are the predictions being evaluated.
print(classification_report(y_test, thr))

In [None]:
# ROC AUC on the test split. The score must be computed from continuous
# scores (here the predicted probability column), not from hard 0/1 labels,
# otherwise the ROC curve collapses to a single operating point.
# NOTE(review): column 1 is assumed to be the positive class -- confirm
# against best_clf.classes_.
roc_auc_score(y_test, proba[:, 1])

In [None]:
# Plot the ROC curve of the selected estimator on the test split.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions use RocCurveDisplay.from_estimator(best_clf, x_test, y_test)
# and update the import cell accordingly.
plot_roc_curve(best_clf, x_test, y_test)