In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
ignore=True
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Content**
In order to enable machine learning experimentation, this dataset has been structured as follows:

Each row is a comparison between two groups of potential customers:<br>
1. Column names starting with "g1" represent characteristics of the first customer group (these were known before the campaign was run).
2. Column names starting with "g2" represent characteristics of the second customer group (these were known before the campaign was run)
3. Column names starting with "c_" are features representing some comparison of the two groups (also known before the campaign was run)

The last column, named "target", is categorical, with 3 categories:<br>
0 - none of the two groups were profitable<br>
1 - group1 turned out to be more profitable<br>
2 - group2 turned out to be more profitable<br>

**Inspiration**
Can you build a machine learning classifier that accurately predicts which of the 2 groups (if any) will turn out to be more profitable?

In [None]:
# Load the customer-targeting CSV from the Kaggle input mount into `df`.
df=pd.read_csv("/kaggle/input/predicting-profitable-customer-segments/customerTargeting.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

<h1 style="background-color:Crimson;color:white;text-align:center;">Analysing Target</h1>

In [None]:
# Bar chart of how many rows fall into each target class.
fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(x=df["target"], palette='rocket_r', ax=ax)

<h1 style="background-color:Crimson;color:white;text-align:center;">Dropping Feature(Variance Threshold)</h1>
<div class="alert alert-info">Dropping Columns based on low Variance since they do not contribute much in prediction. This method removes features with variation below a certain cutoff.</div>

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Label vector, and the feature matrix made of every other column.
Y = df['target']
X = df.drop(columns=['target'])

In [None]:
# threshold=0 flags only features with zero variance, i.e. the same value in every row.
var_thres = VarianceThreshold(threshold=0).fit(X)
var_thres

In [None]:
# Number of columns that pass the variance threshold (True entries in the get_support mask).
var_thres.get_support().sum()

In [None]:
# Constant columns are the ones the variance selector did not keep.
constant_columns = X.columns[~var_thres.get_support()].tolist()

In [None]:
# DataFrame.drop returns a new frame, so the result must be assigned for the
# constant columns to actually leave X. errors="ignore" keeps the cell re-runnable.
# NOTE(review): only X is changed here; df still contains these columns.
X = X.drop(columns=constant_columns, errors="ignore")
X.head()

<h1 style="background-color:Crimson;color:white;text-align:center;">Getting Correlation Among Features & with Target</h1>

In [None]:
# Feature correlation
# Absolute pairwise correlations between all columns of df (target included).
cor_target = df.corr().abs()
# |correlation| of every column with the target, read directly from the matrix.
# Calling .corr() again on the correlation matrix would measure how similar the
# correlation profiles are, not how each feature relates to the target.
Target_Corr = cor_target['target'].to_frame().reset_index()
# Long format: one row per ordered column pair, so each pair shows up twice (A,B) and (B,A).
Feature_corr = cor_target.unstack().to_frame(name='Correlation')
# Keep strongly related pairs; the `< 1` bound removes each column's correlation with itself.
is_strong_pair = (Feature_corr['Correlation'] >= 0.80) & (Feature_corr['Correlation'] < 1)
Feature = Feature_corr[is_strong_pair].sort_values(by='Correlation', ascending=False).reset_index()
display(Feature)

<div class="alert alert-info" role="alert">
Dropping Columns based on Correlation</div>

In [None]:
cor_target = df.corr()
# Keep only the strict upper triangle so every column pair is inspected once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = cor_target.where(np.triu(np.ones(cor_target.shape), k=1).astype(bool))
# Columns whose correlation with an earlier column exceeds 0.82. The target is
# never a drop candidate, otherwise the label itself could be discarded.
# NOTE(review): the signed correlation is compared, so strongly negative pairs are kept — confirm intended.
to_drop = [column for column in upper.columns
           if column != 'target' and any(upper[column] > 0.82)]
df = df.drop(columns=to_drop)

In [None]:
# Correlation matrix of the reduced frame.
my_corr = df.corr()
# Absolute correlation of every column with the target.
cor_target = my_corr["target"].abs()
# Keep the columns whose absolute correlation with the target is above 0.2.
relevant_features = cor_target.loc[cor_target > 0.2]
display(relevant_features)

<h1 style="background-color:Crimson;color:white;text-align:center;">Checking VIF</h1>

Variance inflation factor (VIF) is a measure of the amount of multicollinearity in a set of multiple regression variables. Mathematically, the VIF for a regression model variable is equal to the ratio of the overall model variance to the variance of a model that includes only that single independent variable. This ratio is calculated for each independent variable. A high VIF indicates that the associated independent variable is highly collinear with the other variables in the model.

1. A variance inflation factor (VIF) provides a measure of multicollinearity among the independent variables in a multiple regression model.
2. Detecting multicollinearity is important because while multicollinearity does not reduce the explanatory power of the model, it does reduce the statistical significance of the independent variables. 
3. A large variance inflation factor (VIF) on an independent variable indicates a highly collinear relationship to the other variables that should be considered or adjusted for in the structure of the model and selection of independent variables.

In [None]:
# Rebuild features/label from the correlation-reduced frame.
X, Y = df.drop(columns=['target']), df['target']

In [None]:
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF is computed on the features plus an added constant column.
X_vif = add_constant(X)
design_matrix = X_vif.values
# One VIF per column of the design matrix, in column order (the added constant included).
vif_values = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(design_matrix.shape[1])]
vif = pd.Series(vif_values, index=X_vif.columns)

In [None]:
# Ten largest VIF values.
top_vif = vif.sort_values(ascending=False).head(10)
display(top_vif)

In [None]:
# Shape of df at this point, after the correlation-based column drop above.
df.shape

<h1 style="background-color:Crimson;color:white;text-align:center;">Mutual Info Gain</h1>

Mutual information (MI) measures the amount of information shared between two random variables. It is symmetric and non-negative, it equals zero if and only if the two variables are independent, and higher values mean stronger dependency.

In [None]:
from sklearn.feature_selection import mutual_info_classif
# The estimator adds small random noise to continuous features, so the seed is
# fixed to make the scores (and any ranking built on them) repeatable.
mutual_info = mutual_info_classif(X, Y, random_state=42)
mutual_info

In [None]:
# Label each score with its feature name and list from most to least informative.
mutual_info = pd.Series(mutual_info, index=X.columns)
mutual_info.sort_values(ascending=False)

Selecting K-Best Features based on Target

In [None]:
# Selecting the top 10 columns by mutual information with SelectKBest
from functools import partial
from sklearn.feature_selection import SelectKBest
# Seed the score function: unseeded mutual information is stochastic, so the
# selected set could otherwise change from one run to the next.
sel_cols = SelectKBest(partial(mutual_info_classif, random_state=42), k=10)
sel_cols.fit(X,Y)
X.columns[sel_cols.get_support()]

<h1 style="background-color:Crimson;color:white;text-align:center;">Model</h1>

In [None]:
# Hand-picked feature subset used for modelling.
# NOTE(review): presumably copied from the K-best output above — confirm they still match.
selected_features = ['g1_1', 'g1_10', 'g1_5', 'g2_1', 'g2_19',
                     'c_2', 'c_3', 'c_10', 'c_11', 'c_25']
X = df[selected_features]
Y = df["target"]

In [None]:
# Shapes of the feature matrix and the label vector.
display(X.shape, Y.shape)

In [None]:
# Summary statistics of the selected features.
X.describe()

<h1 style="background-color:lightgreen;color:white;text-align:center;">Train-Test Split</h1>

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified on Y — confirm class proportions are acceptable in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

<h1 style="background-color:lightgreen;color:white;text-align:center;">Min-Max Scaling</h1>

In [None]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler(feature_range=(0,1))
# Learn each feature's min/max from the training split only.
X_train = mms.fit_transform(X_train)
# Apply the training-set scaling to the test split. Refitting on the test data
# would leak test statistics and put the two splits on different scales.
X_test = mms.transform(X_test)

<h1 style="background-color:lightgreen;color:white;text-align:center;">Implementing PYCARET (An AutoML Approach)</h1>

In [None]:
#!pip install pycaret

In [None]:
# Importing PyCaret Module based on type of Problem
from pycaret import classification
from pycaret.classification import * 

In [None]:
# Configure the PyCaret experiment: declare the numeric features to model on and
# the columns to leave out of the analysis.
# session_id seeds the experiment so model comparison and tuning are repeatable.
# NOTE(review): `silent` is only accepted by older PyCaret releases — confirm the installed version.
classification_setup = setup(
    data=df,
    target='target',
    numeric_features=['g1_1','g1_10','g1_5','g2_1','g2_19','c_2',
                      'c_3','c_11','c_10','c_25'],
    ignore_features=['g1_2', 'g1_3', 'g1_11', 'g1_7', 'g1_15','g1_16',
                     'g1_17', 'g1_18', 'g1_19', 'g1_20', 'g1_21','g2_2','g2_3', 'g2_5', 'g2_7',
                     'g2_10', 'g2_15', 'g2_16', 'g2_17','g2_18', 'g2_11', 'g2_20', 'c_1', 'c_5',
                     'c_4','c_7', 'c_8', 'c_9', 'c_13', 'c_14', 'c_15', 'c_16','c_22', 'c_23',
                     'c_24', 'c_27', 'c_28','c_6'],
    silent=True,
    session_id=42,
)

In [None]:
# PyCaret builds several candidate models on the data configured by setup() above
# and reports accuracy and other metrics for each, so they can be compared.
compare_models()

In [None]:
# Build the gradient boosting classifier ('gbc'), picked as the best performer in the comparison above.
# NOTE(review): the ranking depends on the run — confirm 'gbc' is still on top.
gbc  = create_model('gbc')

In [None]:
# Hyperparameter search for the gbc model: 100 iterations, optimising the AUC metric.
tuned_gbc = tune_model(gbc,n_iter = 100,optimize = 'AUC')

In [None]:
# Learning curve of the tuned model.
plot_model(estimator = tuned_gbc, plot = 'learning')

In [None]:
# ROC / AUC plot of the tuned model.
plot_model(estimator = tuned_gbc, plot = 'auc')

In [None]:
# Confusion matrix of the tuned model.
plot_model(estimator = tuned_gbc, plot = 'confusion_matrix')

In [None]:
# Feature importances according to the tuned gradient boosting model.
plot_model(estimator = tuned_gbc, plot = 'feature')

In [None]:
# PyCaret's evaluation view for the tuned model.
evaluate_model(tuned_gbc)

In [None]:
#interpret_model(tuned_gbc)

In [None]:
#predict_model(tuned_gbc, data=X_test)

<h1 style="background-color:lightgreen;color:white;text-align:center;">ADABoost Model</h1>

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-8 decision trees; random_state makes the boosting repeatable.
ada_base = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8), random_state=42)
# scikit-learn 1.2 renamed the `base_estimator` parameter to `estimator` (the old
# name was removed in 1.4), so use whichever prefix the installed version exposes
# for the nested tree parameters.
tree_param = "estimator" if "estimator" in ada_base.get_params(deep=False) else "base_estimator"
# NOTE(review): "SAMME.R" is deprecated in recent scikit-learn releases; on versions
# that reject it those grid candidates fail to fit — confirm against the installed version.
parameters = {f"{tree_param}__criterion": ["gini", "entropy"],
              f"{tree_param}__splitter": ["random"],
              "algorithm": ["SAMME", "SAMME.R"],
              "n_estimators": [5, 10, 15, 20, 100],
              "learning_rate": [0.05, 0.5, 1]}
# Exhaustive search with 3-fold cross-validation, scored by accuracy. The fitted
# search object is kept under the name `ada_clf`, which the later cells use.
ada_clf = GridSearchCV(ada_base, parameters, cv=3, scoring="accuracy")
ada_clf.fit(X_train, y_train)
print(f'Best parameters {ada_clf.best_params_}')
print('-----')
print(f'Mean cross-validated accuracy score of the best_estimator: {ada_clf.best_score_:.3f}')

In [None]:
# Score of the fitted grid search on the held-out split (the search was configured with scoring="accuracy").
print("Test Accuracy:",ada_clf.score(X_test, y_test))

<h1 style="background-color:lightgreen;color:white;text-align:center;">Confusion Matrix</h1>

In [None]:
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
from mlxtend.plotting import plot_confusion_matrix

# Predict on the held-out split and tabulate true vs. predicted classes.
Y_Pred = ada_clf.predict(X_test)
cnf_mat = confusion_matrix(y_true=y_test, y_pred=Y_Pred)
# Show both absolute counts and normalised values in each cell.
fig, ax = plot_confusion_matrix(
    conf_mat=cnf_mat,
    figsize=(8, 8),
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.show()

In [None]:
from yellowbrick.classifier import ConfusionMatrix
# Create the 8x8 figure before constructing the visualizer.
fig=plt.figure(figsize=(8,8))
classes=["0", "1", "2"]
# NOTE(review): both `classes` and `label_encoder` are supplied — confirm which one the
# installed yellowbrick version uses for the axis labels.
cnf_mat=ConfusionMatrix(ada_clf, classes=classes,label_encoder={0: 'non_Profitable', 1: 'Group_1', 2: 'Group_2'})
# NOTE(review): ada_clf is the grid search already fitted above; whether this call
# refits it depends on yellowbrick's fitted-estimator handling — confirm.
cnf_mat.fit(X_train, y_train)
# Evaluate on the held-out split, then render.
cnf_mat.score(X_test, y_test)
cnf_mat.show()

<h1 style="background-color:lightgreen;color:white;text-align:center;">Classification Report</h1>

In [None]:
# Create the 12x7 figure before constructing the visualizer.
fig=plt.figure(figsize=(12,7))
from yellowbrick.classifier import ClassificationReport
classes=["0", "1", "2"]
# support=True is passed so the report also includes per-class support.
visualizer = ClassificationReport(ada_clf, classes=classes, support=True)
visualizer.fit(X_train, y_train)        # Fit the visualizer (NOTE(review): may or may not refit ada_clf — confirm)
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()       

<h1 style="background-color:lightgreen;color:white;text-align:center;">Precision-Recall Curve</h1>

In [None]:
# Create the 15x8 figure before constructing the visualizer.
fig=plt.figure(figsize=(15,8))
from yellowbrick.classifier import PrecisionRecallCurve
# per_class=True requests a separate precision-recall curve for each class.
viz = PrecisionRecallCurve(ada_clf,per_class=True,
    cmap="Set1")
# Fit on the training split, score on the held-out split, then render.
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()

<h1 style="background-color:lightgreen;color:white;text-align:center;">AUROC</h1>

In [None]:
# Create the 18x8 figure before constructing the visualizer.
fig=plt.figure(figsize=(18,8))
from yellowbrick.classifier import ROCAUC
# ROC curves / AUC for the three target classes.
visualizer = ROCAUC(ada_clf, classes=["0", "1", "2"])
visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show() 

<h1 style="background-color:lightgreen;color:white;text-align:center;">Class Prediction Error</h1>

In [None]:
from yellowbrick.classifier import ClassPredictionError
# Create the 18x8 figure before constructing the visualizer.
fig=plt.figure(figsize=(18,8))
# NOTE(review): per_class and cmap are passed as in the precision-recall cell — confirm
# that ClassPredictionError actually uses them.
visualizer = ClassPredictionError(ada_clf,per_class=True,cmap="Set1")
# Fit the training data to the visualizer
visualizer.fit(X_train, y_train)
# Evaluate the model on the test data
visualizer.score(X_test, y_test)
# Draw visualization
visualizer.show()

<h1 style="background-color:lightgreen;color:white;text-align:center;">Observations</h1>

Based on my Analysis model would suggest to go for Customers in Group 1. Reason being:
1. Our Model Accuracy being 59% says about 59% of time it correctly labelled the customer in respective groups.
2. The precision of the model for Class 1 (0.63) is higher than for Class 0 (0.36) and Class 2 (0.57). Precision describes the customers the model labels as Class 1: of every 100 customers predicted as Class 1, about 63 actually belong to Class 1.
3. In this scenario we want our assumptions to be more correct rather than false negatives and true negative( i.e we want to be more confident in our decisions so based on True Positive rates we can see 80% True Positive Scores.). Though it would also depend on customer behaviour and other demographic and social factors.
4. Also the Recall Score of Class 1 Group is better than other two group which says (False Positive better than False Negative). 
<br>

**Suggestion:** Should perform A/B Testing. 

<br>
**Still Room for Improvement**