Table of contents
1. Import Required Libraries
2. Load Data
3. Data Information
4. Check for null values
5. EDA
    1. Check Outliers
    2. Univariate Analysis
    3. Bivariate Analysis
    4. Multivariate Analysis
    5. Correlation
6. Encode Categorical Variables
7. Splitting Data
8. Baseline Models
    1. Logistic Regression
    2. XGBoost Classifier
    3. Random Forest Classifier
    4. Gradient Boosting Classifier
    5. Stacking Classifier
9. Balancing target variable
10. Feature Selection
11. Dimensionality Reduction
12. Hyperparameter Tuning
13. Best Model

# Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.combine import SMOTETomek
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA

# display all columns of the dataframe
pd.options.display.max_columns = None

# display all rows of the dataframe
pd.options.display.max_rows = None

# suppress scientific (exponential) notation when NumPy prints floating-point values
np.set_printoptions(suppress=True)

# Load Data

In [None]:
# Read the semicolon-separated campaign CSV and preview the first rows
df = pd.read_csv('../input/arketing-campaign/marketing_campaign.csv', sep=';')
df.head()

In [None]:
# Drop the ID column because an identifier is not needed for predictions
df.drop('ID', axis=1, inplace = True)

# Data Information

In [None]:
# Report the dimensions of the dataset
n_rows, n_cols = df.shape
print('Data contains', n_rows, 'rows and', n_cols, 'columns')

In [None]:
# Column names, non-null counts and data types of the dataset
df.info()

In [None]:
# Summary statistics of the numerical columns, transposed so that each column is a row
df.describe().T

In [None]:
# Summary statistics of the object-typed (categorical) columns, transposed
df.describe(include='O').T

# Check for null values

In [None]:
# Count the missing values in each column of the dataset
df.isnull().sum()

Only the Income column contains null values.

# Filling Null Values

In [None]:
def fill_na(frame):
    """Impute missing values in place, one column at a time.

    Columns with more than 30% missing values are left untouched. Among the
    remaining columns, ``int64`` and ``float64`` columns are filled with
    their median and ``object`` columns with their most frequent value.
    Columns of any other dtype are not modified. Nothing is returned.
    """
    row_count = len(frame)
    for column in frame.columns:
        series = frame[column]
        missing_pct = (series.isnull().sum() / row_count) * 100
        # Written as "not <=" so that a NaN percentage (empty frame) is
        # skipped as well.
        if not missing_pct <= 30:
            continue

        kind = series.dtype
        if kind == 'int64' or kind == 'float64':
            frame[column] = series.fillna(series.median())
        elif kind == 'O':
            frame[column] = series.fillna(series.mode()[0])
            
# Impute the missing values of df in place
fill_na(df)

# EDA

# 1. Check Outliers

In [None]:
def detect_outliers(frame):
    """Draw a box plot for every ``int64`` or ``float64`` column of ``frame``.

    ``plt.show()`` is called after each plot; columns of any other dtype are
    skipped.
    """
    for column in frame.columns:
        values = frame[column]
        is_numeric = values.dtype == 'int64' or values.dtype == 'float64'
        if not is_numeric:
            continue
        sns.boxplot(values)
        plt.show()
            
# Box plots of all numeric columns of df
detect_outliers(df)

# 2. Univariate Analysis

In [None]:
def univariant(frame):
    """Plot the distribution of every ``int64`` or ``float64`` column.

    For each such column the column name is printed, then a density-scaled
    histogram with a KDE overlay is drawn and shown. Columns of any other
    dtype are skipped.

    ``sns.histplot`` is used because ``sns.distplot`` is deprecated in
    seaborn; ``kde=True`` and ``stat='density'`` keep the plot close to what
    ``distplot`` produced.
    """
    for column in frame.columns:
        values = frame[column]
        if values.dtype == 'int64' or values.dtype == 'float64':
            print(column)
            sns.histplot(x=values, kde=True, stat='density')
            plt.show()
            
# Distribution plots of all numeric columns of df
univariant(df)

In [None]:
# Plot the target variable (Response) separately: it is stored as an integer,
# but here it has to be treated as a categorical variable.
# The column is passed as x= because recent seaborn versions interpret the
# first positional argument of countplot as `data`, not as the x variable.
sns.countplot(x=df['Response'])
plt.show()

Our target variable (Response) is not balanced.

# 3. Multivariate Analysis

In [None]:
# Pairwise relationship plots between the columns of df
sns.pairplot(df)

# 4. Correlation

In [None]:
# Check correlation between the numeric variables.
# df still contains object columns at this point, and DataFrame.corr() in
# pandas >= 2.0 raises on non-numeric data, so the numeric columns are
# selected explicitly (older pandas silently ignored the others).
plt.figure(figsize=(30,25))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Convert Dt_Customer to a datetime column.
# pd.to_datetime is used instead of astype('datetime64') because pandas >= 2.0
# rejects casting to the unit-less 'datetime64' dtype.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

In [None]:
# Create three new columns from Dt_Customer: Date_Customer (day of the month),
# Month_Customer and Year_Customer
df['Date_Customer'] = df['Dt_Customer'].dt.day
df['Month_Customer'] = df['Dt_Customer'].dt.month
df['Year_Customer'] = df['Dt_Customer'].dt.year

In [None]:
# Dt_Customer can be dropped now that its day, month and year are stored in separate columns
df.drop('Dt_Customer', axis=1, inplace=True)

# Encode Categorical Variables

In [None]:
def encode(dataframe):
    """Label-encode every ``object`` column of ``dataframe`` in place.

    A single ``LabelEncoder`` instance is refit for each column, so the
    fitted mapping of earlier columns is not kept. Nothing is returned.
    """
    encoder = LabelEncoder()
    object_columns = [
        name for name in dataframe.columns if dataframe[name].dtype == 'object'
    ]
    for name in object_columns:
        dataframe[name] = encoder.fit_transform(dataframe[name])
            
# Replace the object columns of df with integer labels, in place
encode(df)

# Split data into train and test

In [None]:
# Separate the features (x) from the target column Response (y)
x = df.drop('Response', axis=1)
y = df['Response']

# Hold out 30% of the rows as the test set; random_state=1 makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# Let's Build Models

# Baseline Models

# 1. Logistic Regression

In [None]:
# Baseline logistic regression; max_iter is raised to 10000
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, Y_train)

In [None]:
# Predict on the test split and print the per-class classification report
lr_pred = lr.predict(X_test)
print(classification_report(Y_test, lr_pred))

# 2. XGBoost Classifier

In [None]:
# Baseline XGBoost classifier with default hyperparameters
xgb = XGBClassifier()
xgb.fit(X_train, Y_train)

In [None]:
# Evaluate the baseline XGBoost model on the test split
xgb_pred = xgb.predict(X_test)
print(classification_report(Y_test, xgb_pred))

# 3. Random Forest Classifier

In [None]:
# Baseline random forest with default hyperparameters
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)

In [None]:
# Evaluate the baseline random forest on the test split
rf_pred = rf.predict(X_test)
print(classification_report(Y_test, rf_pred))

# 4. Gradient Boosting Classifier

In [None]:
# Baseline gradient boosting classifier with default hyperparameters
gb = GradientBoostingClassifier()
gb.fit(X_train, Y_train)

In [None]:
# Evaluate the baseline gradient boosting model on the test split
gb_pred = gb.predict(X_test)
print(classification_report(Y_test, gb_pred))

In [None]:
# Overall test accuracy of the baseline gradient boosting model
accuracy_score(Y_test, gb_pred)

# 5. Stacking Classifier

In [None]:
# Base learners of the stacking ensemble: XGBoost, random forest and gradient boosting
estimators = [('xgb', XGBClassifier()),
             ('rf', RandomForestClassifier()),
             ('gb', GradientBoostingClassifier())]
# No final_estimator is passed, so StackingClassifier uses its default one
stack = StackingClassifier(estimators=estimators)
stack.fit(X_train, Y_train)

In [None]:
# Evaluate the baseline stacking ensemble on the test split
stack_pred = stack.predict(X_test)
print(classification_report(Y_test, stack_pred))

Of all my baseline models, the Gradient Boosting Classifier gives the best results.

# Balancing the target variable

In [None]:
# Rebalance the training split with SMOTETomek. The resampled data is stored
# in the lower-case x_train / y_train; X_train / Y_train and the test split
# are left unchanged.
smote = SMOTETomek()
x_train, y_train = smote.fit_resample(X_train, Y_train)

Building the models again using the resampled training set.

# 1. Logistic Regression

In [None]:
# Logistic regression trained on the resampled training data
slr = LogisticRegression(max_iter=10000)
slr.fit(x_train, y_train)

In [None]:
# Evaluate on the original (not resampled) test split
slr_pred = slr.predict(X_test)
print(classification_report(Y_test, slr_pred))

# 2. XGBoost Classifier

In [None]:
# XGBoost classifier trained on the resampled training data
sxgb = XGBClassifier()
sxgb.fit(x_train, y_train)

In [None]:
# Evaluate on the original (not resampled) test split
sxgb_pred = sxgb.predict(X_test)
print(classification_report(Y_test, sxgb_pred))

# 3. Random Forest Classifier

In [None]:
# Random forest trained on the resampled training data
srf = RandomForestClassifier()
srf.fit(x_train, y_train)

In [None]:
# Evaluate on the original (not resampled) test split
srf_pred = srf.predict(X_test)
print(classification_report(Y_test, srf_pred))

# 4. Gradient Boosting Classifier

In [None]:
# Gradient boosting classifier trained on the resampled training data
sgb = GradientBoostingClassifier()
sgb.fit(x_train, y_train)

In [None]:
# Evaluate on the original (not resampled) test split
sgb_pred = sgb.predict(X_test)
print(classification_report(Y_test, sgb_pred))

In [None]:
# Stacking ensemble (same `estimators` list as the baseline stack) trained on the resampled data
sstack = StackingClassifier(estimators=estimators)
sstack.fit(x_train, y_train)

In [None]:
# Evaluate on the original (not resampled) test split
sstack_pred = sstack.predict(X_test)
print(classification_report(Y_test, sstack_pred))

The models trained after balancing the target variable give good results. However, when I compare them with the baseline models, the baseline Gradient Boosting Classifier gives the highest accuracy, so I continue building my model from the baseline Gradient Boosting Classifier.

# Feature Selection

In [None]:
# Evaluate how the number of selected features affects test accuracy.
# Every distinct feature importance of the fitted baseline model `gb` is used
# as a SelectFromModel threshold. np.unique returns the values sorted and
# without repeats, so a threshold shared by several features is trained once.
th = np.unique(gb.feature_importances_)
for g in th:
    # prefit=True reuses the already fitted gb instead of fitting it again
    select = SelectFromModel(gb, threshold = g, prefit = True)
    x_Train = select.transform(X_train)
    # A fresh model is trained on the selected features only
    model = GradientBoostingClassifier()
    model.fit(x_Train, Y_train)
    x_Test = select.transform(X_test)
    y_pred = model.predict(x_Test)
    accuracy = accuracy_score(Y_test, y_pred)
    print('Threshold:', g, 'Model Score:', accuracy)

In [None]:
# List the features whose importance falls below the chosen threshold.
# The importances are taken from gb (not rf) so that they come from the same
# model whose importances were used as thresholds in the SelectFromModel loop.
# NOTE(review): the threshold literal looks copied from the output of one run;
# gb is not seeded, so confirm the value still applies.
imp = pd.DataFrame(gb.feature_importances_)
imp.index = X_train.columns
imp[imp[0] < 0.017037885998921535]

In [None]:
# Remove the two features excluded by feature selection from both splits
dropped_features = ['Z_CostContact', 'Z_Revenue']
X_train = X_train.drop(columns=dropped_features)
X_test = X_test.drop(columns=dropped_features)

# Building model after feature selection

In [None]:
# Gradient boosting classifier trained on the reduced feature set
fgb = GradientBoostingClassifier()
fgb.fit(X_train, Y_train)

In [None]:
# Evaluate the model on the test split with the same features removed
fgb_pred = fgb.predict(X_test)
print(classification_report(Y_test, fgb_pred))

In [None]:
# Overall test accuracy after feature selection
accuracy_score(Y_test, fgb_pred)

After feature selection I get the same accuracy, so I continue building the model from the baseline.

# Dimensionality Reduction

In [None]:
# First i check how many components we want
# For this first i am initializing the pca
pca = PCA()
# Fitting the training set in pca
pca.fit(X_train)

In [None]:
# Now check number of components
pca.explained_variance_ratio_

As shown above, the first principal component explains 99.97% of the variance in the data.

In [None]:
# Create a PCA that keeps 15 components
Pca = PCA(n_components=15)
# Fit the projection on the training data only and transform it
X_Train = Pca.fit_transform(X_train)
# Apply that same fitted projection to the test data. Calling fit_transform
# here would refit the PCA on X_test and produce different components, so the
# test features would no longer match the space the model is trained on.
X_Test = Pca.transform(X_test)

In [None]:
# Building models after applying pca
pgb = GradientBoostingClassifier()
pgb.fit(X_Train, Y_train)

In [None]:
# Evaluate on the PCA-transformed test data
pgb_pred = pgb.predict(X_Test)
print(classification_report(Y_test, pgb_pred))

# Hyperparameter Tuning

In [None]:
# Candidate hyperparameter values for the gradient boosting classifier
# NOTE(review): the loss name 'deviance' was renamed to 'log_loss' in
# scikit-learn 1.1 and removed in 1.3 - confirm against the installed version.
grid = {
    'learning_rate' : [0.2, 0.3, 0.4, 0.5],
    'n_estimators' : [300, 500, 700, 900],
    'min_samples_split' : [3, 4, 5, 6],
    'max_depth' : [2, 3, 4, 5],
    'loss' : ['deviance', 'exponential']
}
# Randomized search: 20 sampled combinations, 5-fold cross-validation, scored
# by accuracy, using all available cores (n_jobs=-1)
random_cv = RandomizedSearchCV(estimator=gb,
                              param_distributions=grid,
                              n_iter=20,
                              n_jobs=-1,
                              cv=5,
                              verbose=7,
                              random_state=10,
                              scoring='accuracy')
random_cv.fit(X_train, Y_train)

In [None]:
# Show the estimator selected by the randomized search
random_cv.best_estimator_

In [None]:
# Gradient boosting classifier with explicitly set hyperparameters
# NOTE(review): these values were presumably copied from random_cv.best_estimator_ - confirm
hgb = GradientBoostingClassifier(learning_rate=0.5, loss='exponential', max_depth=2,
                           min_samples_split=4, n_estimators=300)
hgb.fit(X_train, Y_train)

In [None]:
# Evaluate the tuned model on the test split
hgb_pred = hgb.predict(X_test)
print(classification_report(Y_test, hgb_pred))

# My Best Model
My best model is the Gradient Boosting Classifier after hyperparameter tuning.