In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Task Details:** You're a marketing analyst, and the Chief Marketing Officer has told you that recent marketing campaigns have not been as effective as expected. You need to analyze the data set to understand this problem and propose data-driven solutions.

**Expected Submission:** Submit a well-documented notebook with these three sections:

**Section 01: Data Exploration.** Are there any null values or outliers? How will you wrangle/handle them? Are there any variables that warrant transformations? Are there any useful variables that you can engineer with the given data? Do you notice any patterns or anomalies in the data? Can you plot them?

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the marketing data set; index_col=0 makes the first CSV column the DataFrame index
# (presumably the customer ID, since later cells drop rows by index label -- verify against the file).
data=pd.read_csv('../input/marketing-data/marketing_data.csv',index_col=0)
# Preview the first rows to sanity-check the load.
data.head()

Let's check the column names:

In [None]:
# List the column labels exactly as read from the CSV (useful for spotting stray whitespace in names).
data.columns

In [None]:
# Number of distinct values per column.
data.nunique()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Engineer a single household-size feature: kids plus teenagers living at home.
data['Total_Children'] = data['Kidhome'].add(data['Teenhome'])

In [None]:
# The two source columns are now summarised by Total_Children, so remove them.
data = data.drop(columns=['Kidhome', 'Teenhome'])

In [None]:
# Preview the frame after replacing Kidhome/Teenhome with Total_Children.
data.head()

In [None]:
# Re-check distinct-value counts now that Total_Children has replaced Kidhome/Teenhome.
data.nunique()

### 1. Cleaning Data

Cleaning Income column:

In [None]:
# Inspect the dtype of the income column; note the column name itself has leading/trailing spaces.
data[' Income '].dtype

In [None]:
# Convert the text-formatted ' Income ' column into a float column named 'Income'.
# Each formatting character is removed as a *literal* (regex=False): '$' is a regex
# end-of-string anchor, and the default value of `regex` differs between pandas
# versions, so relying on the default is fragile and warns on pandas 1.x.
income_text = data[' Income ']
for symbol in (' ', '$', ','):
    income_text = income_text.str.replace(symbol, '', regex=False)
data['Income'] = income_text.astype(float)
# Remove the original column whose name carries surrounding spaces.
data = data.drop(columns=[' Income '])

In [None]:
# Count missing values per column before imputation.
data.isnull().sum()

Fill the null values in the `Income` column with the column mean:

In [None]:
# Impute missing incomes with the mean of the observed incomes.
income_mean = data['Income'].mean()
data['Income'] = data['Income'].fillna(income_mean)

In [None]:
# Confirm that no missing values remain after imputing Income.
data.isnull().sum()

In [None]:
# Parse the enrolment date column into datetime64 values.
# NOTE(review): no explicit `format=` is given, so pandas infers it -- confirm the inferred
# day/month order matches the raw strings.
data['Dt_Customer']=pd.to_datetime(data['Dt_Customer'])

### 2. Removing Outliers

Let's check whether any customers have a birth year earlier than 1910:

In [None]:
# Show the rows whose recorded birth year is before 1910 (implausibly old customers).
data.loc[data['Year_Birth']<1910]


In [None]:
# Remove the customers with a birth year before 1910 (the rows inspected in the previous cell).
# Filtering by condition instead of drop(index=[...], inplace=True) avoids hard-coded
# index labels and keeps the cell re-runnable: dropping labels that are already gone
# raises KeyError on a second execution.
# .copy() gives an independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
data = data.loc[~(data['Year_Birth'] < 1910)].copy()

In [None]:
# Income distribution per education level; used to spot income outliers visually.
sns.boxplot(x='Education',y='Income',data=data)

In [None]:
# Show the rows with an income above 600,000 (extreme outliers on the boxplot scale).
data.loc[data.Income>600000]

In [None]:
# Remove the extreme income outlier(s) above 600,000 shown in the previous cell.
# A condition-based filter replaces drop(index=9432, inplace=True): it needs no
# hard-coded index label and can be re-run safely (dropping a missing label raises KeyError).
# .copy() gives an independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
data = data.loc[~(data['Income'] > 600000)].copy()

In [None]:
# Re-draw the income-by-education boxplot after removing the extreme income outlier.
sns.boxplot(x='Education',y='Income',data=data)

In [None]:
# Inspect the remaining high-income rows (above 140,000); they are only displayed, not removed.
data.loc[data.Income>140000]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Side-by-side income boxplots: by education level (left) and by country (right).
fig, axis = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
left_ax, right_ax = axis
sns.boxplot(data=data, x='Education', y='Income', ax=left_ax)
sns.boxplot(data=data, x='Country', y='Income', ax=right_ax)

### 3. Feature Engineering

In [None]:
# Collapse the four per-channel purchase counts into one total, then drop the originals.
purchase_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
# skipna=False keeps the semantics of chained `+`: a missing value in any channel yields NaN.
data['Total_purchases'] = data[purchase_cols].sum(axis=1, skipna=False)
data = data.drop(columns=purchase_cols)

In [None]:
# Group food spending into two features: fruit ("veg") and meat + fish ("non-veg").
fruit_spend = data['MntFruits']
meat_and_fish_spend = data['MntMeatProducts'].add(data['MntFishProducts'])
data['veg_products'] = fruit_spend
data['nonveg_products'] = meat_and_fish_spend
# The source columns are now redundant.
data = data.drop(columns=['MntFruits', 'MntMeatProducts', 'MntFishProducts'])

In [None]:
# Preview the frame after the purchase and product feature engineering.
data.head()

In [None]:
# Mean monthly web visits per marital status (seaborn's default bar estimator is the mean).
sns.barplot(data=data, x='Marital_Status', y='NumWebVisitsMonth')

In [None]:
# Check column dtypes to see which columns still need encoding before modelling.
data.dtypes

In [None]:
#Import library:
from sklearn.preprocessing import LabelEncoder
# Replace each non-numeric column with integer codes.
# The same encoder object is re-fitted per column, so after the loop it only
# remembers the classes of the last column processed.
le = LabelEncoder()
var_mod = ['Education', 'Marital_Status', 'Country', 'Dt_Customer']
for column in var_mod:
    encoded = le.fit_transform(data[column])
    data[column] = encoded

In [None]:
# Confirm that the encoded columns are now numeric.
data.dtypes

In [None]:
# Separate the predictors from the binary target column.
target_col = 'Response'
X = data.drop(columns=[target_col])
y = data[target_col]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing, stratified on the target so both parts
# keep the same Response class balance.
# random_state is fixed so the split (and every downstream result) is reproducible
# under Restart & Run All; without it each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

### 4. Feature Selection

Mutual information (MI) between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Estimate the mutual information between each training feature and the target.
# The estimator adds small random noise to continuous features, so random_state is
# fixed to make the scores (and the ranking derived from them) reproducible.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
mutual_info

In [None]:
# Label each score with its feature name and list the features from most to least informative.
mutual_info = pd.Series(mutual_info, index=X_train.columns)
mutual_info.sort_values(ascending=False)

In [None]:
# Bar chart of the mutual-information score per feature, highest first.
ranked_info = mutual_info.sort_values(ascending=False)
ranked_info.plot.bar(figsize=(14, 7))
plt.show()

In [None]:
from sklearn.feature_selection import SelectPercentile
# Select the top 50% of features ranked by mutual information with the target
# (the previous comment said 40%, but the code has always used percentile=50).
# Note: despite its name, `sel_five_cols` selects by percentile, not a fixed count of five.
from functools import partial

# Seed the score function so the selected feature set is reproducible between runs.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)
sel_five_cols = SelectPercentile(seeded_mutual_info, percentile=50)
sel_five_cols.fit(X_train, y_train)
# Names of the features the selector keeps.
X_train.columns[sel_five_cols.get_support()]

In [None]:
# Apply the feature selection to both splits.
# The original cell called DataFrame.drop() without assigning the result, so X_train and
# X_test were left unchanged and the model was trained on every feature.
# The kept columns are taken from the fitted selector rather than a hand-copied list,
# so they always match the selection actually computed in this run; selecting by name
# also keeps the cell safe to re-run.
selected_features = list(sel_five_cols.get_feature_names_out())
X_train = X_train[selected_features]
X_test = X_test[selected_features]
X_train.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features to zero mean and unit variance.
sc = StandardScaler()
# Learn the scaling statistics from the training split only...
X_train = sc.fit_transform(X_train)
# ...and reuse them for the test split. Calling fit_transform here would re-fit the
# scaler on the test data, so the two splits would be scaled inconsistently and
# test-set statistics would leak into preprocessing.
X_test = sc.transform(X_test)

### 5. ANN training

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Feed-forward binary classifier: three ReLU hidden layers (15 -> 10 -> 5 units)
# followed by a single sigmoid output unit.
model = Sequential([
    # First hidden layer; input_dim ties the input width to the training matrix.
    Dense(15, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_uniform'),
    # Second hidden layer
    Dense(10, activation='relu', kernel_initializer='he_uniform'),
    # Third hidden layer
    Dense(5, activation='relu', kernel_initializer='he_uniform'),
    # Output layer: one sigmoid unit for the binary Response target
    Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform'),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (matches the sigmoid output),
# and accuracy tracked as the reported metric.
model.compile(optimizer='Adam',loss='binary_crossentropy',metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for 25 epochs with mini-batches of 10 and keep the per-epoch metrics in model_history.
# NOTE(review): the held-out test split is also passed as validation_data, so the val_* curves
# and the final test accuracy come from the same data.
model_history = model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=25,batch_size=10)

In [None]:
# Model outputs for the test split (sigmoid activations, i.e. values between 0 and 1).
pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Round the sigmoid outputs to hard 0/1 labels and score them against the true labels.
predicted_labels = pred.round()
accuracy_score(y_test, predicted_labels)

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(model_history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(model_history.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()