In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis

In [None]:
# Load the marketing dataset from the read-only input directory
df = pd.read_csv('../input/marketing-data/marketing_data.csv')

# Remove spaces from the column names so every field can be addressed by a clean name
df.columns = df.columns.str.replace(' ', '', regex=False)

# Convert 'Income' from formatted text to float.
# regex=False makes '$' a literal dollar sign on every pandas version; with the
# regex default left implicit it can be read as the end-of-string anchor, in which
# case the symbol is never stripped and astype('float') fails.
df['Income'] = (
    df['Income']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('float')
)

In [None]:
# List the columns that contain at least one missing value
df.columns[df.isna().any()]

In [None]:
# Plot the income distribution.
# displot is a figure-level function that builds its own figure, so the size is
# set through height/aspect (4 in tall, 8 in wide). A separate plt.figure(...)
# call would only leave an empty extra figure in the cell output.
sns.displot(df.Income, height=4, aspect=2)

In [None]:
# Fill missing income values with the median income.
# The result is assigned back to the column: fillna(..., inplace=True) on the
# Series returned by df.Income is chained assignment, which newer pandas warns
# about and which does not update df once Copy-on-Write is enabled.
df['Income'] = df['Income'].fillna(df['Income'].median())

In [None]:
# Build a numeric-only frame by leaving out every object-typed column
object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
df_numeric = df.drop(columns=object_columns)

# Draw one box plot per remaining field on a 6x4 grid to spot outliers
# that may come from data entry errors
df_numeric.plot.box(subplots=True, layout=(6, 4), figsize=(14, 25))
plt.subplots_adjust(wspace=0.5)

In [None]:
# A few birth years sit at or before 1900, yet the oldest person alive was
# born in 1903, so these rows are treated as data entry errors and removed
implausible_births = df.index[df['Year_Birth'] <= 1900]
df = df.drop(index=implausible_births)

In [None]:
# Box plot of the birth years on a narrow 3x4 inch figure
fig, ax = plt.subplots(figsize=(3, 4))
df['Year_Birth'].plot(kind='box', ax=ax)

In [None]:
# Inspect the current dtype of the Dt_Customer column
df['Dt_Customer'].dtype

In [None]:
# Parse Dt_Customer into datetime values so that .dt accessors can be used on it
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

# Creating additional variables
* TotalAmountSpent - Total amount spent by the customer (sum of all Mnt_ variables)
* TotalNumPurchases - Total number of purchases by the customer (sum of all Num_Purchases variables)
* YearCustomer - The year when the customer joined (extracted from Dt_Customer)
* Dependents - Sum of kids and teens at home

In [None]:
# Total amount spent per customer: the sum of every Mnt* spending column.
# Columns are selected by name rather than by position (9:15 for the inputs,
# 28 for the new column), so the cell stays correct if columns are added or
# reordered and can be re-run safely.
# NOTE(review): assumes positions 9-14 held exactly the Mnt* fields — confirm.
mnt_cols = [col for col in df.columns if col.startswith('Mnt')]
df['TotalAmountSpent'] = df[mnt_cols].sum(axis=1)

# Show the inputs next to the new total as a sanity check
df[mnt_cols + ['TotalAmountSpent']].head(10)

In [None]:
# Total number of purchases per customer: the sum of every Num*Purchases column.
# Columns are selected by name instead of the positional slice 16:20.
# NOTE(review): in the usual column order of this dataset, positions 16-19 appear
# to take in NumWebVisitsMonth (visits, not purchases) and leave out
# NumDealsPurchases, so this changes the totals — confirm against df.columns
# that Num*Purchases is the intended set.
purchase_cols = [col for col in df.columns
                 if col.startswith('Num') and col.endswith('Purchases')]
df['TotalNumPurchases'] = df[purchase_cols].sum(axis=1)

# Show the inputs next to the new total as a sanity check
df[purchase_cols + ['TotalNumPurchases']].head(10)

In [None]:
# Year in which the customer joined, taken from the Dt_Customer dates
df['YearCustomer'] = df['Dt_Customer'].dt.year

# Number of dependents: kids plus teens at home
df['Dependents'] = df[['Kidhome', 'Teenhome']].sum(axis=1)


In [None]:
# Correlation matrix of every numeric feature except the ID column.
# Kendall's rank correlation (non-parametric) is used because some features are binary.
features = df.drop(columns='ID')
numeric_features = features.select_dtypes(include=np.number)
corrs = numeric_features.corr(method='kendall')

# Clustered heatmap of the correlations, with the colour scale centred on zero
sns.clustermap(corrs, center=0, cmap='coolwarm', cbar_pos=(-0.05, 0.8, 0.05, 0.18))

In [None]:
# Linear fit of total amount spent against income, limited to incomes below 100,000
below_100k = df[df['Income'] < 100000]
sns.lmplot(data=below_100k, x='Income', y='TotalAmountSpent')

The relationship between amount spent and income does not look linear. Applying a square-root transformation to the amount spent should make the relationship approximately linear, which linear regression requires.

In [None]:
# Square-root transform of the total amount spent.
# np.sqrt works on the whole column at once instead of calling math.sqrt row by
# row through apply. A negative input would produce NaN rather than raising,
# which is not expected for a sum of spending amounts.
df['rootTotalMnt'] = np.sqrt(df['TotalAmountSpent'])

# Linear fit of the transformed spend against income, limited to incomes below 100,000
sns.lmplot(y='rootTotalMnt', x='Income', data=df[df.Income < 100000])

In [None]:
# Bar plot of total amount spent, grouped by the number of kids at home
ax = sns.barplot(data=df, x='Kidhome', y='TotalAmountSpent')
ax.set_title('Total amount spent by number of kids at home')
ax.set_ylabel('Total amount spent')
ax.set_xlabel('# of kids at home')

In [None]:
# Bar plot of purchases made with deals, grouped by the number of dependents
ax = sns.barplot(data=df, x='Dependents', y='NumDealsPurchases')
ax.set_title('Number of purchases with deals by number of dependents')
ax.set_ylabel('# of deals purchases')

In [None]:
# Sum each of the five columns at positions 20-24 and keep the result as a
# two-column frame: 'index' (source column name) and 'amount' (column total).
# NOTE(review): presumably these are the campaign-acceptance flags — confirm
# against df.columns.
campaigns = df.iloc[:, 20:25].sum().rename('amount').reset_index()

# Plot the totals on an explicit 8x4 inch figure. The stray `campaigns.columns`
# expression was removed because its value was never displayed or used.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x='index', y='amount', data=campaigns, ax=ax)
ax.set_xlabel('campaign')  # replaces the uninformative default label 'index'