In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

Task Details
You're a marketing analyst and you've been told by the Chief Marketing Officer that recent marketing campaigns have not been as effective as they were expected to be. You need to analyze the data set to understand this problem and propose data-driven solutions.

Expected Submission
Submit a well documented notebook with these three sections:

Section 01: Data Exploration
Are there any null values or outliers? How will you wrangle/handle them?
Are there any variables that warrant transformations?
Are there any useful variables that you can engineer with the given data?
Do you notice any patterns or anomalies in the data? Can you plot them?


First things first! Let's import our data and take a quick look at it:

In [None]:
# Show up to 30 columns in DataFrame previews.
pd.set_option('display.max_columns', 30)

# Load the raw marketing data and preview the first rows.
data = pd.read_csv('../input/marketing-data/marketing_data.csv')
data.head()

Are there any null values or outliers? 

In [None]:
# Count the missing values in every column.
data.isnull().sum()

As can be seen, the " Income " column has some null values. Let's take a closer look at those rows:

In [None]:
# Show the rows whose income is missing.
data[data[' Income '].isnull()]

Sometimes a NaN value means that the feature simply does not apply. For instance, when someone does not have any income, NaN may have been recorded, in which case NaN effectively means zero (no income); or, when a house does not have a pool, we may see NaN for the pool record, which means "no pool". In this case, however, the NaN values appear to be genuinely missing data: they do not mean "there is no income" or "income is zero".

Anyway, let's check what percentage of the " Income " column is missing:

In [None]:
# Percentage of rows with a missing income (mean of the boolean mask, scaled to 100).
data[' Income '].isna().mean() * 100

Only about one percent of the rows have a null income, so we could simply drop them. However, before dropping the null values for good, let's check the relationship between the other features and income; we may find a way to estimate the missing values. To do that, we first save the rows without NaN values in a clean_data variable.

# Method1: Deleting the NaN values

We do not want to change the original data, so we copy the rows without NaN values into a new dataset named clean_data.

In [None]:
# Work on an explicit copy of the rows without missing values, so that columns
# added to or dropped from `clean_data` later never touch (or warn about) `data`.
clean_data = data.dropna().copy()

First we need to clean the Income column. The values contain "$" and "," characters, which must be removed before they can be converted to numbers.

In [None]:
# Turn income text such as "$84,835.00" into floats.
# `regex=False` is explicit because '$' is a regex end-of-string anchor and the
# default of `regex` differs between pandas versions; we always want a literal match.
clean_income = (
    clean_data[' Income ']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype('float')
)


Now, we are going to replace " Income " column with "clean_income"

In [None]:
# Attach the numeric income as a new column. `assign` returns a new frame, which
# avoids writing in place into a frame that was derived from `data`
# (the source of pandas' SettingWithCopyWarning).
clean_data = clean_data.assign(clean_income=clean_income)

In [None]:
# Remove the raw text income column now that `clean_income` replaces it.
# Rebinding (instead of `inplace=True`) plus `errors='ignore'` keeps the cell
# safe to re-run after the column has already been dropped.
clean_data = clean_data.drop(columns=[' Income '], errors='ignore')

In [None]:
# Preview the first ten cleaned rows.
clean_data.iloc[:10]

Let's check which features affect Income the most:

In [None]:
# `newd` is kept defined in case later cells reference it.
newd = data.dropna()

# Income distribution per education level. Both axes are read from the same
# frame so the rows are guaranteed to line up.
sns.boxplot(x='Education', y='clean_income', data=clean_data)

Wow, it seems someone earns more than 600000 after graduation. Good for them :). Let's see who this lucky person is :)

In [None]:
# Show the record(s) with an income above 600000.
clean_data[clean_data['clean_income'].gt(600000)]

Since this is just one record and its income is far from typical for a graduate, we can delete it as an outlier.

In [None]:
# Drop the income outlier found above. The rows are selected by the same
# condition used to find them rather than by a hard-coded index label, so the
# cell is safe to re-run and still works if the row order changes.
clean_data = clean_data.drop(
    index=clean_data.index[clean_data['clean_income'] > 600000]
)

In [None]:
# Income by education level (left) and by country (right).
# Both axes are read from `clean_data`, so the plot no longer mixes two frames
# with different row counts (the outlier was removed from `clean_data` only).
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(x='Education', y='clean_income', data=clean_data, ax=axs[0])
sns.boxplot(x='Country', y='clean_income', data=clean_data, ax=axs[1])

So, it seems that the country does not have much effect on income. On the other hand, as expected, people with a PhD earn more on average than those with only Basic education! One way to replace the NaN values is therefore to base them on education. For example, if someone's education is "Graduation", we can replace the missing income with the average income for that group, which is about 50000.

# Method 2: Use average values, based on "Education" columns

We found a better way to deal with the NaN values, so let's replace them with the average income for each Education level:

In [None]:
# Approximate income per education level, used to fill in missing incomes.
income_by_education = {
    'PhD': 58000,
    'Graduation': 50000,
    'Master': 55000,
    '2n Cycle': 48000,
}

missing_income = data[' Income '].isna()
for level, income in income_by_education.items():
    # One `.loc[rows, column]` assignment writes into `data` itself; the chained
    # form `data[col].loc[rows] = ...` may write to a temporary copy instead.
    # The value is stored in the column's own "$58,000.00" text format so the
    # column stays all-text and the `.str`-based cleaning step parses it too.
    data.loc[missing_income & (data.Education == level), ' Income '] = f'${income:,.2f}'


In [None]:
# Re-count the missing values per column after the imputation.
data.isnull().sum()

Now do the same cleaning procedure on "Income" column: 

In [None]:
# Convert the income text to floats on the full dataset.
# Cast to `str` first: `.str` methods return NaN for non-string cells, so any
# numeric values written into the column earlier would otherwise be lost.
# Missing values become the text 'nan', which parses back to NaN.
# `regex=False` makes '$' a literal character instead of a regex anchor.
clean_income = (
    data[' Income ']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype('float')
)

# Append the numeric column and remove the raw text column without mutating in place.
data = data.assign(clean_income=clean_income).drop(columns=[' Income '])

#  Dealing with Categorical features

In [None]:
# List the dtype of every column in `data`.
data.dtypes

In [None]:
# Display the four text-valued columns side by side.
categorical_preview = ['Education', 'Marital_Status', 'Dt_Customer', 'Country']
data.loc[:, categorical_preview]

Now, let's convert these categorical columns into numeric (one-hot encoded) features:

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to replace with one-hot indicator columns.
Cat_c = ['Education', 'Marital_Status', 'Country']

encoded_frames = []
for column in Cat_c:
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(data[[column]]).toarray()
    # `categories_[0]` holds the category labels in the same order as the
    # encoded output columns, so no manual reconstruction is needed.
    # Reusing `data.index` keeps the rows aligned in the concat below even if
    # `data` does not have a default 0..n-1 index.
    encoded_frames.append(
        pd.DataFrame(encoded, columns=encoder.categories_[0], index=data.index)
    )

# Drop the original text columns and append the indicator columns in one step
# (Education first, then Marital_Status, then Country).
data = pd.concat([data.drop(columns=Cat_c), *encoded_frames], axis=1)

In [None]:
# Raise the column display limit to 50 and preview the first ten rows.
pd.options.display.max_columns = 50
data.iloc[:10]

Now we are going to convert the Dt_Customer column to a datetime type that pandas can work with:

In [None]:
# Add the parsed dates as `Dt_Customer_n`, then remove the original text column.
data = (
    data
    .assign(Dt_Customer_n=pd.to_datetime(data['Dt_Customer']))
    .drop(columns=['Dt_Customer'])
)

Our cleaned data is:

In [None]:
# Use the full option name: 'display.max_column' only worked because pandas
# treats option keys as patterns and it happened to match a single option.
pd.set_option('display.max_columns', 50)
data.head()

In [None]:
# Show the record(s) with an income above 600000 in the full dataset.
data.query('clean_income > 600000')

In [None]:
# Drop the income outlier shown above, selected by condition rather than by a
# hard-coded index label so the cell is safe to re-run.
# Rows with a missing income compare as False and are therefore kept.
data = data.drop(index=data.index[data['clean_income'] > 600000])

Income and Year_Birth are both fairly normally distributed:

In [None]:
# Side-by-side distributions of income (left) and birth year (right).
# NOTE(review): `distplot` is deprecated in recent seaborn releases in favour of
# `histplot` / `displot`; consider migrating when the environment is upgraded.
fig, (income_ax, birth_ax) = plt.subplots(1, 2)
sns.distplot(data['clean_income'], ax=income_ax)
sns.distplot(data['Year_Birth'], ax=birth_ax)

Do we really have Year_Birth values before 1910?! Wow.

In [None]:
# Show the customers whose recorded birth year is before 1910.
data[data['Year_Birth'].lt(1910)]

In [None]:
# Drop the implausible birth years listed by the previous filter. Selecting by
# condition instead of hard-coded index labels keeps the cell safe to re-run
# and independent of the row order.
data = data.drop(index=data.index[data['Year_Birth'] < 1910])

In [None]:
# Spending per product category against income, one panel per category
# (filled row by row in a 2x3 grid).
spend_columns = [
    'MntMeatProducts', 'MntWines', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds', 'MntFruits',
]
fid, axs = plt.subplots(2, 3, figsize=(14, 14))
for panel, spend_column in zip(axs.flat, spend_columns):
    sns.scatterplot(x='clean_income', y=spend_column, data=data, ax=panel)

As can be seen from the figures, the amount spent on each product category increases with income, roughly exponentially. There are a few obvious outliers, which must be removed.

In [None]:
# Remove every row whose income exceeds 150000.
high_income_rows = data.index[data['clean_income'].gt(150000)]
data = data.drop(index=high_income_rows)

In [None]:
# Redraw the spending-vs-income panels on the current `data`.
# Each entry is (row, column, product column) in the 2x3 grid.
panel_layout = [
    (0, 0, 'MntMeatProducts'),
    (0, 1, 'MntWines'),
    (0, 2, 'MntFishProducts'),
    (1, 0, 'MntSweetProducts'),
    (1, 1, 'MntGoldProds'),
    (1, 2, 'MntFruits'),
]
fid, axs = plt.subplots(2, 3, figsize=(14, 14))
for row, col_pos, spend_column in panel_layout:
    sns.scatterplot(x='clean_income', y=spend_column, data=data, ax=axs[row, col_pos])

In [None]:
# Income distribution by number of kids (left) and teenagers (right) at home.
fig, axs = plt.subplots(1, 2, figsize=(14, 7))
for panel, household_column in zip(axs, ('Kidhome', 'Teenhome')):
    sns.boxplot(x=household_column, y='clean_income', data=data, ax=panel)


It seems that families with no kids at home have a higher average income than families with one or two kids. On the other hand, families with no teenagers at home have a lower average income than families with one or two teenagers.