In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Raise pandas' display limits so wide and long frames are rendered in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

from pandas_profiling import ProfileReport
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Load the marketing data set that the rest of the notebook works on.
data = pd.read_csv('/kaggle/input/marketing-data/marketing_data.csv')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Section 01: *Exploratory Data Analysis*


1. Are there any null values or outliers? How will you wrangle/handle them?
1. Are there any variables that warrant transformations?
1. Are there any useful variables that you can engineer with the given data?
1. Do you notice any patterns or anomalies in the data? Can you plot them?


In [None]:
# Preview the first ten rows to see the columns and their raw formats.
data.head(10)

In [None]:
# Inspect the frame before cleaning: dimensions first, then the column labels.
print(data.shape)
print(data.columns)
# The income column is stored under the space-padded label ' Income '; give it
# a clean name. Reassigning (instead of inplace=True) keeps the step chainable
# and avoids mutating the frame object in place.
data = data.rename(columns={' Income ': 'Income'})
print(data.dtypes)

In [None]:
#Converting Column to Correct/usable datatypes
# Income is text at this point: strip every character that is not a digit,
# a sign or a decimal point, then cast to float.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text untouched
# and make the float cast fail.
data['Income'] = (
    data['Income']
    .str.replace(r'[^-+\d.]', '', regex=True)
    .astype('float')
    .round(2)
)
# NOTE(review): dayfirst=True assumes day-before-month date strings — confirm
# against the raw CSV, otherwise ambiguous dates are parsed with day/month swapped.
data['Dt_Customer'] = pd.to_datetime(data['Dt_Customer'], dayfirst=True, yearfirst=False)

1. *Are there any null values or outliers? How will you wrangle/handle them?*

I will break this question into two parts:
1. Are there any null values? How will you wrangle/handle them?
2. Are there any outliers? How will you wrangle/handle them?
       
# 1.Are there any null Values? How will you wrangle/handle them?

In [None]:
# Count the missing values in every column and show them as a bar chart.
missing_per_column = data.isna().sum()

fig = go.Figure(
    data=[
        go.Bar(
            x=missing_per_column.index,
            y=missing_per_column.values,
        )
    ]
)
fig.show()

We found that the Income column has 24 missing values. Because income is likely to be one of the important features for our analysis, we should not drop it. Instead, we will impute the missing values with the median income of customers who share the same educational background.

In [None]:
# Fill missing incomes with the median income of the customer's education group.
# The built-in "median" transform plus fillna avoids running a Python lambda
# per group and only ever writes to rows whose income is missing.
education_median_income = data.groupby("Education")["Income"].transform("median")
data["Income"] = data["Income"].fillna(education_median_income)

# 1. 2. Are there any outliers Values? How will you wrangle/handle them?

In [None]:
# Summary statistics per numeric column; used below to spot suspicious min/max values.
data.describe()

Based on the above summary, we get a good picture of each feature's distribution. Most columns have few or no outliers, which we can judge by comparing the min with the 25th percentile and the max with the 75th percentile (the closer, the better). We will still investigate the following:
1. Year_Birth
1. Income
1. Recency
1. All the products (columns starting with Mnt)

# Year_Birth:

Year_Birth seems to have a few outliers: it is very unlikely to have customers born in 1893 or 1899. We will therefore keep only customers born in 1940 or later for further analysis.

In [None]:
# Show the five earliest birth years together with how many customers report each.
# rename_axis / reset_index(name=...) pin the column labels explicitly: the labels
# produced by value_counts().to_frame().reset_index() changed in pandas 2.0 (there
# is no 'index' column any more), which made sort_values(by='index') raise KeyError.
birth_year_counts = (
    data['Year_Birth']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Year_Birth')
    .sort_values(by='index')
)
print(birth_year_counts.head(5))

# Keep only customers born in 1940 or later.
data = data.loc[data['Year_Birth'] >= 1940]

# Income:

Max Income seems too far from the 75th percentile of the income distribution

In [None]:
# Box plot of income per education level, to make extreme incomes visible.
income_box = go.Box(
    x=data['Education'],
    y=data['Income'],
    name='Income',
)

fig = go.Figure(data=[income_box])
fig.update_layout(
    title='Income By Educational Qualifications',
    yaxis_title='Income In USD',
    boxmode='group',
)
fig.show()

Using the above plot we can clearly see outliers in two income ranges:

1. Approx. \$150K: Ideally we should not treat these as outliers, as people with a PhD, a Master's degree or even a Graduation-level education do earn such high incomes.


2. Approx. \$666K: This data point is an actual outlier, as we do not usually see such a high income in the real world; the figure \$666,666 also looks suspicious. We will therefore exclude this data point from our further analysis.




In [None]:
# Keep only rows whose income is below the suspicious 666,666 value.
income_is_plausible = data['Income'] < 666666.000000
data = data.loc[income_is_plausible]
print(data.shape)

# Quantity Of Products Purchased:

In [None]:
# One box per column whose name starts with 'Mnt', all drawn in a single figure.
mnt_columns = [label for label in data.columns if label.startswith('Mnt')]

fig = go.Figure()
for mnt_column in mnt_columns:
    fig.add_trace(go.Box(y=data[mnt_column], name=mnt_column))

fig.update_layout(
    title='Quantity By Products Bought',
    yaxis_title='Quantity',
    boxmode='group',
)

fig.show()

We can see a large number of outliers for MntMeatProducts. We can use [Benford's law](https://en.wikipedia.org/wiki/Benford%27s_law) to check whether there are any anomalies in the data.

In [None]:
import collections


# Expected share of each leading digit under Benford's law; index 0 is a
# placeholder so that BENFORD_PERCENTAGES[d] is the share for digit d (1-9).
BENFORD_PERCENTAGES = [0, 0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046]

def calculate(data):

    """
    Compare the leading characters of the values in ``data`` with the
    Benford distribution, plot both, and return the comparison table.

    Parameters
    ----------
    data : iterable with a length (e.g. a pandas Series or a list)
        The values to analyse. The leading character of ``str(value)`` is
        taken as the value's first digit, so values whose text starts with
        '0', '-' or '.' match none of the digits 1-9 but still count towards
        the total.

    Returns
    -------
    pandas.DataFrame
        One row per digit 1-9 with the columns
        ``n``, ``data_frequency``, ``data_frequency_percent``,
        ``benford_frequency``, ``benford_frequency_percent``,
        ``difference_frequency`` and ``difference_frequency_percent``.
        The ``*_percent`` columns hold fractions in the range 0-1; for empty
        input the observed fractions are reported as 0.0.

    Side effects
    ------------
    Shows a grouped bar chart of observed vs. Benford percentages.
    """

    total = len(data)
    first_digit_frequencies = collections.Counter(str(value)[0] for value in data)

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for n in range(1, 10):
        observed_count = first_digit_frequencies[str(n)]
        observed_share = observed_count / total if total else 0.0
        expected_share = BENFORD_PERCENTAGES[n]
        expected_count = total * expected_share
        records.append({
            'n': n,
            'data_frequency': observed_count,
            'data_frequency_percent': observed_share,
            'benford_frequency': expected_count,
            'benford_frequency_percent': expected_share,
            'difference_frequency': observed_count - expected_count,
            # Difference of the two shares; the previous expression reduced
            # algebraically to the expected count, not a percentage difference.
            'difference_frequency_percent': observed_share - expected_share,
        })

    benford_law = pd.DataFrame(
        records,
        columns=['n','data_frequency','data_frequency_percent','benford_frequency','benford_frequency_percent','difference_frequency','difference_frequency_percent'],
    )

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x = benford_law['n'],
        y = benford_law['data_frequency_percent']*100,
        name = 'Actual Data'))

    fig.add_trace(go.Bar(
        x = benford_law['n'],
        y = benford_law['benford_frequency_percent']*100,
        name = 'Benford Frequency Percentage'))

    fig.show()
    return benford_law

In [None]:
# Run the Benford comparison on MntMeatProducts; calculate() shows the chart
# and the returned table is displayed as the cell output.
res = calculate(data['MntMeatProducts'])
res

Based on the above plot, it is clearly visible that the *MntMeatProducts* data follows Benford's law and doesn't show any anomalies.

# Recency

In [None]:

# Count how often each Recency value occurs, ordered by the Recency value itself.
# Taking x and y from the same Series keeps every bar paired with its own label:
# value_counts() is ordered by count while unique() is ordered by first
# appearance, so combining the two put the counts on the wrong x positions.
recency_counts = data['Recency'].value_counts().sort_index()

fig = make_subplots(rows=1, cols=2)

# Left panel: spread of Recency as a box plot.
fig.add_trace(
    go.Box(
        y=data['Recency'],
        name='Recency Boxplot',
    ),
    row=1, col=1,
)

# Right panel: number of customers per Recency value.
fig.add_trace(
    go.Bar(
        x=recency_counts.index,
        y=recency_counts.values,
        name='Recency Distribution',
    ),
    row=1, col=2,
)

fig.show()