In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from collections import Counter
import itertools

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

### Supermarket Finance

Reading our 1st table data...

In [None]:
# Load the first table (50_SupermarketBranches.csv) and show its size and first rows.
branches_path = '/kaggle/input/marketing-data-for-a-supermarket-in-united-states/supermarket_marketing/50_SupermarketBranches.csv'
df = pd.read_csv(branches_path)

print(df.shape)
df.head()

Let's convert the numbers to thousands:

In [None]:
# Divide the spend and profit columns by 1000 so they are expressed in thousands.
# NOTE: df is modified in place, so running this cell a second time divides again.
scaled_cols = ['Advertisement Spend', 'Promotion Spend', 'Administration Spend', 'Profit']
df[scaled_cols] = df[scaled_cols] / 1000

According to this information, it would be interesting to compare how the costs relate to profit and to each other. 

The vertical axis corresponds to the Administration, the Promotion and the Advertising spends; the horizontal axis shows the profit.

In [None]:
# Scatter each spend category (y axis) against profit (x axis) on one figure,
# using a distinct colour and marker per category.
plt.figure(figsize=(12, 8))

profit = df.Profit.values
plt.scatter(profit, df['Administration Spend'].values, c='blue', label='Administration')
plt.scatter(profit, df['Promotion Spend'].values, c='red', label='Promotion', marker='P', s=64)
plt.scatter(profit, df['Advertisement Spend'].values, c='green', label='Advertisement', marker='d', s=64)

plt.xlabel('$Profit$', fontsize=14)
# Raw string: '\$' is not a valid Python escape sequence, so a plain literal
# triggers a DeprecationWarning/SyntaxWarning. The text passed to matplotlib
# is unchanged.
plt.ylabel(r'$k\$$', fontsize=14)

# Trailing ';' keeps the Legend repr out of the cell output.
plt.legend(fontsize=14);

As we can see, the Profit appears to be correlated with the Administration and the Advertisement spends; let's look at the numerical values of the correlations.

In [None]:
# Pairwise correlation matrix of the three spend columns and Profit, drawn as an
# annotated heatmap.
corr_cols = ['Advertisement Spend', 'Promotion Spend', 'Administration Spend', 'Profit']
corr_matrix = df[corr_cols].corr()
sns.heatmap(corr_matrix, annot=True)

Here we can see a strong correlation between the Advertisement spend and the Profit; there is also a noticeable correlation between the Administration spend and the Profit.

Let's see how correlations vary by state:

In [None]:
# For every state, compute the correlation matrix and keep only the correlations
# of Profit with each of the three spend columns; then draw them as grouped bars.
spend_cols = ['Advertisement Spend', 'Promotion Spend', 'Administration Spend']
profit_corr_by_state = (
    df.groupby('State')
    .corr()[['Profit']]
    .unstack(level=1)['Profit'][spend_cols]
)
profit_corr_by_state.plot(
    kind='bar',
    title="correlations of Profit with Spends by States",
    figsize=(15, 10),
    legend=True,
    fontsize=12,
)

We can see that in the state of New York, the Promotion has the best effect on the Profit, while in the other states the Promotion had little to no effect at all.

### Interaction with advertisements

Reading data...

In [None]:
# Load the ad-click table (Ads_CTR_Optimisation.csv) and show its size and first rows.
ads_path = '/kaggle/input/marketing-data-for-a-supermarket-in-united-states/supermarket_marketing/Ads_CTR_Optimisation.csv'
df2 = pd.read_csv(ads_path)

print(df2.shape)
df2.head()

Now let's see what the customers’ clicks look like. The click matrix is very tall, so its first 1500 rows are shown as 15 side-by-side panels of 100 rows each.

In [None]:
# Show the non-zero pattern of rows 0..1499 of the click matrix as 15 panels of
# 100 consecutive rows each, laid out left to right with a shared y axis.
_, axes = plt.subplots(1, 15, figsize=(15, 10), sharey=True)

clicks = df2.values
for panel, axis in enumerate(axes):
    first_row = panel * 100
    axis.spy(clicks[first_row:first_row + 100, :])

You can see that some people click a lot and some click a little. Moreover, it is clear that certain banner ads get more clicks than the others.

Let us try to divide these advertisement banners into groups. Let's see how the singular values of this matrix decrease (computed via the SVD).

In [None]:
# Plot the singular values of the click matrix.
# Only `s` is used, so request the reduced SVD: with the default
# full_matrices=True, `u` would be a square matrix with one row and one column
# per row of df2, which wastes memory and time. The singular values themselves
# are the same either way.
u, s, vh = np.linalg.svd(df2.values, full_matrices=False)
plt.plot(s)

We see that the singular values decrease without any pronounced gap, so it’s hard to identify any groups of banners.

Now we will consider the number of clicks on each specific banner ad.

In [None]:
# Bar chart of the column sums of df2 (total clicks per ad), with ads numbered 1..10.
ad_numbers = range(1, 11)
clicks_per_ad = df2.sum(axis=0)
plt.bar(ad_numbers, clicks_per_ad)

plt.xticks(ad_numbers)
plt.xlabel('Ad', fontsize=14)
plt.ylabel('count', fontsize=14)

It’s clear that some advertisements are relatively uninteresting to the customers (for example, Ad #6), while some other banners get a good click response (Ads #5 and #8).

Now we will calculate the distribution for number of clicks per user:

In [None]:
# Histogram of the row sums of df2 (clicks per user).
# Bin edges are -0.5, 0.5, ..., 5.5, i.e. one bin centred on each integer 0..5;
# row sums above 5 fall outside the bins and are not counted.
clicks_per_user = df2.sum(axis=1)
bin_edges = np.arange(7) - 0.5
counts, edges = np.histogram(clicks_per_user, bins=bin_edges)

# Shift the left edges by 0.5 so each bar sits on its integer click count.
plt.bar(edges[:-1] + 0.5, counts)

plt.xlabel('number of clicks', fontsize=14)
plt.ylabel('count', fontsize=14)

Most often, the users click on only one banner.

### Customers' baskets

In [None]:
# Read the basket file: each line becomes one basket, i.e. the list of
# comma-separated fields on that line after stripping surrounding whitespace.
baskets_path = '/kaggle/input/marketing-data-for-a-supermarket-in-united-states/supermarket_marketing/Market_Basket_Optimisation.csv'
with open(baskets_path, 'r') as basket_file:
    baskets = [line.strip().split(',') for line in basket_file]

How many products do customers buy at a time?

In [None]:
# Histogram of basket sizes with one bin centred on each integer 0..18;
# the x axis is cropped to (0.5, 17).
basket_sizes = list(map(len, baskets))
size_bins = np.arange(20) - 0.5
plt.hist(basket_sizes, bins=size_bins, label='basket size')

plt.legend(fontsize=12)
plt.xticks(np.arange(20))
plt.xlim(0.5, 17);

Most often people buy one thing.

What products are bought most often:

In [None]:
# Count every product occurrence across all baskets and show the ten most frequent.
Counter(
    product
    for basket in baskets
    for product in basket
).most_common(10)

Products that are most often bought together:

In [None]:
# Count unordered product pairs that occur in the same basket and show the ten
# most frequent. Each basket is de-duplicated and sorted before pairing so that
# the same two products always produce the same key regardless of the order in
# which they were listed ("a| b" is never also counted as "b| a"), and a product
# repeated within one basket is not paired with itself.
Counter(
    '| '.join(pair)
    for basket in baskets
    for pair in itertools.combinations(sorted(set(basket)), 2)
).most_common(10)

### Customer information

In [None]:
# Load the customer table (Supermarket_CustomerMembers.csv), indexed by CustomerID,
# and show its size and first rows.
members_path = '/kaggle/input/marketing-data-for-a-supermarket-in-united-states/supermarket_marketing/Supermarket_CustomerMembers.csv'
df3 = pd.read_csv(members_path, index_col='CustomerID')

print(df3.shape)
df3.head()

Now let us consider how the Spending Score and the Annual Income correlate to each other.

In [None]:
# Scatter of spending score (y axis) against annual income (x axis), one point per customer.
annual_income = df3['Annual Income (k$)'].values
spending_score = df3['Spending Score (1-100)']

plt.figure(figsize=(10, 7))
plt.scatter(annual_income, spending_score)

plt.xlabel('Annual Income', fontsize=14)
plt.ylabel('Spending Score', fontsize=14)

We can see an unusual artifact in the data – the points are strangely scattered around the chart, forming rectangular regions. Spending scores between 40 and 60 occur only for incomes from 40 to ~65. Also, there is a strange vertical streak at income = 54.

These facts can make one doubt the correctness of these data (considering the other dependencies with them).

It is also worth noting that a much deeper analysis could have been carried out if there were any links between the tables.
For example, if there was any information about which banner a specific _user_i_ clicked on and which products in which supermarket a _user_i_ bought. Also one could provide socio-demographic information about a specific _user_i_. It is worth adding that the sizes of these tables are quite small. These factors limit the variability and the accuracy of conclusions on these data.

---