

# Black Friday - DataViz


# Sumário
1. <a href="#Introducao">Introdução</a>
2. <a href="#Exploratory Data Analysis (EDA)"> Análise dos Dados</a>
    1. <a href="#Gender"> Gênero</a>
    2. <a href="#Top Sellers"> Top Vendedores</a>
    3. <a href="#Age"> Idade</a>
    4. <a href="#City"> Cidade</a>
    5. <a href="#Stay in Current City"> Permanece na cidade</a>
    6. <a href="#Purchase"> Compra</a>
    7. <a href="#Marital Status"> Estado Civil</a>
    8. <a href="#Top Shoppers"> Principais Compradores</a>
    9. <a href="#Occupation"> Ocupação</a>
3. <a href="#Conclusion"> Conclusão</a>

<a id="Introducao"></a>
# Introdução
Apresentação do Dataset

<a id="Exploratory Data Analysis (EDA)"></a>
# Análise do DataSet


Vamos importar as bibliotecas que iremos utilizar e o dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import plotly.plotly as py
import plotly.graph_objs as go
import seaborn as sns
from collections import Counter
from math import log


In [None]:
# Read the Black Friday transactions exactly as they come in the CSV.
data_polluted = pd.read_csv('../input/blackfriday/BlackFriday.csv')

# Keep only the rows that carry a purchase value; the analysis cells below read from data_raw.
data_raw = data_polluted[data_polluted['Purchase'].notna()]

### Funções:
Seguem algumas funções simples para facilitar a análise do dataset

In [None]:

##############################################################
### Aplica a regra de Sturges para definição de bins
def bins_sturges(df):
    """Return the number of histogram bins given by Sturges' rule.

    Sturges' rule is ``k = 1 + log2(n)``, where ``n`` is the number of
    observations.

    Parameters
    ----------
    df : sized collection (pandas Series/DataFrame, list, ndarray, ...)
        The observations. For a DataFrame each row is one observation.

    Returns
    -------
    int
        The bin count, rounded to the nearest integer.

    Raises
    ------
    ValueError
        If ``df`` contains no observations.
    """
    # len() counts rows. ``df.size`` would count rows * columns for a
    # DataFrame, which inflates n and therefore the number of bins.
    n = len(df)
    if n < 1:
        raise ValueError("bins_sturges needs at least one observation")
    return round(1 + log(n) / log(2))

##############################################################

def pizza(labels, sizes):
    """Draw a pie chart with one slice per label and show it.

    Parameters
    ----------
    labels : sequence
        Text shown next to each slice.
    sizes : sequence of numbers
        Slice sizes, in the same order as ``labels``; each slice is annotated
        with its percentage of the total.
    """
    fig, ax = plt.subplots()
    ax.pie(sizes, labels=labels, shadow=True, autopct='%1.1f%%', startangle=90)
    plt.show()
    # Close the figure created above by handle. Calling clf()/cla()/close()
    # with no argument acts on whatever the current figure is at that moment,
    # which can be a freshly created empty one once show() has returned.
    plt.close(fig)
    
##############################################################
## Agrupar e contar todos os grupos de um data frame(df) de uma coluna(column) 
def df_group(df, column):
    """Count how many rows of ``df`` fall in each distinct value of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. Only this frame is read, so the result no
        longer depends on any notebook-level variable.
    column : str
        Name of the column whose values define the groups.

    Returns
    -------
    (list, list)
        ``(names, counts)``: the distinct values of ``column`` in ascending
        order, and the number of rows holding each value. Missing values are
        not counted as a group.
    """
    # value_counts() counts rows per value; sort_index() orders the groups by
    # their label so the output order is stable and independent of frequency.
    per_group = df[column].value_counts().sort_index()
    return per_group.index.tolist(), per_group.tolist()
 
##############################################################
##Histograma de uma coluna de dataset
def df_hist(df, column):
    """Plot a bar chart of how often each value of ``column`` occurs in ``df``."""
    # Sorting first makes the Counter (and so the bars) follow ascending value order.
    ordered_values = sorted(df[column])
    frequency = Counter(ordered_values)
    frequency_frame = pd.DataFrame.from_dict(frequency, orient='index')
    frequency_frame.plot(kind='bar')
    
    
##############################################################
##Histograma baseado em valores

def df_hist_sturges_purchase(df, common_group, raw_column, product_id):
    """Histogram of the purchase values of one product within one group.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions; must contain ``raw_column``, ``'Product_ID'`` and
        ``'Purchase'``.
    common_group : scalar
        Value of ``raw_column`` that selects the group of rows to look at.
    raw_column : str
        Column used to select the group.
    product_id : scalar
        Value of ``'Product_ID'`` whose purchases are plotted.
    """
    group_rows = df.loc[df[raw_column] == common_group]
    purchases = group_rows.loc[group_rows['Product_ID'] == product_id, 'Purchase']
    # Size the bins from the values that are actually plotted, not from the
    # whole group frame, so the bin count reflects the plotted sample.
    k = bins_sturges(purchases)
    purchases.hist(bins=k)
    

### Relação de clientes por gênero

In [None]:
# Gender labels in sorted order ('F' before 'M') and the group sizes reported by df_group.
labels, counts = df_group(data_raw, 'Gender')

# Per-gender transaction frames; later cells reuse both.
data_gender_F = data_raw[data_raw['Gender'] == 'F']
data_gender_M = data_raw[data_raw['Gender'] == 'M']

# Distinct customers per gender: a user counts once however many rows they have.
fem_list = data_gender_F['User_ID'].unique()
male_list = data_gender_M['User_ID'].unique()

# Sizes are listed female-first so they line up with the sorted labels.
pizza(labels, [fem_list.size, male_list.size])

Comparação de dados entre gêneros:

 ### Valor médio de compra por gênero

In [None]:
# Purchase distribution per gender, female first, labelled with `labels` from the gender cell above.
plt.boxplot(
    [data_gender_F['Purchase'].to_numpy(), data_gender_M['Purchase'].to_numpy()],
    labels=labels,
)

O box plot mostra o comportamento de gasto dos gêneros e, a princípio, os valores esperados em cada item são muito similares.

In [None]:
# Collapse to one row per (customer, marital status) pair so each customer is counted once.
marital_per_user = data_raw[['User_ID', 'Marital_Status']].drop_duplicates()
labels, counts = df_group(marital_per_user, 'Marital_Status')
pizza(labels, counts)

A maioria dos clientes tem o estado civil "0", o que muito provavelmente significa solteiro. Para entender melhor esse dado, vamos separá-lo entre os dois gêneros e compará-los.

In [None]:
# Marital-status split computed separately for male and female customers (one row per customer).
labels, counts = df_group(
    data_gender_M[['User_ID', 'Marital_Status']].drop_duplicates(), 'Marital_Status')
labels2, counts2 = df_group(
    data_gender_F[['User_ID', 'Marital_Status']].drop_duplicates(), 'Marital_Status')

# Two pies side by side: men on the left, women on the right.
fig, axs = plt.subplots(1, 2)
fig.tight_layout(pad=5.0)
panels = (
    (axs[0], counts, labels, 'Homens'),
    (axs[1], counts2, labels2, 'Mulheres'),
)
for axis, slice_sizes, slice_labels, title in panels:
    axis.pie(slice_sizes, labels=slice_labels, shadow=True, radius=2,
             autopct='%1.1f%%', startangle=90)
for axis, slice_sizes, slice_labels, title in panels:
    axis.set_title(label=title)
plt.show()
plt.clf()
plt.cla()
plt.close()

In [None]:
# Purchase distribution per gender, with one box per city category inside each gender.
sns.boxplot(data=data_raw, x='Gender', y='Purchase', hue='City_Category')

In [None]:
# One transaction frame per age bracket; the age box plot further down reads all seven.
data_age_0_17 = data_raw[data_raw['Age'].eq('0-17')]
data_age_18_25 = data_raw[data_raw['Age'].eq('18-25')]
data_age_26_35 = data_raw[data_raw['Age'].eq('26-35')]
data_age_36_45 = data_raw[data_raw['Age'].eq('36-45')]
data_age_46_50 = data_raw[data_raw['Age'].eq('46-50')]
data_age_51_55 = data_raw[data_raw['Age'].eq('51-55')]
data_age_55 = data_raw[data_raw['Age'].eq('55+')]

In [None]:
# One row per (customer, age bracket) pair, so the pie shows customers rather than transactions.
age_per_user = data_raw[['User_ID', 'Age']].drop_duplicates()
labels, counts = df_group(age_per_user, 'Age')
pizza(labels, counts)

In [None]:
# Age-bracket split computed separately for male and female customers (one row per customer).
labels, counts = df_group(
    data_gender_M[['User_ID', 'Age']].drop_duplicates(), 'Age')
labels2, counts2 = df_group(
    data_gender_F[['User_ID', 'Age']].drop_duplicates(), 'Age')

# Two pies side by side: men on the left, women on the right.
fig, axs = plt.subplots(1, 2)
fig.tight_layout(pad=5.0)
panels = (
    (axs[0], counts, labels, 'Homens'),
    (axs[1], counts2, labels2, 'Mulheres'),
)
for axis, slice_sizes, slice_labels, title in panels:
    axis.pie(slice_sizes, labels=slice_labels, shadow=True, radius=2,
             autopct='%1.1f%%', startangle=90)
for axis, slice_sizes, slice_labels, title in panels:
    axis.set_title(label=title)
plt.show()
plt.clf()
plt.cla()
plt.close()

A faixa etária de 26-35 anos detém o maior número de clientes, mas é importante notar que a distribuição de faixa etária continua extremamente similar entre os gêneros masculino e feminino.

In [None]:
# Age-bracket labels in sorted order; the frames below are listed in that same order.
labels, counts = df_group(data_raw, 'Age')
age_frames = [
    data_age_0_17,
    data_age_18_25,
    data_age_26_35,
    data_age_36_45,
    data_age_46_50,
    data_age_51_55,
    data_age_55,
]
# One box of purchase values per age bracket.
plt.boxplot([frame['Purchase'].values for frame in age_frames], labels=labels)


Outro dado, o valor médio gasto por produto é extremamente similar em todas as faixas etárias.

<a id="City"></a>
## Cidade


In [None]:
# One row per (customer, city category) pair, so the pie shows customers per city category.
city_per_user = data_raw[['User_ID', 'City_Category']].drop_duplicates()
labels, counts = df_group(city_per_user, 'City_Category')
pizza(labels, counts)

Visivelmente a cidade C possui mais clientes do que as outras duas juntas.

### Número de vendas por categoria de cidade

In [None]:
# Every transaction row is kept here, so the pie shows sales (rows) per city category.
city_of_each_sale = data_raw[['City_Category']]
labels, counts = df_group(city_of_each_sale, 'City_Category')
pizza(labels, counts)

A categoria B vende mais produtos do que as outras cidades.

### Valor médio de produtos vendidos

In [None]:
# One transaction frame per city category. These frames are built here because
# no other cell defines them, and the box plot below needs all three.
data_geo_A = data_raw.loc[data_raw['City_Category'] == 'A']
data_geo_B = data_raw.loc[data_raw['City_Category'] == 'B']
data_geo_C = data_raw.loc[data_raw['City_Category'] == 'C']

# City labels in sorted order, matching the A, B, C order of the boxes.
labels, counts = df_group(data_raw, 'City_Category')
plt.boxplot([data_geo_A['Purchase'].values, data_geo_B['Purchase'].values, data_geo_C['Purchase'].values], labels=labels)


Embora pouco significativo, a cidade C tende a fazer compras mais caras.

In [None]:
# Purchase density per city category, split by gender.
sns.violinplot(data=data_raw, x='City_Category', y='Purchase', hue='Gender')

In [None]:
# Purchase quartiles per city category, split by gender.
sns.boxplot(data=data_raw, x='City_Category', y='Purchase', hue='Gender')

<a id="Stay in Current City"></a>
## Permanece na Cidade


In [None]:
# Distinct (User_ID, City_Category, Stay_In_Current_City_Years) rows, then a preview.
# NOTE(review): this cell is R/dplyr syntax while the cells above are Python, and
# `dataset` is not assigned in any cell visible here - confirm kernel and data source.
customers_stay = dataset %>%
                    select(User_ID, City_Category, Stay_In_Current_City_Years) %>%
                    group_by(User_ID) %>%
                    distinct()
head(customers_stay)

In [None]:
# Number of customers_stay rows in each City_Category.
residence = customers_stay %>%
                group_by(City_Category) %>%
                tally()
head(residence)

In [None]:
# Bar chart: count of customers_stay rows for each Stay_In_Current_City_Years value,
# with the bars coloured by that same value.
customers_stay_vis = ggplot(data = customers_stay, aes(x = Stay_In_Current_City_Years, y = ..count.., fill = Stay_In_Current_City_Years)) +
                              geom_bar(stat = 'count') +
                              scale_fill_brewer(palette = 15) +
                              labs(title = 'Os clientes ficam na cidade atual', y = 'Count', x = 'Stay in Current City', fill = 'Number of Years in Current City')
print(customers_stay_vis)

In [None]:
# Row count n per (City_Category, Stay_In_Current_City_Years), plus each count as a
# percentage of sum(n).
# NOTE(review): after tally() the frame is presumably still grouped by City_Category,
# which would make the percentage per city rather than overall - confirm.
stay_cities = customers_stay %>%
                group_by(City_Category, Stay_In_Current_City_Years) %>%
                tally() %>%
                mutate(Percentage = (n/sum(n))*100)
head(stay_cities)

In [None]:
# Bar chart of the counts n per city category, coloured by years in the current city.
ggplot(data = stay_cities, aes(x = City_Category, y = n, fill = Stay_In_Current_City_Years)) + 
    geom_bar(stat = "identity", color = 'white') + 
    scale_fill_brewer(palette = 2) + 
    labs(title = "Categoria da cidade + permanecer na cidade atual", 
            y = "Total(Anos)", 
            x = "Cidade", 
            fill = "Stay Years") 

<a id="Purchase"></a>
## Compra


In [None]:
# Total purchase amount per customer: sum of Purchase within each User_ID.
# NOTE(review): `dataset` is not assigned in any cell visible here - confirm its source.
customers_total_purchase_amount = dataset %>%
                                    group_by(User_ID) %>%
                                    summarise(Purchase_Amount = sum(Purchase))

head(customers_total_purchase_amount)

In [None]:
# Re-order the per-customer totals by Purchase_Amount, largest first.
customers_total_purchase_amount = arrange(customers_total_purchase_amount, desc((Purchase_Amount)))

head(customers_total_purchase_amount)

In [None]:
# Summary statistics of the per-customer totals table.
summary(customers_total_purchase_amount)

In [None]:
# Density of Purchase_Amount per customer. Dashed vertical lines mark the median
# (blue) and the mean (red); each line gets a text label with its rounded value.
# Axis limits, breaks and the label height (y = 1.2e-06) are hard-coded.
# NOTE(review): `scientific` is presumably scales::scientific; no library(scales)
# call is visible in this notebook - confirm.
ggplot(customers_total_purchase_amount, aes(Purchase_Amount)) +
  geom_density(adjust = 1) +
  geom_vline(aes(xintercept=median(Purchase_Amount)),
             color="blue", linetype="dashed", size=1) +
  geom_vline(aes(xintercept=mean(Purchase_Amount)),
             color="red", linetype="dashed", size=1) +
  geom_text(aes(x=mean(Purchase_Amount), label=round(mean(Purchase_Amount)), y=1.2e-06), color = 'red', angle=360,
            size=4, vjust=3, hjust=-.1) +
  geom_text(aes(x=median(Purchase_Amount), label=round(median(Purchase_Amount)), y=1.2e-06), color = 'blue', angle=360,
            size=4, vjust=0, hjust=-.1) +
  scale_x_continuous(name="Purchase Amount", limits=c(0, 7500000), breaks = seq(0,7500000, by = 1000000), expand = c(0,0)) +
  scale_y_continuous(name="Density", limits=c(0, .00000125), labels = scientific, expand = c(0,0)) 

<a id="Marital Status"></a>
## Estado Civil


In [None]:
# Distinct (User_ID, Marital_Status) rows, then a preview.
# NOTE(review): `dataset` is not assigned in any cell visible here - confirm its source.
dataset_maritalStatus = dataset %>%
                            select(User_ID, Marital_Status) %>%
                            group_by(User_ID) %>%
                            distinct()
                    
head(dataset_maritalStatus)

In [None]:
# Convert Marital_Status to character in place, then show the resulting type.
dataset_maritalStatus$Marital_Status = as.character(dataset_maritalStatus$Marital_Status)
typeof(dataset_maritalStatus$Marital_Status)

In [None]:
# Bar chart of the number of rows per Marital_Status value, coloured by status.
marital_vis = ggplot(data = dataset_maritalStatus) +
                    geom_bar(mapping = aes(x = Marital_Status, y = ..count.., fill = Marital_Status)) +
                    labs(title = 'Estado Civil') +
                    scale_fill_brewer(palette = 'Pastel2')
print(marital_vis)

In [None]:
# Full join with customers_stay on User_ID, overwriting dataset_maritalStatus.
# This brings in the columns customers_stay carries besides User_ID.
dataset_maritalStatus = dataset_maritalStatus %>%
                            full_join(customers_stay, by = 'User_ID') 
head(dataset_maritalStatus)

In [None]:
# Row count n per (City_Category, Marital_Status).
maritalStatus_cities = dataset_maritalStatus %>%
                        group_by(City_Category, Marital_Status) %>%
                        tally()
head(maritalStatus_cities)

In [None]:
# Bar chart of the counts n per city category, coloured by marital status.
ggplot(data = maritalStatus_cities, aes(x = City_Category, y = n, fill = Marital_Status)) + 
    geom_bar(stat = "identity", color = 'black') + 
    scale_fill_brewer(palette = 2) + 
    labs(title = "Cidade + Estado Civil", 
            y = "Total (Compradores)", 
            x = "Cidade", 
            fill = "Estado Civil")

In [None]:
# Distinct (User_ID, Age) rows, then a preview.
# NOTE(review): `dataset` is not assigned in any cell visible here - confirm its source.
Users_Age = dataset %>%
                select(User_ID, Age) %>%
                distinct()
head(Users_Age)

In [None]:
# Full join with Users_Age on User_ID, overwriting dataset_maritalStatus so it also carries Age.
dataset_maritalStatus = dataset_maritalStatus %>%
                            full_join(Users_Age, by = 'User_ID')
head(dataset_maritalStatus)

In [None]:
# Split the joined table into one frame per city category (A, B, C) and preview each.
City_A = dataset_maritalStatus %>%
            filter(City_Category == 'A')
City_B = dataset_maritalStatus %>%
            filter(City_Category == 'B')
City_C = dataset_maritalStatus %>%
            filter(City_Category == 'C')
head(City_A)
head(City_B)
head(City_C)

In [None]:
# One bar chart per city category showing the row count per Age value. The three
# plots differ only in their data frame, brewer palette and title; legends are hidden.
# NOTE(review): grid.arrange is presumably gridExtra::grid.arrange; no
# library(gridExtra) call is visible in this notebook - confirm.
City_A_stay_vis = ggplot(data = City_A, aes(x = Age, y = ..count.., fill = Age)) + 
                              geom_bar(stat = 'count') +
                              scale_fill_brewer(palette = 8) +
                              theme(legend.position="none", axis.text = element_text(size = 6)) +
                              labs(title = 'Cidade A', y = 'Count', x = 'Idade', fill = 'Age')
City_B_stay_vis = ggplot(data = City_B, aes(x = Age, y = ..count.., fill = Age)) +
                              geom_bar(stat = 'count') +
                              scale_fill_brewer(palette = 9) +
                              theme(legend.position="none", axis.text = element_text(size = 6)) +
                              labs(title = 'Cidade B', y = 'Count', x = 'Idade', fill = 'Age')
City_C_stay_vis = ggplot(data = City_C, aes(x = Age, y = ..count.., fill = Age)) +
                              geom_bar(stat = 'count') +
                              scale_fill_brewer(palette = 11) +
                              theme(legend.position="none", axis.text = element_text(size = 6)) +
                              labs(title = 'Cidade C', y = 'Count', x = 'Idade', fill = 'Age')

# Lay the three plots out in a single row of three columns.
grid.arrange(City_A_stay_vis, City_B_stay_vis, City_C_stay_vis, ncol = 3)

<a id="Top Shoppers"></a>
## Top Compradores


In [None]:
# Number of rows n per User_ID, sorted with the largest counts first.
# NOTE(review): `dataset` is not assigned in any cell visible here - confirm its source.
top_shoppers = dataset %>%
                count(User_ID, sort = TRUE)

head(top_shoppers)

In [None]:
# Attach each customer's total from customers_total_purchase_amount to their row count.
# NOTE(review): the bare `Purchase_Amount` passed to left_join looks like a stray
# positional argument (the join key is already given via `by`) - confirm it is intended.
top_shoppers =  top_shoppers %>%
                    select(User_ID, n) %>%
                    left_join(customers_total_purchase_amount, Purchase_Amount, by = 'User_ID')

head(top_shoppers)

In [None]:
# Average purchase per row for each customer: total Purchase_Amount divided by row count n.
top_shoppers = mutate(top_shoppers,
                  Average_Purchase_Amount = Purchase_Amount/n)

head(top_shoppers)

In [None]:
# Customers ordered by Average_Purchase_Amount, largest first.
top_shoppers_averagePurchase = top_shoppers %>%
                                    arrange(desc(Average_Purchase_Amount))

head(top_shoppers_averagePurchase)

<a id="Occupation"></a>
## Ocupação


In [None]:
# Distinct (User_ID, Occupation) rows joined with each customer's total purchase amount.
# NOTE(review): the bare `Occupation` passed to left_join looks like a stray positional
# argument (the join key is already given via `by`) - confirm it is intended.
# NOTE(review): `dataset` is not assigned in any cell visible here - confirm its source.
customers_Occupation =  dataset %>%
                          select(User_ID, Occupation) %>%
                          group_by(User_ID) %>%
                          distinct() %>%
                          left_join(customers_total_purchase_amount, Occupation, by = 'User_ID')

head(customers_Occupation)

In [None]:
# Sum of Purchase_Amount per Occupation, ordered from largest to smallest total.
totalPurchases_Occupation = customers_Occupation %>%
                              group_by(Occupation) %>%
                              summarise(Purchase_Amount = sum(Purchase_Amount)) %>%
                              arrange(desc(Purchase_Amount))

# Convert Occupation to character in place, then show the resulting type.
totalPurchases_Occupation$Occupation = as.character(totalPurchases_Occupation$Occupation)
typeof(totalPurchases_Occupation$Occupation)

head(totalPurchases_Occupation)

In [None]:
# Bar chart of total Purchase_Amount per occupation; reorder(..., -Purchase_Amount)
# places the bars from largest to smallest total. Axis breaks (0..20) and the
# y-axis upper limit are hard-coded; the legend is hidden.
occupation = ggplot(data = totalPurchases_Occupation) +
                  geom_bar(mapping = aes(x = reorder(Occupation, -Purchase_Amount), y = Purchase_Amount, fill = Occupation), stat = 'identity') +
                  scale_x_discrete(name="Occupation", breaks = seq(0,20, by = 1), expand = c(0,0)) +
                  scale_y_continuous(name="Purchase Amount ($)", expand = c(0,0), limits = c(0, 750000000)) +
                  labs(title = 'Montante total da compra por ocupação') + 
                  theme(legend.position="none")
print(occupation)

In [None]:
# Packages for association-rule mining (arules, arulesViz) and data wrangling (tidyverse).
# NOTE(review): earlier cells already use %>% and ggplot() before this is loaded;
# unless these packages are attached somewhere outside this view, this cell needs
# to run before them for a clean Restart & Run All.
library(arules)
library(arulesViz)
library(tidyverse)

In [None]:

# Build a wide table with one row per customer listing the products they bought.
# NOTE(review): `dataset` is not assigned in any cell visible here - confirm its source.
customers_products = dataset %>%
                        select(User_ID, Product_ID) %>%   # Selecting the columns we will need
                        group_by(User_ID) %>%             # Grouping by "User_ID"          
                        arrange(User_ID) %>%              # Arranging by "User_ID" 
                        mutate(id = row_number()) %>%     # Defining a key column for each "Product_ID" and its corresponding "User_ID" (Must do this for spread() to work properly)
                        spread(User_ID, Product_ID) %>%   # Converting our dataset from tall to wide format, and grouping "Product_IDs" to their corresponding "User_ID"
                        t()                               # Transposing the dataset from columns of "User_ID" to rows of "User_ID"


# Drop the first row of the transposed matrix (presumably the `id` key row - confirm).
customers_products = customers_products[-1,]

In [None]:
# Write the wide table to a CSV file, then read that file back as an arules
# transactions object.
write.csv(customers_products, file = 'customers_products.csv')

customersProducts = read.transactions('customers_products.csv', sep = ',', rm.duplicates = TRUE) # remove duplicates with rm.duplicates

In [None]:
# Overview of the transactions object.
summary(customersProducts)

In [None]:
# NOTE(review): this repeats the summary call of the cell directly above.
summary(customersProducts)

In [None]:
# Mine association rules with minimum support 0.008 and minimum confidence 0.80.
rules = apriori(data = customersProducts,
               parameter = list(support = 0.008, confidence = 0.80, maxtime = 0)) # maxtime = 0 will allow our algorithim to run until completion with no time limit

In [None]:
# Print every mined rule, ordered by lift.
inspect(sort(rules, by = 'lift'))

Now, let's visualize these rules using the [arulesViz][1] package.  

[1]: https://cran.r-project.org/web/packages/arulesViz/vignettes/arulesViz.pdf

In [None]:
# Mine the rules again with the same support (0.008) but a lower minimum
# confidence (0.75); this overwrites the earlier `rules` object.
rules = apriori(data = customersProducts,
               parameter = list(support = 0.008, confidence = 0.75, maxtime = 0))

In [None]:
# Print the first rules after ordering by lift.
inspect(head(sort(rules, by = 'lift'))) # limiting to the top 6 rules

In [None]:
# Load test.csv and train.csv from the black-friday input folder.
# pandas is already imported as `pd` in the notebook's import cell, so it is not
# re-imported here.
test = pd.read_csv("../input/black-friday/test.csv")
train = pd.read_csv("../input/black-friday/train.csv")