In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the article and customer tables from the competition input folder
path = "../input/h-and-m-personalized-fashion-recommendations/"

articles, customers = (
    pd.read_csv(f"{path}{table_file}")
    for table_file in ("articles.csv", "customers.csv")
)

#### Let's look at the shape and the first five rows of each dataframe to get an idea of our data.

In [None]:
# (number of rows, number of columns) of the articles table
articles.shape

In [None]:
# (number of rows, number of columns) of the customers table
customers.shape

In [None]:
# Preview the first five article rows
articles.head()

In [None]:
# Preview the first five customer rows
customers.head()

#### Looks like we have some missing values.
#### Let's take a closer look.

In [None]:
# Column dtypes and non-null counts: shows which customer columns contain missing values
customers.info()

* We have missing values in all columns except 'customer_id' and 'postal_code'.
* Many customers are missing 'FN' and 'Active'. Maybe the non-missing values are all ones, in which case we can put zeros in place of the NaNs and treat both as boolean columns (i.e. Active = 1 means the customer is active and Active = 0 means the customer is not active; same for FN).

In [None]:
# Mean of the non-missing values for each flag column
for flag_column in ('FN', 'Active'):
    print(customers[flag_column].mean(skipna=True))

* The non-NA mean is 1.0, which suggests that the non-missing values are all ones (the `unique()` check further down confirms it). Let's fill the NaNs with zeros.

In [None]:
# A missing flag is treated as "not set": replace NaN with 0 in both flag columns
flag_columns = ['FN', 'Active']
customers[flag_columns] = customers[flag_columns].fillna(0)

In [None]:
customers.info()  # FN and Active should now report no missing values

#### Let's make some simple plots for these two columns

In [None]:
# Histogram of the FN flag (NaNs were filled with 0 earlier)
fn_values = customers['FN'].to_numpy()

fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
ax.hist(fn_values)
ax.set_xlabel('FN')
ax.set_ylabel('Number of people')
plt.show()

In [None]:
# Histogram of the Active flag (NaNs were filled with 0 earlier)
active_values = customers['Active'].to_numpy()

fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
ax.hist(active_values)
ax.set_xlabel('Active')
ax.set_ylabel('Number of people')
plt.show()

* The two histograms seem to be the same. Maybe there is no difference between FN and Active.


In [None]:
# If FN and Active were identical, their sum could only take the values 0 and 2
flag_sum = customers.FN + customers.Active
for flag_values in (customers.FN, customers.Active, flag_sum):
    print(flag_values.unique())

* No, I was wrong. They aren't the same.

In [None]:
# Distinct values of fashion_news_frequency before cleaning
customers.fashion_news_frequency.unique()

In [None]:
# Collapse the 'NONE' spelling and missing values into the single label 'None'
customers['fashion_news_frequency'] = (
    customers['fashion_news_frequency']
    .replace('NONE', 'None')
    .fillna('None')
)

In [None]:
# Distinct values of fashion_news_frequency after cleaning
customers.fashion_news_frequency.unique()

In [None]:
# Distinct values of club_member_status before cleaning
customers.club_member_status.unique()

In [None]:
# Give customers with no recorded club status an explicit "NO INFO" label
customers['club_member_status'] = customers['club_member_status'].fillna("NO INFO")

In [None]:
# Distinct values of club_member_status after filling the gaps
customers.club_member_status.unique()

In [None]:
# Distinct age values before filling the missing ones
customers.age.unique()

In [None]:
# Impute missing ages with the mean of the known ages, truncated to whole years
mean_known_age = int(customers['age'].mean(skipna=True))
customers['age'] = customers['age'].fillna(mean_known_age)

In [None]:
# Distinct age values after filling the missing ones
customers.age.unique()

In [None]:
# Re-check the non-null counts of the customers table after all the fills above
customers.info()

#### We have handled all missing values in the customers table

In [None]:
# Column dtypes and non-null counts of the articles table
articles.info() # pretty good for articles

In [None]:
# Articles without a description get an empty string instead of NaN
articles['detail_desc'] = articles['detail_desc'].fillna('')

In [None]:
# Re-check the non-null counts of the articles table after filling detail_desc
articles.info()

In [None]:
# Number of missing values per articles column
articles.isnull().sum()

In [None]:
# Number of missing values per customers column
customers.isnull().sum()

#### Done with missing values

### Questions

#### What is the mean age of all customers? Max and Min?

In [None]:
# Mean age; note this is computed after missing ages were filled with the truncated mean
customers.age.mean()

* On average, the customers registered at H&M are around 36 years old.


In [None]:
# Age of the oldest customer
customers.age.max()

* This is pretty old, but how many of them are this old?

In [None]:
# How many customers share the maximum age?
oldest_age = customers.age.max()
very_old_customers = customers.loc[customers.age == oldest_age]
len(very_old_customers)

In [None]:
# Age of the youngest customer
customers.age.min()

In [None]:
# Histogram of customer ages with one bin per year.
# The previous 100 evenly spaced edges gave bins narrower than one year, so some
# bins could never contain a whole-number age and showed up as artificial gaps.
# Edges are placed on half-years so every whole-number age falls in the middle
# of its own bin (with fractional ages this still yields 1-year-wide bins).
youngest_age, oldest_age = customers.age.min(), customers.age.max()
year_bins = np.arange(youngest_age, oldest_age + 2) - 0.5

plt.figure(figsize=(8, 6), dpi=80)
plt.hist(customers.age, bins=year_bins)
plt.title("Customer's age")
plt.xlabel('Age')
plt.ylabel('Number of people')
plt.show()  # the call parentheses were missing, so the figure was never explicitly shown

In [None]:
# How many customers share the minimum age?
youngest_age = customers.age.min()
very_young_customers = customers.loc[customers.age == youngest_age]
len(very_young_customers)

* Okay, let's divide them into categories like under 20, between 20 and 30, ..., over 60 to answer some questions.

In [None]:
# Head-count per age bracket; a key (a, b) stands for the half-open interval [a, b).
# The first bracket is open below and the last one is open above.
customer_age = customers.age
age_masks = {
    (0, 20): customer_age < 20,
    (20, 30): (customer_age >= 20) & (customer_age < 30),
    (30, 40): (customer_age >= 30) & (customer_age < 40),
    (40, 50): (customer_age >= 40) & (customer_age < 50),
    (50, 60): (customer_age >= 50) & (customer_age < 60),
    (60, 150): customer_age >= 60,
}

age_categories = {
    bracket: customers[mask].shape[0]
    for bracket, mask in age_masks.items()
}

In [None]:
# Bar chart of the number of customers in each age bracket
bracket_labels = ["[{0}, {1})".format(low, high) for low, high in age_categories]
bracket_counts = list(age_categories.values())

fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
ax.bar(bracket_labels, bracket_counts)
ax.set_xlabel('Age')
ax.set_ylabel('Number of people')
ax.set_title('Customers by age')
plt.show()

#### So most people are between 20 and 30, and the mean is not far from 30. That means H&M has a lot of young customers

#### Now, based on these categories, let's find the answers to the following questions:
            1. What category spends a lot of money on clothes?
            2. In which category are the most active members?
            3. What types of articles are they buying?

#### 1. What category spends a lot of money on clothes?

In [None]:
# Read the transactions file lazily, in fixed-size pieces, instead of in one go
transactions_chunk_rows = 5298054  # this will result in 6 chunks
chunks = pd.read_csv(
    path + "transactions_train.csv",
    chunksize=transactions_chunk_rows,
)

In [None]:
# Strip the columns this analysis does not need from every chunk,
# then stitch the trimmed chunks back into a single frame
unused_columns = ['sales_channel_id', 't_dat']
chunk_list = [chunk.drop(columns=unused_columns) for chunk in chunks]
transactions = pd.concat(chunk_list)

In [None]:
# Preview the first five transaction rows (after dropping the unused columns)
transactions.head()

In [None]:
# Total of the remaining numeric columns per customer, once article_id is removed
transactions_money = transactions.drop(columns='article_id')
transactions_money_grouped = (
    transactions_money
    .groupby(by='customer_id')
    .sum()
)

In [None]:
# Turn the customer_id index produced by the groupby back into a regular column.
# NOTE: not idempotent - re-running this cell adds an extra `index` column.
transactions_money_grouped = transactions_money_grouped.reset_index()

In [None]:
# Display the per-customer totals
transactions_money_grouped

In [None]:
from math import floor

def get_customer_age_category(c_id, customers_df=None):
    """Return the age bracket of one customer as a tuple ``(left, right)``.

    The tuple stands for the half-open interval ``[left, right)`` and is one of
    ``(0, 20)``, ``(20, 30)``, ``(30, 40)``, ``(40, 50)``, ``(50, 60)`` or
    ``(60, 150)``: everyone under 20 shares the first bracket and everyone aged
    60 or more shares the last one.

    Parameters
    ----------
    c_id
        Value looked up in the ``customer_id`` column.
    customers_df : pandas.DataFrame, optional
        Frame with ``customer_id`` and ``age`` columns. Defaults to the
        notebook-level ``customers`` frame.

    Returns
    -------
    tuple of int
        ``(left, right)`` bounds of the bracket.

    Raises
    ------
    KeyError
        If no row has the given ``customer_id``.
    """
    if customers_df is None:
        customers_df = customers

    matching_ages = customers_df.loc[customers_df['customer_id'] == c_id, 'age']
    if matching_ages.empty:
        raise KeyError("customer_id {0!r} not found".format(c_id))

    # Take the scalar explicitly: calling int() on a whole Series is deprecated
    # and fails with an unhelpful message when zero or several rows match.
    age = int(matching_ages.iloc[0])

    if age < 20:
        return (0, 20)
    if age >= 60:
        return (60, 150)

    # tuple (a,b) means interval [a,b)
    left = age // 10 * 10
    return (left, left + 10)

In [None]:
money_spend = {}

# Vectorised replacement for the per-customer Python loop that was disabled for
# being too slow (it also assigned instead of accumulating, so each bracket
# would only have kept the last customer's total):
#   1. attach each customer's age to their total spend,
#   2. bin the ages into the same [a, b) brackets used for `age_categories`,
#   3. sum the spend per bracket.
spend_bracket_labels = [(0, 20), (20, 30), (30, 40), (40, 50), (50, 60), (60, 150)]
spend_bracket_edges = [-np.inf, 20, 30, 40, 50, 60, np.inf]  # first/last bracket are open-ended

spend_with_age = transactions_money_grouped.merge(
    customers[['customer_id', 'age']], on='customer_id', how='inner'
)

# right=False makes the bins left-closed: [a, b); labels=False yields the bin position
bracket_codes = pd.cut(
    spend_with_age['age'], bins=spend_bracket_edges, right=False, labels=False
)
spend_per_bracket = spend_with_age.groupby(bracket_codes)['price'].sum()

# tuple (a,b) means interval [a,b); brackets without any customer get 0.0
for code, bracket in enumerate(spend_bracket_labels):
    money_spend[bracket] = float(spend_per_bracket.get(code, 0.0))

money_spend

#### 2. In which category are the most active members?

In [None]:
# Number of customers with Active == 1 in each age bracket.
# A key (a, b) stands for the half-open interval [a, b); the first bracket is
# open below and the last one is open above.
# Every bracket now uses the same explicit `Active == 1.0` test (two of them
# previously relied on the float column being coerced to booleans).
is_active = customers.Active == 1.0
customer_age = customers.age
active_age_masks = {
    (0, 20): customer_age < 20,
    (20, 30): (customer_age >= 20) & (customer_age < 30),
    (30, 40): (customer_age >= 30) & (customer_age < 40),
    (40, 50): (customer_age >= 40) & (customer_age < 50),
    (50, 60): (customer_age >= 50) & (customer_age < 60),
    (60, 150): customer_age >= 60,
}

active_categorires = {
    bracket: int((in_bracket & is_active).sum())
    for bracket, in_bracket in active_age_masks.items()
}

In [None]:
# Bar chart of the number of active customers in each age bracket
bracket_labels = ["[{0}, {1})".format(low, high) for low, high in active_categorires]
active_counts = list(active_categorires.values())

fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
ax.bar(bracket_labels, active_counts)
ax.set_xlabel('Age category')
ax.set_ylabel('Number of active people')
ax.set_title('Active customers by age category')
plt.show()

#### As we would expect, the majority of active customers are aged between 20 (inclusive) and 30.

#### 3. What types of articles are they buying?

In [None]:
# Collect, for every customer, the list of article ids they bought
transations_articles = transactions.drop(columns='price')
transations_articles_grouped = (
    transations_articles
    .groupby(by='customer_id')['article_id']
    .agg(list)
)

In [None]:
# Turn the customer_id index produced by the groupby back into a regular column.
# NOTE: not idempotent - re-running this cell adds an extra `index` column.
transations_articles_grouped = transations_articles_grouped.reset_index()

In [None]:
articles_category = {}

# Disabled: very intensive task. For every customer the loop below scans every
# article row, so the work grows with (number of customers) x (number of articles).
# It also assigns instead of accumulating, so each age category would end up
# holding only the product names of the last customer processed.
# Kept as comments rather than a string literal so that the cell does not echo
# the disabled code as its output.
#
# for index, row in transations_articles_grouped.iterrows():
#     articles_category[get_customer_age_category(row['customer_id'])] = articles.prod_name[
#         articles.apply(lambda r: r['article_id'] in row['article_id'], axis=1)
#     ]

#### Let's see if Active == 1 is the same as club_member_status == 'ACTIVE'. If this is true, we can drop the Active column from the customers DataFrame

In [None]:
 active_members = customers.loc[customers.club_member_status == 'ACTIVE', 'Active']

In [None]:
# Compare: customers whose club status is ACTIVE vs. customers whose Active flag is 1
active_flag_count = customers.loc[customers.Active == 1.0, 'Active'].count()
print(active_members.count())
print(active_flag_count)

#### So we have customers who are active club members but are not flagged as Active, or maybe we simply don't have information about how active they really are. Remember that we treated NaN values of Active as 0. Keeping the column will help us with machine learning models, so we can't drop it.

#### Now we start to look at the correlations between the tables' attributes

In [None]:
def create_correlation_heatmap(corr_matrix, title='Correlation for customers'):
    """Draw an annotated heatmap of the lower triangle of a correlation matrix.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix, e.g. the result of ``DataFrame.corr()``.
    title : str, optional
        Title shown above the heatmap. Defaults to the previously hard-coded
        'Correlation for customers' so existing calls render unchanged; pass
        another title when plotting a different table.
    """
    plt.figure(figsize=(16, 6))

    # the matrix is symmetric, so hide the upper triangle (main diagonal included)
    # and show only the cells below the diagonal
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    # correlations live in [-1, 1]; fixing the colour scale keeps plots comparable
    heatmap = sns.heatmap(corr_matrix, mask=mask, vmin=-1, vmax=1, annot=True)

    heatmap.set_title(title, fontdict={'fontsize': 18}, pad=16)

#### 1. Articles correlation

In [None]:
# Correlate the numeric columns only: articles also holds text columns
# (e.g. prod_name, detail_desc), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
create_correlation_heatmap(articles.select_dtypes(include='number').corr())

#### This heatmap shows that article_id and product_code are 100% related to each other, which seems logical to me. Others, except some *some_name*_no columns, are either negative or close to 0.

#### 2. Customers correlation

In [None]:
# Encode the two categorical columns as integers so they can take part in the
# correlation matrix. The previous `customers.replace(...)` calls returned new
# frames that were thrown away, so nothing was actually encoded.
# NOTE: this changes `customers` itself, so anything that uses the frame
# afterwards sees the integer codes.
fashion_news_codes = {'None': 0, 'Regularly': 1, 'Monthly': 2}
club_status_codes = {'ACTIVE': 1, 'NO INFO': 0, 'PRE-CREATE': 2, 'LEFT CLUB': 3}

# Values that are not in a mapping (including already-encoded integers) are left
# as they are, which keeps the cell safe to re-run.
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].map(
    lambda value: fashion_news_codes.get(value, value)
)
customers['club_member_status'] = customers['club_member_status'].map(
    lambda value: club_status_codes.get(value, value)
)

In [None]:
# Correlate the numeric columns only: customers also holds text columns
# (e.g. customer_id), and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently skipping them.
create_correlation_heatmap(customers.select_dtypes(include='number').corr())

#### Here we can see that fashion_news_frequency is strongly correlated with FN and Active. It makes sense because, intuitively speaking, the probability that active customers receive news from H&M should be close to 1. FN and Active are also strongly correlated. The others are very close to 0 or negative.

#### TODO: Add time series for transactions and some visualization for articles

#### Save data

In [None]:
# Write the cleaned tables and the per-customer article lists to the working directory
output_dir = '/kaggle/working/'
frames_to_save = (
    (customers, 'customers_data.csv'),
    (articles, 'articles_data.csv'),
    (transations_articles_grouped, 'transactions_data.csv'),
)
for frame, file_name in frames_to_save:
    frame.to_csv(output_dir + file_name, index=False)