# Overview of this notebook
Like the notebook that this is forked from, we're going to use k-means grouping. This time we're grouping customers, rather than articles.

The forked notebook for k-means for articles: https://www.kaggle.com/code/beezus666/k-means-and-feature-importance-for-articles


In [None]:
import numpy as np
import pandas as pd 
pd.options.plotting.backend = "matplotlib"
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import gc
import cudf
import cupy as cp
from cuml.cluster import KMeans
from cuml.datasets import make_blobs


# Create metrics for each customer
The end result we're going for here is one line per customer; the added columns will be counts of how many times they bought something in each category.

So, we'll start by grouping transactions, as was done in the articles k-means notebook, to get the number of times something was bought.

Then we'll find low-ish cardinality features in articles that we can one-hot-encode (including the k-means feature built last time), and then group and count those columns for each customer.


In [None]:
# Memory-reduction ideas come from:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
transactions = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    parse_dates=['t_dat'],
)

# Replace the customer hash string with a number built from its last 16 hex characters.
# NOTE(review): 16 hex digits span 64 bits, but float64 only keeps 53 bits of integer
# precision, so this numeric key is lossy -- confirm that is acceptable.
transactions['customer_id'] = (
    transactions['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('float64')
)
transactions['article_id'] = transactions['article_id'].astype('int32')
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])

# Keep only the three columns used further down.
transactions = transactions[['t_dat', 'customer_id', 'article_id']]
# Optional cache of the slimmed-down frame:
# transactions.to_parquet('train.pqt', index=False)
print(transactions.shape)
transactions.head()

In [None]:
# Load only the article columns needed here: the join key (article_id), the
# k-means cluster label produced by the articles notebook, and the colour value id.
articles = cudf.read_parquet(
    '../input/k-means-and-feature-importance-for-articles/articles.parquet',
    columns=['perceived_colour_value_id', 'clusters', 'article_id'],
)
print(articles.shape)
articles.head()

In [None]:
# Attach the article attributes to every transaction row.
# A left join keeps all transactions, even if an article has no match.
transactions = transactions.merge(articles, on='article_id', how='left')
print(transactions.shape)
transactions.head()

In [None]:
# The date and the article id must not take part in the group + count below, so remove them.
# Maybe should have used date to split off the last week to do a proper train/test split
transactions = transactions.drop(columns=['t_dat', 'article_id'])

In [None]:
# Count purchases per (customer, article cluster) pair, then rename the article
# cluster column so it cannot be confused with the customer clusters added later.
transactions_group = (
    transactions
    .groupby(['customer_id', 'clusters'])
    .size()
    .to_frame('grouped_count')
    .reset_index()
    .rename(columns={"clusters": "article_clusters"})
)
transactions_group.head()

In [None]:
# Compare the row count after grouping with the row count of the raw transactions.
'group by: {:,}, original: {:,}'.format(len(transactions_group), len(transactions))

In [None]:
# Alternative source (unused): '../input/radek-customers-parquet-output/customers.parquet'
# Only the id and the age are read from the customers file.
cols_to_use = ['customer_id', 'age']
customers = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv',
    usecols=cols_to_use,
)
customers.head()

In [None]:
# Encode the customer id exactly like the transactions key (last 16 hex chars -> float64)
# so that the two frames can be joined on it.
customers['customer_id'] = (
    customers['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('float64')
)

# Left join: every customer is kept, including those without any grouped transactions.
customers_grouped_trans = customers.merge(transactions_group, on='customer_id', how='left')
customers_grouped_trans.head()

In [None]:
# Quick sanity check of the merged frame: column dtypes and (rows, columns).
(customers_grouped_trans.dtypes, customers_grouped_trans.shape)

# K-means cluster
Using K-means to create a cluster feature for customers

In [None]:
# k-means can't take int, needs float: cast every column in a single astype call.
grouped_cols = customers_grouped_trans.columns
customers_grouped_trans = customers_grouped_trans.astype(
    {col: float for col in grouped_cols}
)

customers_grouped_trans.dtypes

In [None]:
# Replace every missing value with 0 in all columns.
# NOTE(review): this also turns a missing age into 0 -- confirm that is intended.
customers_grouped_trans = customers_grouped_trans.fillna(0)

In [None]:
%%time
# elbow method to determine the best number of clusters
# so fast on GPU!
Sum_of_squared_distances = []
K = range(1, 7)
for num_clusters in K :
 kmeans = KMeans(n_clusters=num_clusters)
 kmeans.fit(customers_grouped_trans)
 Sum_of_squared_distances.append(kmeans.inertia_)
plt.plot(K,Sum_of_squared_distances,'bx-')
plt.xlabel('Values of K') 
plt.ylabel('Sum of squared distances/Inertia') 
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# elbow looks like 3
kmeans_float = KMeans(n_clusters=3)
kmeans_fit = kmeans_float.fit(customers_grouped_trans)

In [None]:
print("labels:")
print(kmeans_float.labels_)
print("cluster_centers:")
print(kmeans_float.cluster_centers_)

In [None]:
kmeans_float.fit_predict(customers_grouped_trans)

In [None]:
labels = kmeans_float.labels_

#Glue back to originaal data
customers_grouped_trans['clusters'] = labels

In [None]:
# Number of rows assigned to each k-means cluster.
customers_grouped_trans['clusters'].value_counts()

In [None]:
# Peek at the last five rows, now including the new 'clusters' column.
customers_grouped_trans.tail(5)

In [None]:
# Persist the clustered frame (without the index) for use in other notebooks.
customers_grouped_trans.to_parquet(
    'customers_group.parquet',
    index=False,
)

# Make transactions DF
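For a future notebook, make a transactions DF with the customer data and clusters merged in. (TODO: finish this note; it originally trailed off at "to determine if".)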

In [None]:
# Re-read the raw transactions; the frame built earlier had its date and article
# columns dropped before grouping.
transactions = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    parse_dates=['t_dat'],
)

# Same key encoding as before: last 16 hex characters of the hash, stored as float64.
# NOTE(review): float64 cannot represent every 64-bit integer exactly -- confirm the
# lossy key is acceptable.
transactions['customer_id'] = (
    transactions['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('float64')
)
transactions['article_id'] = transactions['article_id'].astype('int32')
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])

# Keep only the date, customer and article columns.
transactions = transactions[['t_dat', 'customer_id', 'article_id']]
# Optional cache of the slimmed-down frame:
# transactions.to_parquet('train.pqt', index=False)
print(transactions.shape)
transactions.head()

In [None]:
# Attach the customer-level columns (including the k-means label) to every transaction.
# NOTE(review): customers_grouped_trans comes from a (customer_id, article cluster)
# group-by, so it can hold several rows per customer and this join may repeat each
# transaction once per such row -- verify the resulting row count.
transactions = transactions.merge(customers_grouped_trans, on='customer_id', how='left')
transactions.head()

In [None]:
# Count the missing values in each column of the merged frame.
transactions.isna().sum(axis=0)

t_dat                 0
customer_id           0
article_id            0
FN             18209837
Active         18412468
age              140258

In [None]:
#trans_pd = transactions.to_pandas()
#trans_pd.to_parquet('transactions_articles_customers.parquet', index=False)

In [None]:
#transactions.to_parquet('transactions_articles_customers.parquet', index=False)