**Experiment**
**Predict next article**

Create the dataset by pairing each transaction with the next article bought by the same customer, then build article embeddings by applying PCA to the one-hot-encoded article features.

**The expected reason for the low accuracy is the high cardinality of the target variable: considering only the first 100K transactions, there are around 14K unique values of next_article_id, and almost half of them occur only once.**

In [None]:
import numpy as np
import pandas as pd
# NOTE(review): wildcard import kept for compatibility; no sklearn.metrics name is
# referenced in the visible cells. It is listed first so the explicit imports below
# can never be shadowed by it.
from sklearn.metrics import *
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier as dtc

In [None]:
# Article attributes; dtype=str keeps every column (including ids) as text.
articles = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv', dtype=str)

# Only the first 100K transactions are used in this experiment. `nrows` reads exactly
# those rows in one call, so no open chunk iterator is left behind.
users = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    nrows=100000,
    dtype=str,
)

# Attach the article attributes to every transaction (default inner join on article_id).
users = users.merge(articles, on='article_id')

In [None]:
# One group per customer, so each purchase history can be processed independently.
group_by_customer = users.groupby(by='customer_id')

**Create a `next_<feature>` column for every feature of each article, for experimentation**

In [None]:
# Build, per customer, a frame in which every purchase row also carries the values of
# that customer's following purchase in `next_<column>` columns.
groups = []
for _, group in group_by_customer:
    # Chronological order; a stable sort keeps same-day purchases in their original
    # relative order so the result is reproducible.
    group = group.sort_values(by='t_dat', kind='mergesort')

    # Shift every column up by one row in a single step: row i receives row i+1's values.
    next_purchase = group.shift(-1).add_prefix('next_')
    group = pd.concat([group, next_purchase], axis=1)

    # The last purchase has no successor, so drop it BEFORE de-duplicating. Trimming
    # after drop_duplicates would discard a valid row whenever the final purchase
    # repeats an earlier article (that successor-less row would already be gone).
    group = group.iloc[:-1].drop_duplicates(subset=['customer_id', 'article_id'])
    groups.append(group)

In [None]:
# Stack the per-customer frames into a single table of (purchase, next purchase) rows.
grouped_by_customers = pd.concat(groups, axis=0)

In [None]:
# Preview the first rows of the combined table.
grouped_by_customers.head(n=5)

In [None]:
# Number of distinct customers, source articles and target articles.
id_columns = ['customer_id', 'article_id', 'next_article_id']
grouped_by_customers[id_columns].nunique()

In [None]:
# Persist the full table (all original and next_* columns, index included) for reuse.
grouped_by_customers.to_csv(path_or_buf='df_with_next_article.csv')

In [None]:
# Columns kept for modelling: purchase identifiers, descriptive article attributes,
# and the prediction target (next_article_id).
selected_columns = [
    't_dat', 'customer_id', 'article_id', 'prod_name', 'product_type_name',
    'product_group_name',
    'graphical_appearance_name', 'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name', 'index_name',
    'index_group_name', 'section_name',
    'garment_group_name',
    'next_article_id',
]

# drop=True discards the old row labels directly instead of materialising them as an
# 'index' column and dropping it afterwards.
df = grouped_by_customers[selected_columns].reset_index(drop=True)

In [None]:
# Preview the modelling table.
df.head(n=5)

In [None]:
# Categorical article attributes used as model inputs.
feature_columns = [
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]
input_features = df[feature_columns]

# One indicator column per (attribute, value) pair.
dummies = pd.get_dummies(input_features)

In [None]:
# Project the one-hot features onto their first principal components.
n_components = 100
# A fixed seed makes the projection reproducible when scikit-learn selects a randomized
# SVD solver; 42 matches the seed used for the train/test split.
# NOTE(review): the PCA is fitted on all rows, before the train/test split.
pca = PCA(n_components=n_components, random_state=42)
pca.fit(dummies)

# Total share of variance retained by the kept components.
pca.explained_variance_ratio_.sum()

In [None]:
# Cumulative share of variance explained by the first k principal components.
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

# x positions are derived from the number of fitted components rather than a
# hard-coded 100, so the plot keeps working if n_components is changed.
# NOTE(review): `plt` is not imported in any visible cell; presumably it is
# matplotlib.pyplot. Confirm and add the import to the imports cell.
component_counts = range(1, len(exp_var_cumul) + 1)
plt.fill_between(component_counts, exp_var_cumul)

In [None]:
# Reduced-dimension representation of every row's one-hot article features.
features_pca = pca.transform(X=dummies)

In [None]:
# Encode the next-article ids as consecutive integer class labels for the classifier.
# `le` is kept so predictions can be mapped back to ids with le.inverse_transform.
le = LabelEncoder()
target = le.fit_transform(df['next_article_id'])

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    features_pca,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Decision tree over the PCA features. The seed makes split tie-breaking, and therefore
# the reported scores, reproducible; 42 matches the seed used for the train/test split.
tree = dtc(random_state=42)

tree.fit(x_train, y_train)

In [None]:
# Mean accuracy on the training rows and on the held-out rows, shown as (train, test).
train_accuracy = tree.score(x_train, y_train)
test_accuracy = tree.score(x_test, y_test)
train_accuracy, test_accuracy