In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read each table and key it by customer_id in a single chained expression,
# so later cells can compare the two tables through their indexes.
customers = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv'
).set_index('customer_id')

# article_id is parsed as a string instead of letting pandas infer a dtype.
transactions = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype={'article_id': str},
).set_index('customer_id')

# Summary

There are 9699 customers in customers.csv who do not occur in transactions_train.csv. Either they purchased products only before 2018-09-20, their transactions were not recorded, they created an account without ever purchasing anything, **or their transactions only take place after 2020-09-22**. The last case would mean they are of very high importance for the prediction period and should be treated with care. The age distribution is bimodal for both groups, but the two groups are not drawn from the same distribution: customers with no prior transactions are on average younger than the other customers.

In [None]:
# Boolean mask over customers: True where the customer_id also appears in the
# transactions index. Computed once and reused for both subsets instead of
# running the membership test against the transactions index twice.
has_transactions = customers.index.isin(transactions.index)

# new_c: customers with no row in transactions; old_c: customers with at least one.
new_c = customers[~has_transactions]
old_c = customers[has_transactions]
new_c.shape

In [None]:
# Preview the first rows of new_c (customers whose id is absent from the transactions index).
new_c.head()

In [None]:
# Missing values per column among the customers without transactions
print('Missing values:\n')
print(new_c.isna().sum())
# Unique values per column, counted on the same subset (new_c) as the
# missing-value report above rather than on the full customers table.
print('\n\nUnique values:\n')
for c in new_c.columns:
    print(f'{c}: {len(new_c[c].unique())}')

In [None]:
# Age distribution of both customer groups, one panel per group.
fig, axes = plt.subplots(1, 2, figsize=(10,7))
panels = (
    ('Customers not in transactions', new_c.age),
    ('Customers in transactions', old_c.age),
)
for ax, (title, ages) in zip(axes, panels):
    ax.set_title(title)
    # stat='probability' normalises bar heights so they sum to 1 per panel.
    sns.histplot(ages, ax=ax, bins=15, kde=True, stat='probability')
plt.show()

In [None]:
from scipy.stats import kstest, mannwhitneyu

In [None]:
# Print mean, median and standard deviation of age, first for the customers
# without transactions ("new"), then for the customers with transactions ("old").
for group, ages in (('new', new_c.age), ('old', old_c.age)):
    for stat in ('mean', 'median', 'std'):
        value = getattr(ages, stat)()
        print(f'{stat} {group}: {value!s}')

In [None]:
# Two-sample Kolmogorov-Smirnov test on the age distributions of both groups.
# Missing ages are dropped first (as in the Mann-Whitney test in this notebook):
# NaNs in the samples would otherwise propagate into or distort the statistic.
kstest(old_c.age.dropna(), new_c.age.dropna())

In [None]:
# One-sided Mann-Whitney U test on ages with missing values removed;
# alternative='less' puts the "new" sample first, i.e. tests whether its
# distribution is stochastically less than that of the "old" sample.
new_ages = new_c.age.dropna()
old_ages = old_c.age.dropna()
mannwhitneyu(new_ages, old_ages, alternative='less')