### Summary
For my final project I have chosen to compete in a kaggle competition. <br>
The competition is being put on by HM clothing and the goal is to predict what shoppers will buy in the next 7 days.

The competition details are here -> https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations


### Data

The data comprises of 4 files:
1. articles.csv - detailed metadata for each article_id available for purchase
2. sample_submission.csv - a sample submission file in the correct format
3. customers.csv - metadata for each customer_id in dataset
4. transactions_train.csv - the training data, consisting of the purchases each customer for each date, as well as additional information. Duplicate rows correspond to multiple purchases of the same item. Your task is to predict the article_ids each customer will purchase during the 7-day period immediately after the training data period.

### Data Strategy

My strategy is to first clean the data so that I can feed multiple days of shopping data into the model. I will see if I can cluster shoppers into categories. I will see if I can cluster items into categories. I think that it will be easier to first predict that the shopper will buy a clothing type, like shoes, then predict the type of shoe rather than trying to predict the exact shoe first. 

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import scipy as sp
import math
from pandas_profiling import ProfileReport
import lightgbm as lgb
import optuna as op


In [None]:
trainingdata = pd.read_csv('data/transactions_train.csv', row=13039505)
articlesdata = pd.read_csv('data/articles.csv')
customersdata = pd.read_csv('data/customers.csv')

In [37]:
trainingdata['t_dat'] = pd.to_datetime(trainingdata['t_dat'], yearfirst = True)
trainingdata['days_from_start'] = (trainingdata['t_dat'] - trainingdata['t_dat'].min()).dt.days.astype('int16')
trainingdata['bucket'] = (trainingdata['days_from_start'] / 7).apply(np.floor)


In [38]:
#merged_set = trainingdata.merge(articlesdata, left_on='article_id', right_on='article_id',
#          suffixes=('_left', '_right'))
merged_set =trainingdata.merge(customersdata, left_on='customer_id', right_on='customer_id',
          suffixes=('_left', '_right'))

In [39]:
merged_set =merged_set.drop(columns=['FN','Active'])

In [40]:
#profile = ProfileReport(merged_set, title="Pandas Profiling Report", explorative=True)

In [41]:
#profile.to_notebook_iframe()

In [42]:
merged_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,days_from_start,bucket,club_member_status,fashion_news_frequency,age,postal_code
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
2,2018-09-24,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,4,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,2019-03-01,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,578020002,0.013542,2,162,23.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
4,2020-02-03,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,723529001,0.025407,2,501,71.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...


In [43]:
x = merged_set.drop(columns=['t_dat','article_id','price','sales_channel_id','days_from_start','bucket'])
x.drop_duplicates(inplace=True)
x.reset_index(inplace=True)
x.head()

Unnamed: 0,index,customer_id,club_member_status,fashion_news_frequency,age,postal_code
0,0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
1,18,00007d2de826758b65a93dd24ce629ed66842531df6699...,ACTIVE,Regularly,32.0,8d6f45050876d059c830a0fe63f1a4c022de279bb68ce3...
2,138,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,ACTIVE,NONE,29.0,24e3594738f327e8a7671ec6d1e18b308fb0282e1f7e23...
3,165,0008968c0d451dbc5a9968da03196fe20051965edde741...,ACTIVE,Regularly,50.0,860a370c01b1eeebe3f3c78241f3236c79b0c41669c19e...
4,226,000aa7f0dc06cd7174389e76c9e132a67860c5f65f9706...,ACTIVE,NONE,22.0,6d98a3ca1370c33de04f4a0c2e81cd9c9e36d9501ca6dc...


In [44]:
#pivoted_merged_data = merged_set.pivot_table(index='customer_id', columns='t_dat', values='article_id', aggfunc=lambda x: ' '.join(x))
pivoted_merged_data = merged_set.pivot_table(index=['customer_id'],
                             columns=['bucket'],
                             values=['article_id'],
                             aggfunc=lambda x: ' '.join(str(v) for v in x))
pivoted_merged_data.head()


Unnamed: 0_level_0,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id,article_id
bucket,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,95.0,96.0,97.0,98.0,99.0,100.0,101.0,102.0,103.0,104.0
customer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,,,,,,,,,,,...,,,,,,,,568601043.0,,
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,583558001 639677008 640244003 521269001 666448006,583558001 673677002 666448006,,,,,,,,,...,,,,,,,,,,
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,663713001 541518023 663713001,,,,,,,,,,...,,,,,,,,,794321007.0,
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,,,,,,,,,,,...,,,,,,,,,,
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,,,,634249005 677049001,,,,,,,...,,,,896152002 730683050 927530004 791587015,,,,,,


In [45]:
training_data_set = x.join(pivoted_merged_data,on=['customer_id'], how='inner')
training_data_set.drop(columns=['index'], inplace = True)
#training_data_set.rename(columns={"('article_id', 0.0)": "0", "('article_id', 1.0)": "1"}, errors="raise", inplace = True)
#training_data_set["('article_id', 0.0)"].head()
i = 0
for each in range(5,len(training_data_set.columns)):
    training_data_set.columns.values[each] = f'{i}_week'
    i+=1
for each in range(5,len(training_data_set.columns)):  
    values = {f'{i}_week': '99999999'}
    column_name = training_data_set.columns.tolist()[each]
    print(each)
    print(len(training_data_set.columns)-1)
    if each == len(training_data_set.columns)-1:
        print('yay')
        training_data_set[column_name]=training_data_set[column_name].replace(np.nan, '1')                     
    else:
        training_data_set[column_name]=training_data_set[column_name].replace(np.nan, '2')
    i+=1
training_data_set.head(1000)

  return merge(


5
109
6
109
7
109
8
109
9
109
10
109
11
109
12
109
13
109
14
109
15
109
16
109
17
109
18
109
19
109
20
109
21
109
22
109
23
109
24
109
25
109
26
109
27
109
28
109
29
109
30
109
31
109
32
109
33
109
34
109
35
109
36
109
37
109
38
109
39
109
40
109
41
109
42
109
43
109
44
109
45
109
46
109
47
109
48
109
49
109
50
109
51
109
52
109
53
109
54
109
55
109
56
109
57
109
58
109
59
109
60
109
61
109
62
109
63
109
64
109
65
109
66
109
67
109
68
109
69
109
70
109
71
109
72
109
73
109
74
109
75
109
76
109
77
109
78
109
79
109
80
109
81
109
82
109
83
109
84
109
85
109
86
109
87
109
88
109
89
109
90
109
91
109
92
109
93
109
94
109
95
109
96
109
97
109
98
109
99
109
100
109
101
109
102
109
103
109
104
109
105
109
106
109
107
109
108
109
109
109
yay


Unnamed: 0,customer_id,club_member_status,fashion_news_frequency,age,postal_code,0_week,1_week,2_week,3_week,4_week,...,95_week,96_week,97_week,98_week,99_week,100_week,101_week,102_week,103_week,104_week
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,663713001 541518023 663713001,2,2,2,2,...,2,2,2,2,2,2,2,2,794321007,1
1,00007d2de826758b65a93dd24ce629ed66842531df6699...,ACTIVE,Regularly,32.0,8d6f45050876d059c830a0fe63f1a4c022de279bb68ce3...,505221004 685687003 685687004 685687001 505221...,528931002 349301001 590414001 590414002 570309...,2,2,464454004 550718001 583533001,...,2,2,2,2,2,2,2,2,2,1
2,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,ACTIVE,NONE,29.0,24e3594738f327e8a7671ec6d1e18b308fb0282e1f7e23...,688873012 501323011 598859003 688873020 688873011,2,2,2,568571002 599580001 559630003 687034001 687033...,...,2,2,2,2,2,2,2,2,2,1
3,0008968c0d451dbc5a9968da03196fe20051965edde741...,ACTIVE,Regularly,50.0,860a370c01b1eeebe3f3c78241f3236c79b0c41669c19e...,531310002 529841001,2,2,2,2,...,579302004,2,2,857812002 872575001,861173003 778064001,2,808462002 868134001,2,893059004,1
4,000aa7f0dc06cd7174389e76c9e132a67860c5f65f9706...,ACTIVE,NONE,22.0,6d98a3ca1370c33de04f4a0c2e81cd9c9e36d9501ca6dc...,501820043 501820043 674681001 671505001 671505...,2,2,2,2,...,2,2,2,2,2,2,2,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,133e8f5b5bc290d063b1cce066d6769e8a1f744d430ba1...,ACTIVE,Regularly,55.0,4bc11ab924e5871936d54e32fed654798f2c4fa0ce3aed...,573323002,2,2,2,2,...,2,890722002 698276009 698276009 811835002 843873...,854786001,2,2,2,2,2,2,1
996,133ffccbc4f8619d46ebf5fba7a61e07ef6907ded7a4a3...,ACTIVE,Regularly,46.0,62eb5ff8b0b76f18819e6580c82885d52e0ac6cb98a2f0...,634320003 634320003 625311006 625311006 630141...,2,2,2,2,...,2,2,2,2,2,2,2,2,2,1
997,134ba9649a005eab6e0aec50bc914afc34f45ca5d01fb3...,ACTIVE,NONE,54.0,ba33e4c950412d402fb391f1b174af390788dbbd8d8f81...,399201005 665477001 665477001 589222002 665477...,664133003 664133003,2,2,2,...,2,2,2,2,2,2,873678007 716670009,2,2,1
998,1351dd509397a2f5fab50094de8b84a6c23fb8786f88d2...,ACTIVE,Regularly,26.0,7ed99cc6da9c1b1936088736d51fa4ed91890d87551821...,646429001,2,2,2,2,...,2,2,2,2,2,2,2,2,2,1


In [46]:
#profile = ProfileReport(training_data_set, title="Pandas Profiling Report", explorative=True)
#profile.to_notebook_iframe()

In [47]:
customer_id_encoding = LabelEncoder()
training_data_set['customer_ids'] = customer_id_encoding.fit_transform(training_data_set['customer_id'])

club_member_status_encoding = LabelEncoder()
training_data_set['club_member_statuses'] = club_member_status_encoding.fit_transform(training_data_set['club_member_status'])

fashion_news_frequency_encoding = LabelEncoder()
training_data_set['fashion_news_frequencys'] = fashion_news_frequency_encoding.fit_transform(training_data_set['fashion_news_frequency'])

postal_code_encoding = LabelEncoder()
training_data_set['postal_codes'] = postal_code_encoding.fit_transform(training_data_set['postal_code'])

X = training_data_set.drop(columns=['customer_id','club_member_status','fashion_news_frequency','postal_code'])

column_to_move = X.pop("customer_ids")
X.insert(0, "customer_id", column_to_move)

column_to_move = X.pop("club_member_statuses")
X.insert(0, "club_member_status", column_to_move)

column_to_move = X.pop("fashion_news_frequencys")
X.insert(0, "fashion_news_frequency", column_to_move)

column_to_move = X.pop("postal_codes")
X.insert(0, "postal_codes", column_to_move)



In [48]:
i = 0
labels =[]
print(X.columns)
for each in range(5,len(X.columns)):
    labels.extend(X.iloc[:,each].tolist())
    print(i)
    print(len(labels))
    i+=1
encoder = LabelEncoder()
fit = encoder.fit(labels)


Index(['postal_codes', 'fashion_news_frequency', 'club_member_status',
       'customer_id', 'age', '0_week', '1_week', '2_week', '3_week', '4_week',
       ...
       '95_week', '96_week', '97_week', '98_week', '99_week', '100_week',
       '101_week', '102_week', '103_week', '104_week'],
      dtype='object', length=110)
0
1362281
1
2724562
2
4086843
3
5449124
4
6811405
5
8173686
6
9535967
7
10898248
8
12260529
9
13622810
10
14985091
11
16347372
12
17709653
13
19071934
14
20434215
15
21796496
16
23158777
17
24521058
18
25883339
19
27245620
20
28607901
21
29970182
22
31332463
23
32694744
24
34057025
25
35419306
26
36781587
27
38143868
28
39506149
29
40868430
30
42230711
31
43592992
32
44955273
33
46317554
34
47679835
35
49042116
36
50404397
37
51766678
38
53128959
39
54491240
40
55853521
41
57215802
42
58578083
43
59940364
44
61302645
45
62664926
46
64027207
47
65389488
48
66751769
49
68114050
50
69476331
51
70838612
52
72200893
53
73563174
54
74925455
55
76287736
56
77650017
57
79012

MemoryError: Unable to allocate 3.42 TiB for an array with shape (143039505,) and data type <U6569

In [None]:
i = 0
for each in range(5,len(X.columns)):
    print(f'{i}_week_encoding')
    original_column = f'{i}_week'
    new_column = f'{i}_weeks'
    X[f'{i}_weeks'] = fit.transform(training_data_set[f'{i}_week'])
    X[f'{i}_weeks']=X[f'{i}_weeks'].replace(0, np.nan)
    X = X.drop(columns=[f'{i}_week'])
    i+=1
X.head()

In [None]:
def objective(trial):
    neighbors = trial.suggest_int("neighbors", 2, 1000, log=True)
    #X_test = X.iloc[:1000]
    X_test = X
    y_true = np.array(X_test.iloc[: , -1].replace(np.nan, 0)).astype(int)
    y_true = encoder.inverse_transform(y_true)

    impute_knn = KNNImputer(n_neighbors=neighbors)
    y_pred = pd.DataFrame(impute_knn.fit_transform(X_test)).iloc[: , -1].round().astype(int)
    y_pred = encoder.inverse_transform(np.array(y_pred))
    accuracy = sk.metrics.accuracy_score(y_true, y_pred,normalize=True)
    print(accuracy)
    return accuracy
if __name__ == "__main__":
    study = op.create_study(direction="minimize")
    study.optimize(objective, n_trials=2)
    print(study.best_trial)