In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter
import operator

In [2]:
! pwd

/home/cjergen/code/sailormoonvicky/eCommerce/notebooks


In [3]:
filename = '../data/2019-Oct.csv_10%.csv'

In [4]:
import random
p = 1  # 1% of the lines
# keep the header, then take only 1% of lines
# if random from [0,1] interval is greater than 0.01 the row will be skipped
df = pd.read_csv(
         filename,
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)
df.shape

(4248118, 9)

In [5]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:17 UTC,view,23100006,2053013561638126333,,,357.79,513642368,17566c27-0a8f-4506-9f30-c6a2ccbf583b
1,2019-10-01 00:00:20 UTC,view,4803399,2053013554658804075,electronics.audio.headphone,jbl,33.21,555428858,8a6afed4-77f8-40c9-8e76-e062b28216ce
2,2019-10-01 00:00:23 UTC,view,6200260,2053013552293216471,appliances.environment.air_heater,midea,47.62,538645907,7d9a8784-7b6c-426e-9924-9f688812fd71
3,2019-10-01 00:00:58 UTC,view,4802639,2053013554658804075,electronics.audio.headphone,sony,218.77,514808401,1877639d-46a4-44f8-bae9-a14456952240
4,2019-10-01 00:01:11 UTC,view,1004836,2053013555631882655,electronics.smartphone,samsung,241.19,546259103,6e2984c8-502e-4fe7-bbba-34087f760175


# Preprocess 

In [6]:
def preprocessing_x_sell(X):
    X_preprocessed = X.dropna(subset = ['category_code', 'brand']) #tbd!!
    X_preprocessed = X_preprocessed.drop_duplicates()
    X_purch = X_preprocessed[X_preprocessed['event_type']=='purchase']
    return X_purch

In [7]:
X_purch = preprocessing_x_sell(df)
X_purch.shape

(54716, 9)

In [34]:
X_purch

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
156,2019-10-01 02:19:34 UTC,purchase,1005104,2053013555631882655,electronics.smartphone,apple,975.57,555160488,a1395546-c124-456c-9752-0f5cf883c63d
173,2019-10-01 02:20:16 UTC,purchase,4700419,2053013560899928785,auto.accessories.videoregister,sho-me,141.86,512550575,84f21dc3-df18-4cf3-accd-bcd9655891ca
512,2019-10-01 02:28:29 UTC,purchase,1004794,2053013555631882655,electronics.smartphone,xiaomi,241.96,555420273,35954daf-bd58-45c6-b22a-40f369f77ca2
569,2019-10-01 02:29:40 UTC,purchase,1004750,2053013555631882655,electronics.smartphone,samsung,197.43,512592316,2f8c3cd6-2b93-4440-bd72-3c36f4a2fd6c
663,2019-10-01 02:31:18 UTC,purchase,12400121,2053013556252639687,construction.tools.drill,alteco,45.82,516201000,2a0abd92-f425-4ec0-882f-9fb5db4b192a
...,...,...,...,...,...,...,...,...,...
4247626,2019-10-31 23:18:55 UTC,purchase,1004249,2053013555631882655,electronics.smartphone,apple,722.40,528350072,592de524-5929-4fa3-94be-dbae6c59a0f2
4247744,2019-10-31 23:29:08 UTC,purchase,12300078,2053013556311359947,construction.tools.drill,bosch,96.76,550885932,7c8c0c0c-bc89-40dc-91f5-94550e291e0b
4247867,2019-10-31 23:38:58 UTC,purchase,1005182,2053013555631882655,electronics.smartphone,samsung,1029.34,545950136,53b5858a-3bf8-47eb-9739-f9a5b20bbc28
4247931,2019-10-31 23:44:44 UTC,purchase,1004249,2053013555631882655,electronics.smartphone,apple,722.40,533364737,3eb9f473-ca49-44f3-8669-4aaa9fd22a97


In [8]:
def filter(X):
    df = pd.DataFrame(X.groupby('user_session').event_type.count())
    df_new = df[df['event_type']>1]
    return df_new

In [9]:
X_filter = filter(X_purch)

In [10]:
X_filter.head()

Unnamed: 0_level_0,event_type
user_session,Unnamed: 1_level_1
004cbcc0-9763-49bd-a8a7-568e9003648f,2
00794112-3575-46b6-912b-42e8df0dc78b,2
0083e17d-90aa-4c03-a46e-ee71dea54f26,2
00848298-2ce1-4017-b488-cbe1ff454039,2
00848f4f-1fd4-4b7c-84ed-1dacc63bd352,2


In [11]:
df_cross = X_purch[X_purch['user_session'].isin(X_filter.index)]

In [12]:
df_cross.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2871,2019-10-01 03:06:10 UTC,purchase,1306310,2053013558920217191,computers.notebook,apple,1352.42,542966898,a6a87556-4169-4a7c-b68e-2e71379f5397
2998,2019-10-01 03:07:44 UTC,purchase,1005105,2053013555631882655,electronics.smartphone,apple,1415.48,542966898,a6a87556-4169-4a7c-b68e-2e71379f5397
3532,2019-10-01 03:14:32 UTC,purchase,1002544,2053013555631882655,electronics.smartphone,apple,464.08,554090147,c5633b76-2109-40a7-88f6-0260feebd9fd
3688,2019-10-01 03:16:37 UTC,purchase,1002544,2053013555631882655,electronics.smartphone,apple,464.08,554090147,c5633b76-2109-40a7-88f6-0260feebd9fd
3755,2019-10-01 03:17:29 UTC,purchase,1002544,2053013555631882655,electronics.smartphone,apple,464.08,554090147,c5633b76-2109-40a7-88f6-0260feebd9fd


# Counter

In [13]:
df_test = pd.DataFrame(df_cross.groupby('user_session')['product_id'].apply(list).tolist())
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,12400037,12400037,,,,,,,
1,4200545,3600952,,,,,,,
2,1004961,1201466,,,,,,,
3,1306316,1306315,,,,,,,
4,1004249,1004249,,,,,,,


In [14]:
def concat(X):
    X = X[X[0]!=X[1]]
    X['conc'] = X[0].astype(str) + '_' + X[1].astype(str)
    
    lst = list(X['conc'])
    
    a = list(Counter(lst).keys()) # equals to list(X['conc'])
    b = list(Counter(lst).values()) # counts the elements' frequency
    
    dct_1 = {a[i]: b[i] for i in range(len(a))}
    dct_2 = dict(sorted(dct_1.items(), key=operator.itemgetter(1),reverse=True))

    return pd.DataFrame.from_dict(dct_2, orient='index').reset_index().rename(columns={'index': 'combo', 0: 'counts'})


In [15]:
X_sell = concat(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['conc'] = X[0].astype(str) + '_' + X[1].astype(str)


In [16]:
X_sell

Unnamed: 0,combo,counts
0,1004767_1004856,6
1,1004870_1004767,5
2,1004833_1004856,5
3,1004767_1004833,4
4,1004873_1004767,3
...,...,...
498,1004659_1004873,1
499,28719302_28715597,1
500,1004767_1004750,1
501,1002629_1004838,1


In [17]:
def x_seller(X):
    X["product_1"] = X["combo"].str.split("_",expand=True)[0].astype('category')
    X["product_2"] = X["combo"].str.split("_",expand=True)[1].astype('category')
    return X
    

In [18]:
new_dataframe = x_seller(X_sell)

new_dataframe.product_1 = new_dataframe.product_1.apply(lambda x: int(x))
new_dataframe.product_2 = new_dataframe.product_2.apply(lambda x: int(x))


In [19]:
new_dataframe.head()

Unnamed: 0,combo,counts,product_1,product_2
0,1004767_1004856,6,1004767,1004856
1,1004870_1004767,5,1004870,1004767
2,1004833_1004856,5,1004833,1004856
3,1004767_1004833,4,1004767,1004833
4,1004873_1004767,3,1004873,1004767


In [20]:
new_df_1 = new_dataframe.merge(df, how='left', left_on='product_1', right_on='product_id').rename(columns={'price': 'price_1', 'category_code': 'category_code_1', 'brand': 'brand_1'})

In [21]:
new_df_1 = new_df_1.drop_duplicates(['product_1','product_2'])
new_df_1.head()

Unnamed: 0,combo,counts,product_1,product_2,event_time,event_type,product_id,category_id,category_code_1,brand_1,price_1,user_id,user_session
0,1004767_1004856,6,1004767,1004856,2019-10-01 00:02:15 UTC,view,1004767,2053013555631882655,electronics.smartphone,samsung,254.82,551417808,3f2209fb-0fe6-4093-8486-51e52dbda8cf
44076,1004870_1004767,5,1004870,1004767,2019-10-01 02:19:38 UTC,view,1004870,2053013555631882655,electronics.smartphone,samsung,286.86,512740727,20e584bf-6461-4940-a774-1ff1110c633b
66105,1004833_1004856,5,1004833,1004856,2019-10-01 02:23:16 UTC,view,1004833,2053013555631882655,electronics.smartphone,samsung,174.76,550422448,f9e45568-7f95-4367-8383-7dfdbd5c1791
90011,1004767_1004833,4,1004767,1004833,2019-10-01 00:02:15 UTC,view,1004767,2053013555631882655,electronics.smartphone,samsung,254.82,551417808,3f2209fb-0fe6-4093-8486-51e52dbda8cf
134087,1004873_1004767,3,1004873,1004767,2019-10-01 02:20:27 UTC,view,1004873,2053013555631882655,electronics.smartphone,samsung,388.81,512740727,20e584bf-6461-4940-a774-1ff1110c633b


In [22]:
new_df_2 = new_df_1.merge(df, how='left', left_on='product_2', right_on='product_id')

In [23]:
df_short = new_df_1[['combo', 'counts', 'product_1', 'product_2', 'price_1', 'category_code_1', 'brand_1']]
df_short.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,category_code_1,brand_1
0,1004767_1004856,6,1004767,1004856,254.82,electronics.smartphone,samsung
44076,1004870_1004767,5,1004870,1004767,286.86,electronics.smartphone,samsung
66105,1004833_1004856,5,1004833,1004856,174.76,electronics.smartphone,samsung
90011,1004767_1004833,4,1004767,1004833,254.82,electronics.smartphone,samsung
134087,1004873_1004767,3,1004873,1004767,388.81,electronics.smartphone,samsung


In [24]:
df_final = df_short.merge(df, how='left', left_on='product_2', right_on='product_id').rename(columns={'price': 'price_2', 'category_code': 'category_code_2', 'brand': 'brand_2'}).drop_duplicates(['product_1','product_2'])

# Final recommendation

In [25]:
df_final = df_final[['combo', 'counts', 'product_1', 'product_2', 'price_1', 'price_2', 'category_code_1', 'brand_1', 'category_code_2', 'brand_2']]

In [26]:
df_final.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,category_code_1,brand_1,category_code_2,brand_2
0,1004767_1004856,6,1004767,1004856,254.82,130.76,electronics.smartphone,samsung,electronics.smartphone,samsung
49567,1004870_1004767,5,1004870,1004767,286.86,254.82,electronics.smartphone,samsung,electronics.smartphone,samsung
93643,1004833_1004856,5,1004833,1004856,174.76,130.76,electronics.smartphone,samsung,electronics.smartphone,samsung
143210,1004767_1004833,4,1004767,1004833,254.82,174.76,electronics.smartphone,samsung,electronics.smartphone,samsung
167116,1004873_1004767,3,1004873,1004767,388.81,254.82,electronics.smartphone,samsung,electronics.smartphone,samsung


In [27]:
df_final['category_code_1'] = df_final['category_code_1'].str.replace('.',' ')
df_final['category_code_2'] = df_final['category_code_2'].str.replace('.',' ')

df_final.head()

  df_final['category_code_1'] = df_final['category_code_1'].str.replace('.',' ')
  df_final['category_code_2'] = df_final['category_code_2'].str.replace('.',' ')


Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,category_code_1,brand_1,category_code_2,brand_2
0,1004767_1004856,6,1004767,1004856,254.82,130.76,electronics smartphone,samsung,electronics smartphone,samsung
49567,1004870_1004767,5,1004870,1004767,286.86,254.82,electronics smartphone,samsung,electronics smartphone,samsung
93643,1004833_1004856,5,1004833,1004856,174.76,130.76,electronics smartphone,samsung,electronics smartphone,samsung
143210,1004767_1004833,4,1004767,1004833,254.82,174.76,electronics smartphone,samsung,electronics smartphone,samsung
167116,1004873_1004767,3,1004873,1004767,388.81,254.82,electronics smartphone,samsung,electronics smartphone,samsung


In [28]:
df_final

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,category_code_1,brand_1,category_code_2,brand_2
0,1004767_1004856,6,1004767,1004856,254.82,130.76,electronics smartphone,samsung,electronics smartphone,samsung
49567,1004870_1004767,5,1004870,1004767,286.86,254.82,electronics smartphone,samsung,electronics smartphone,samsung
93643,1004833_1004856,5,1004833,1004856,174.76,130.76,electronics smartphone,samsung,electronics smartphone,samsung
143210,1004767_1004833,4,1004767,1004833,254.82,174.76,electronics smartphone,samsung,electronics smartphone,samsung
167116,1004873_1004767,3,1004873,1004767,388.81,254.82,electronics smartphone,samsung,electronics smartphone,samsung
...,...,...,...,...,...,...,...,...,...,...
4767822,1004659_1004873,1,1004659,1004873,787.18,388.81,electronics smartphone,samsung,electronics smartphone,samsung
4783228,28719302_28715597,1,28719302,28715597,94.98,100.36,apparel shoes keds,fila,apparel shoes keds,fila
4783245,1004767_1004750,1,1004767,1004750,254.82,197.43,electronics smartphone,samsung,electronics smartphone,samsung
4795246,1002629_1004838,1,1002629,1004838,377.29,179.38,electronics smartphone,apple,electronics smartphone,oppo


In [29]:
df_final['metadata_1'] = df_final['category_code_1'] + ' ' +  df_final['brand_1']
df_final['metadata_2'] = df_final['category_code_2'] + ' ' +  df_final['brand_2']

In [30]:
df_final.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,category_code_1,brand_1,category_code_2,brand_2,metadata_1,metadata_2
0,1004767_1004856,6,1004767,1004856,254.82,130.76,electronics smartphone,samsung,electronics smartphone,samsung,electronics smartphone samsung,electronics smartphone samsung
49567,1004870_1004767,5,1004870,1004767,286.86,254.82,electronics smartphone,samsung,electronics smartphone,samsung,electronics smartphone samsung,electronics smartphone samsung
93643,1004833_1004856,5,1004833,1004856,174.76,130.76,electronics smartphone,samsung,electronics smartphone,samsung,electronics smartphone samsung,electronics smartphone samsung
143210,1004767_1004833,4,1004767,1004833,254.82,174.76,electronics smartphone,samsung,electronics smartphone,samsung,electronics smartphone samsung,electronics smartphone samsung
167116,1004873_1004767,3,1004873,1004767,388.81,254.82,electronics smartphone,samsung,electronics smartphone,samsung,electronics smartphone samsung,electronics smartphone samsung


In [31]:
df_final = df_final[df_final['metadata_1'] != df_final['metadata_2']]

In [32]:
df_final.head(50)

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,category_code_1,brand_1,category_code_2,brand_2,metadata_1,metadata_2
211192,1002544_1004767,3,1002544,1004767,464.13,254.82,electronics smartphone,apple,electronics smartphone,samsung,electronics smartphone apple,electronics smartphone samsung
322353,1002633_1004767,3,1002633,1004767,360.08,254.82,electronics smartphone,apple,electronics smartphone,samsung,electronics smartphone apple,electronics smartphone samsung
641784,1002544_1004873,2,1002544,1004873,464.13,388.81,electronics smartphone,apple,electronics smartphone,samsung,electronics smartphone apple,electronics smartphone samsung
764197,1004249_4804055,2,1004249,4804055,739.81,189.91,electronics smartphone,apple,electronics audio headphone,apple,electronics smartphone apple,electronics audio headphone apple
829605,1004249_1004875,2,1004249,1004875,739.81,388.68,electronics smartphone,apple,electronics smartphone,samsung,electronics smartphone apple,electronics smartphone samsung
833719,1002524_1004767,2,1002524,1004767,515.67,254.82,electronics smartphone,apple,electronics smartphone,samsung,electronics smartphone apple,electronics smartphone samsung
889392,4200545_3600952,1,4200545,3600952,437.57,423.89,appliances environment air_conditioner,elenberg,appliances kitchen washer,samsung,appliances environment air_conditioner elenberg,appliances kitchen washer samsung
889529,1004961_1201466,1,1004961,1201466,179.36,237.27,electronics smartphone,oppo,electronics tablet,samsung,electronics smartphone oppo,electronics tablet samsung
890265,5100722_1004767,1,5100722,1004767,169.62,254.82,electronics clocks,huawei,electronics smartphone,samsung,electronics clocks huawei,electronics smartphone samsung
972243,1005015_1005175,1,1005015,1005175,604.88,643.23,electronics smartphone,samsung,electronics smartphone,,electronics smartphone samsung,
