In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter
import operator

In [2]:
filename = '../data/2019-Oct.csv_10%.csv'

In [1]:
1

1

In [3]:
import random
p = 0.5  # 1% of the lines
# keep the header, then take only 1% of lines
# if random from [0,1] interval is greater than 0.01 the row will be skipped
df = pd.read_csv(
         filename,
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)
df.shape

(2124617, 9)

# Preprocess

In [4]:
def preprocessing_feat(X, drop_event_time=False):
    if drop_event_time:
        X = X.drop("event_time", axis=1)
    
    X_preprocessed = X.dropna(subset = ['category_code', 'brand']) #tbd!!
    X_preprocessed = X_preprocessed.drop_duplicates()
    X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')
    return X_preprocessed

In [5]:
X_preprocessed = preprocessing_feat(df, drop_event_time=True)

  X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')


In [6]:
X_preprocessed.shape

(1293444, 8)

In [7]:
def make_column(row):
    if row["price"] < row["25%"]:
        return "low"
    elif row["price"] < row["75%"]:
        return "medium"
    else:
        return "high"

def pricing_criterion(X):
    pricing_guide = X.groupby('category_code')['price'].describe()[["25%", "75%"]].reset_index()
    X_merged = X.merge(pricing_guide, on="category_code", how="right")
    X_merged["price_category"] = X_merged.apply(lambda row: make_column(row), axis=1)
    return X_merged


In [8]:
X_merged = pricing_criterion(X_preprocessed)

In [9]:
def metadata(X):
    X['metadata'] = X[['category_code', 'brand', 'price_category']].apply(lambda x: ' '.join(x), axis = 1)
    return X


In [10]:
X_meta = metadata(X_merged)
X_meta.shape

(1293444, 12)

In [11]:
X_meta.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,metadata
0,view,32900083,2055156924407612189,accessories bag,a-elita,8.49,549437633,62760980-f427-42e0-a891-8b590fd54584,16.28,76.96,low,accessories bag a-elita low
1,view,28401176,2053013566209917945,accessories bag,karya,100.39,549348524,933c58ab-5185-44d6-94d6-c2ccb335d7f9,16.28,76.96,high,accessories bag karya high
2,view,28400461,2053013566209917945,accessories bag,fabretti,31.92,512406877,c1ae47b0-9ab4-4f5f-947a-2e0d80970e6c,16.28,76.96,medium,accessories bag fabretti medium
3,view,28401054,2053013566209917945,accessories bag,karya,100.39,514312434,0d0c3d87-ed6f-4f58-9b9e-2cc44b89d25c,16.28,76.96,high,accessories bag karya high
4,view,18300169,2053013558945383017,accessories bag,hp,17.82,555465687,1fa735b1-1d98-4562-9b7d-74211f89394a,16.28,76.96,medium,accessories bag hp medium


In [12]:
X_purch = X_meta[X_meta['event_type']=='purchase']

In [13]:
X_purch.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,metadata
336,purchase,28400931,2053013566209917945,accessories bag,respect,82.11,513848273,affb8be2-7b49-4a2c-a66c-4c18142b7615,16.28,76.96,high,accessories bag respect high
347,purchase,49800014,2126679654801604876,accessories bag,ritmix,12.86,514694624,655725f6-f9ff-4088-b60b-6c8a542a72f6,16.28,76.96,low,accessories bag ritmix low
560,purchase,49800017,2126679654801604876,accessories bag,ritmix,12.32,512700216,d1faefa6-ce4a-474e-b952-26b0483a6cfa,16.28,76.96,low,accessories bag ritmix low
616,purchase,28400343,2053013566209917945,accessories bag,karya,100.39,522957356,29d61a4a-c1fa-4c38-acd0-8685cac830eb,16.28,76.96,high,accessories bag karya high
635,purchase,28401078,2053013566209917945,accessories bag,respect,66.67,547677690,b17a59b8-42c3-4c8a-ac3b-f462b83c3caa,16.28,76.96,medium,accessories bag respect medium


In [14]:
def filter(X):
    df = pd.DataFrame(X.groupby('user_session').event_type.count())
    df_new = df[df['event_type']>1]
    return df_new

In [15]:
X_filter = filter(X_purch)

In [16]:
df_cross = X_purch[X_purch['user_session'].isin(X_filter.index)]

In [17]:
df_cross.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,metadata
33424,purchase,28715723,2053013565782098913,apparel shoes,caprice,64.61,562669160,a82f9713-d470-4b5e-bb79-256b2ffdd78b,62.81,105.54,medium,apparel shoes caprice medium
44921,purchase,28718097,2053013565069067197,apparel shoes keds,respect,51.22,548555298,b8f60d64-f0cb-41aa-8e3c-26f21e79918f,51.22,100.39,medium,apparel shoes keds respect medium
47054,purchase,28719302,2053013565480109009,apparel shoes keds,fila,94.98,547066883,fe1a5c19-a144-473e-8db5-5a97a4f80aa5,51.22,100.39,medium,apparel shoes keds fila medium
47057,purchase,28715120,2053013565480109009,apparel shoes keds,baden,51.22,547066883,fe1a5c19-a144-473e-8db5-5a97a4f80aa5,51.22,100.39,medium,apparel shoes keds baden medium
64255,purchase,35200115,2090228413380952337,apparel underwear,milavitsa,14.36,556897343,7b647592-c071-48d4-8a6c-54cce9eecf9a,14.28,31.51,medium,apparel underwear milavitsa medium


# Counter

In [18]:
df_test = pd.DataFrame(df_cross.groupby('user_session')['product_id'].apply(list).tolist())
df_test.head()

Unnamed: 0,0,1,2
0,4200545,3600952,
1,1004961,1201466,
2,5100722,1004767,
3,1004833,1004579,
4,1005138,1004856,


In [19]:
def concat(X):
    X = X[X[0]!=X[1]]
    X['conc'] = X[0].astype(str) + '_' + X[1].astype(str)
    
    lst = list(X['conc'])
    
    a = list(Counter(lst).keys()) # equals to list(X['conc'])
    b = list(Counter(lst).values()) # counts the elements' frequency
    
    dct_1 = {a[i]: b[i] for i in range(len(a))}
    dct_2 = dict(sorted(dct_1.items(), key=operator.itemgetter(1),reverse=True))

    return pd.DataFrame.from_dict(dct_2, orient='index').reset_index().rename(columns={'index': 'combo', 0: 'counts'})


In [20]:
X_sell = concat(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['conc'] = X[0].astype(str) + '_' + X[1].astype(str)


In [21]:
X_sell

Unnamed: 0,combo,counts
0,1004870_1004836,2
1,1003306_1002544,2
2,1004767_1004833,2
3,1004870_1004767,2
4,1004836_1004856,2
...,...,...
141,1005115_1004258,1
142,1005105_1005118,1
143,1004565_1005003,1
144,28719302_28715120,1


In [22]:
def x_seller(X):
    X["product_1"] = X["combo"].str.split("_",expand=True)[0].astype('category')
    X["product_2"] = X["combo"].str.split("_",expand=True)[1].astype('category')
    return X
    

In [23]:
new_dataframe = x_seller(X_sell)

new_dataframe.product_1 = new_dataframe.product_1.apply(lambda x: int(x))
new_dataframe.product_2 = new_dataframe.product_2.apply(lambda x: int(x))


In [24]:
new_df_1 = new_dataframe.merge(X_meta, how='left', left_on='product_1', right_on='product_id').rename(columns={'price': 'price_1', 'metadata': 'metadata_1'})

In [25]:
new_df_1.head()

Unnamed: 0,combo,counts,product_1,product_2,event_type,product_id,category_id,category_code,brand,price_1,user_id,user_session,25%,75%,price_category,metadata_1
0,1004870_1004836,2,1004870,1004836,view,1004870,2053013555631882655,electronics smartphone,samsung,286.86,551658360,f5aa89f7-e4a8-4886-b564-c2ace1cb0a44,179.26,725.09,medium,electronics smartphone samsung medium
1,1004870_1004836,2,1004870,1004836,view,1004870,2053013555631882655,electronics smartphone,samsung,286.86,516414268,e06fdaa6-6062-428d-a332-1607ec63fc17,179.26,725.09,medium,electronics smartphone samsung medium
2,1004870_1004836,2,1004870,1004836,view,1004870,2053013555631882655,electronics smartphone,samsung,286.86,514715527,85216168-a50f-48a4-a127-4ee9cc9421e4,179.26,725.09,medium,electronics smartphone samsung medium
3,1004870_1004836,2,1004870,1004836,view,1004870,2053013555631882655,electronics smartphone,samsung,286.86,541967721,b51df33d-d8f8-44d0-94ca-3feeced863a4,179.26,725.09,medium,electronics smartphone samsung medium
4,1004870_1004836,2,1004870,1004836,view,1004870,2053013555631882655,electronics smartphone,samsung,286.86,512730342,403d3545-0e32-4d7d-a6ef-eb75efc7be38,179.26,725.09,medium,electronics smartphone samsung medium


In [26]:
new_df_1 = new_df_1.drop_duplicates(['product_1','product_2'])
new_df_1.shape

(146, 16)

In [27]:
new_df_1.head()

Unnamed: 0,combo,counts,product_1,product_2,event_type,product_id,category_id,category_code,brand,price_1,user_id,user_session,25%,75%,price_category,metadata_1
0,1004870_1004836,2,1004870,1004836,view,1004870,2053013555631882655,electronics smartphone,samsung,286.86,551658360,f5aa89f7-e4a8-4886-b564-c2ace1cb0a44,179.26,725.09,medium,electronics smartphone samsung medium
10650,1003306_1002544,2,1003306,1002544,view,1003306,2053013555631882655,electronics smartphone,apple,588.77,555462420,54376386-c6a2-4a33-8271-a2cc7aa1bbe5,179.26,725.09,medium,electronics smartphone apple medium
16217,1004767_1004833,2,1004767,1004833,view,1004767,2053013555631882655,electronics smartphone,samsung,254.82,551417808,3f2209fb-0fe6-4093-8486-51e52dbda8cf,179.26,725.09,medium,electronics smartphone samsung medium
37473,1004870_1004767,2,1004870,1004767,view,1004870,2053013555631882655,electronics smartphone,samsung,286.86,551658360,f5aa89f7-e4a8-4886-b564-c2ace1cb0a44,179.26,725.09,medium,electronics smartphone samsung medium
48123,1004836_1004856,2,1004836,1004856,view,1004836,2053013555631882655,electronics smartphone,samsung,241.19,546259103,6e2984c8-502e-4fe7-bbba-34087f760175,179.26,725.09,medium,electronics smartphone samsung medium


In [28]:
df_short = new_df_1[['combo', 'counts', 'product_1', 'product_2', 'price_1', 'metadata_1']]
df_short.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,metadata_1
0,1004870_1004836,2,1004870,1004836,286.86,electronics smartphone samsung medium
10650,1003306_1002544,2,1003306,1002544,588.77,electronics smartphone apple medium
16217,1004767_1004833,2,1004767,1004833,254.82,electronics smartphone samsung medium
37473,1004870_1004767,2,1004870,1004767,286.86,electronics smartphone samsung medium
48123,1004836_1004856,2,1004836,1004856,241.19,electronics smartphone samsung medium


In [29]:
df_final = df_short.merge(X_meta, how='left', left_on='product_2', right_on='product_id').rename(columns={'price': 'price_2', 'metadata': 'metadata_2'}).drop_duplicates(['product_1','product_2'])

In [30]:
df_final

Unnamed: 0,combo,counts,product_1,product_2,price_1,metadata_1,event_type,product_id,category_id,category_code,brand,price_2,user_id,user_session,25%,75%,price_category,metadata_2
0,1004870_1004836,2,1004870,1004836,286.86,electronics smartphone samsung medium,view,1004836,2053013555631882655,electronics smartphone,samsung,241.19,546259103,6e2984c8-502e-4fe7-bbba-34087f760175,179.26,725.09,medium,electronics smartphone samsung medium
7950,1003306_1002544,2,1003306,1002544,588.77,electronics smartphone apple medium,view,1002544,2053013555631882655,electronics smartphone,apple,464.13,515544667,f619453a-5696-43ba-8cd5-3752e31d811d,179.26,725.09,medium,electronics smartphone apple medium
17976,1004767_1004833,2,1004767,1004833,254.82,electronics smartphone samsung medium,view,1004833,2053013555631882655,electronics smartphone,samsung,174.76,519403613,36c36d74-311f-4b3a-b554-3ae63824c1ab,179.26,725.09,low,electronics smartphone samsung low
29508,1004870_1004767,2,1004870,1004767,286.86,electronics smartphone samsung medium,view,1004767,2053013555631882655,electronics smartphone,samsung,254.82,551417808,3f2209fb-0fe6-4093-8486-51e52dbda8cf,179.26,725.09,medium,electronics smartphone samsung medium
50764,1004836_1004856,2,1004836,1004856,241.19,electronics smartphone samsung medium,view,1004856,2053013555631882655,electronics smartphone,samsung,130.76,515757896,4938043e-e50f-44ad-944d-958d04df62d6,179.26,725.09,low,electronics smartphone samsung low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795168,1005115_1004258,1,1005115,1004258,975.57,electronics smartphone apple high,view,1004258,2053013555631882655,electronics smartphone,apple,735.05,540451356,66e9533a-cd23-4dfc-a335-92612d3c06a2,179.26,725.09,high,electronics smartphone apple high
799550,1005105_1005118,1,1005105,1005118,1415.48,electronics smartphone apple high,view,1005118,2053013555631882655,electronics smartphone,apple,975.57,513574985,ee749cd7-2ecd-4b1c-9b17-0df8c52b487c,179.26,725.09,high,electronics smartphone apple high
801230,1004565_1005003,1,1004565,1005003,177.47,electronics smartphone huawei low,view,1005003,2053013555631882655,electronics smartphone,huawei,258.21,530102906,0c74e0cc-d340-46ab-a2ec-22991081648b,179.26,725.09,medium,electronics smartphone huawei medium
803541,28719302_28715120,1,28719302,28715120,94.98,apparel shoes keds fila medium,view,28715120,2053013565480109009,apparel shoes keds,baden,51.22,548016731,50bd640a-55ea-440b-8b2f-5fa4b4f9d39c,51.22,100.39,medium,apparel shoes keds baden medium


# Final recommendation

In [31]:
df_final = df_final[['combo', 'counts', 'product_1', 'product_2', 'price_1', 'price_2', 'metadata_1', 'metadata_2']]

In [32]:
df_final

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2
0,1004870_1004836,2,1004870,1004836,286.86,241.19,electronics smartphone samsung medium,electronics smartphone samsung medium
7950,1003306_1002544,2,1003306,1002544,588.77,464.13,electronics smartphone apple medium,electronics smartphone apple medium
17976,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low
29508,1004870_1004767,2,1004870,1004767,286.86,254.82,electronics smartphone samsung medium,electronics smartphone samsung medium
50764,1004836_1004856,2,1004836,1004856,241.19,130.76,electronics smartphone samsung medium,electronics smartphone samsung low
...,...,...,...,...,...,...,...,...
795168,1005115_1004258,1,1005115,1004258,975.57,735.05,electronics smartphone apple high,electronics smartphone apple high
799550,1005105_1005118,1,1005105,1005118,1415.48,975.57,electronics smartphone apple high,electronics smartphone apple high
801230,1004565_1005003,1,1004565,1005003,177.47,258.21,electronics smartphone huawei low,electronics smartphone huawei medium
803541,28719302_28715120,1,28719302,28715120,94.98,51.22,apparel shoes keds fila medium,apparel shoes keds baden medium


In [33]:
df_final = df_final[df_final['metadata_1'] != df_final['metadata_2']]

In [34]:
df_final.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2
17976,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low
50764,1004836_1004856,2,1004836,1004856,241.19,130.76,electronics smartphone samsung medium,electronics smartphone samsung low
74757,4200545_3600952,1,4200545,3600952,437.57,423.89,appliances environment air_conditioner elenber...,appliances kitchen washer samsung high
74818,1004961_1201466,1,1004961,1201466,179.36,237.27,electronics smartphone oppo medium,electronics tablet samsung medium
75119,5100722_1004767,1,5100722,1004767,169.62,254.82,electronics clocks huawei medium,electronics smartphone samsung medium


In [35]:
df_final.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2
17976,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low
50764,1004836_1004856,2,1004836,1004856,241.19,130.76,electronics smartphone samsung medium,electronics smartphone samsung low
74757,4200545_3600952,1,4200545,3600952,437.57,423.89,appliances environment air_conditioner elenber...,appliances kitchen washer samsung high
74818,1004961_1201466,1,1004961,1201466,179.36,237.27,electronics smartphone oppo medium,electronics tablet samsung medium
75119,5100722_1004767,1,5100722,1004767,169.62,254.82,electronics clocks huawei medium,electronics smartphone samsung medium


In [36]:
df_final["1.1"] = df_final["metadata_1"].str.split(" ",expand=True)[0].astype('category')
df_final["1.2"] = df_final["metadata_1"].str.split(" ",expand=True)[1].astype('category')
df_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["1.1"] = df_final["metadata_1"].str.split(" ",expand=True)[0].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["1.2"] = df_final["metadata_1"].str.split(" ",expand=True)[1].astype('category')


Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2,1.1,1.2
17976,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone
50764,1004836_1004856,2,1004836,1004856,241.19,130.76,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone
74757,4200545_3600952,1,4200545,3600952,437.57,423.89,appliances environment air_conditioner elenber...,appliances kitchen washer samsung high,appliances,environment
74818,1004961_1201466,1,1004961,1201466,179.36,237.27,electronics smartphone oppo medium,electronics tablet samsung medium,electronics,smartphone
75119,5100722_1004767,1,5100722,1004767,169.62,254.82,electronics clocks huawei medium,electronics smartphone samsung medium,electronics,clocks


In [37]:
df_final["2.1"] = df_final["metadata_2"].str.split(" ",expand=True)[0].astype('category')
df_final["2.2"] = df_final["metadata_2"].str.split(" ",expand=True)[1].astype('category')
df_final.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["2.1"] = df_final["metadata_2"].str.split(" ",expand=True)[0].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["2.2"] = df_final["metadata_2"].str.split(" ",expand=True)[1].astype('category')


(93, 12)

In [38]:
df_final['test_1'] = df_final[['1.1', '1.2']].apply(lambda x: ' '.join(x), axis = 1)
df_final['test_2'] = df_final[['2.1', '2.2']].apply(lambda x: ' '.join(x), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['test_1'] = df_final[['1.1', '1.2']].apply(lambda x: ' '.join(x), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['test_2'] = df_final[['2.1', '2.2']].apply(lambda x: ' '.join(x), axis = 1)


In [39]:
df_final.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2,1.1,1.2,2.1,2.2,test_1,test_2
17976,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone,electronics,smartphone,electronics smartphone,electronics smartphone
50764,1004836_1004856,2,1004836,1004856,241.19,130.76,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone,electronics,smartphone,electronics smartphone,electronics smartphone
74757,4200545_3600952,1,4200545,3600952,437.57,423.89,appliances environment air_conditioner elenber...,appliances kitchen washer samsung high,appliances,environment,appliances,kitchen,appliances environment,appliances kitchen
74818,1004961_1201466,1,1004961,1201466,179.36,237.27,electronics smartphone oppo medium,electronics tablet samsung medium,electronics,smartphone,electronics,tablet,electronics smartphone,electronics tablet
75119,5100722_1004767,1,5100722,1004767,169.62,254.82,electronics clocks huawei medium,electronics smartphone samsung medium,electronics,clocks,electronics,smartphone,electronics clocks,electronics smartphone


In [40]:
df_final_2 = df_final[df_final['test_1'] != df_final['test_2']]

In [41]:
df_final_2

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2,1.1,1.2,2.1,2.2,test_1,test_2
74757,4200545_3600952,1,4200545,3600952,437.57,423.89,appliances environment air_conditioner elenber...,appliances kitchen washer samsung high,appliances,environment,appliances,kitchen,appliances environment,appliances kitchen
74818,1004961_1201466,1,1004961,1201466,179.36,237.27,electronics smartphone oppo medium,electronics tablet samsung medium,electronics,smartphone,electronics,tablet,electronics smartphone,electronics tablet
75119,5100722_1004767,1,5100722,1004767,169.62,254.82,electronics clocks huawei medium,electronics smartphone samsung medium,electronics,clocks,electronics,smartphone,electronics clocks,electronics smartphone
121175,4600614_1801514,1,4600614,1801514,373.75,154.42,appliances kitchen dishwasher flavia medium,electronics video tv haier low,appliances,kitchen,electronics,video,appliances kitchen,electronics video
150165,3701056_3801070,1,3701056,3801070,64.33,26.49,appliances environment vacuum samsung low,appliances iron panasonic low,appliances,environment,appliances,iron,appliances environment,appliances iron
197741,1004750_1801690,1,1004750,1801690,197.43,368.04,electronics smartphone samsung medium,electronics video tv samsung medium,electronics,smartphone,electronics,video,electronics smartphone,electronics video
201638,3100541_1801623,1,3100541,1801623,36.01,395.2,appliances kitchen blender polaris medium,electronics video tv lg medium,appliances,kitchen,electronics,video,appliances kitchen,electronics video
224451,5100855_1005115,1,5100855,1005115,617.52,975.57,electronics clocks apple high,electronics smartphone apple high,electronics,clocks,electronics,smartphone,electronics clocks,electronics smartphone
242500,1307238_1004886,1,1307238,1004886,303.46,153.04,computers notebook lenovo low,electronics smartphone oppo low,computers,notebook,electronics,smartphone,computers notebook,electronics smartphone
253467,4802036_1005135,1,4802036,1005135,171.56,1747.79,electronics audio headphone apple high,electronics smartphone apple high,electronics,audio,electronics,smartphone,electronics audio,electronics smartphone


In [48]:
def X_seller(X, product_id, n=2):
    X.sort_values(by='counts')
    X = X.loc[X['product_1']==product_id]
    X = X[['combo', 'counts', 'product_1', 'product_2', 'price_1', 'price_2', 'metadata_1', 'metadata_2']]
    return X

In [52]:
X_seller(df_final_2, 4802036)

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2
253467,4802036_1005135,1,4802036,1005135,171.56,1747.79,electronics audio headphone apple high,electronics smartphone apple high
