In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter
import operator

In [2]:
filename = '../data/2019-Oct.csv_10%.csv'

In [1]:
1

1

In [3]:
import random
p = 0.5  # 1% of the lines
# keep the header, then take only 1% of lines
# if random from [0,1] interval is greater than 0.01 the row will be skipped
df = pd.read_csv(
         filename,
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)
df.shape

(2124617, 9)

# Preprocess

In [4]:
def preprocessing_feat(X, drop_event_time=False):
    if drop_event_time:
        X = X.drop("event_time", axis=1)
    
    X_preprocessed = X.dropna(subset = ['category_code', 'brand']) #tbd!!
    X_preprocessed = X_preprocessed.drop_duplicates()
    X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')
    return X_preprocessed

In [5]:
X_preprocessed = preprocessing_feat(df, drop_event_time=True)

  X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')


In [6]:
X_preprocessed.shape

(1293444, 8)

In [7]:
def make_column(row):
    if row["price"] < row["25%"]:
        return "low"
    elif row["price"] < row["75%"]:
        return "medium"
    else:
        return "high"

def pricing_criterion(X):
    pricing_guide = X.groupby('category_code')['price'].describe()[["25%", "75%"]].reset_index()
    X_merged = X.merge(pricing_guide, on="category_code", how="right")
    X_merged["price_category"] = X_merged.apply(lambda row: make_column(row), axis=1)
    return X_merged


In [8]:
X_merged = pricing_criterion(X_preprocessed)

In [9]:
def metadata(X):
    X['metadata'] = X[['category_code', 'brand', 'price_category']].apply(lambda x: ' '.join(x), axis = 1)
    return X


In [10]:
X_meta = metadata(X_merged)
X_meta.shape

(1293444, 12)

In [11]:
X_meta.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,metadata
0,view,32900083,2055156924407612189,accessories bag,a-elita,8.49,549437633,62760980-f427-42e0-a891-8b590fd54584,16.28,76.96,low,accessories bag a-elita low
1,view,28401176,2053013566209917945,accessories bag,karya,100.39,549348524,933c58ab-5185-44d6-94d6-c2ccb335d7f9,16.28,76.96,high,accessories bag karya high
2,view,28400461,2053013566209917945,accessories bag,fabretti,31.92,512406877,c1ae47b0-9ab4-4f5f-947a-2e0d80970e6c,16.28,76.96,medium,accessories bag fabretti medium
3,view,28401054,2053013566209917945,accessories bag,karya,100.39,514312434,0d0c3d87-ed6f-4f58-9b9e-2cc44b89d25c,16.28,76.96,high,accessories bag karya high
4,view,18300169,2053013558945383017,accessories bag,hp,17.82,555465687,1fa735b1-1d98-4562-9b7d-74211f89394a,16.28,76.96,medium,accessories bag hp medium


In [4]:
filename = '../data/oct_final/X_meta_with_100pct_data_oct19.csv'

In [51]:
import random
p = 0.25  # 1% of the lines
# keep the header, then take only 1% of lines
# if random from [0,1] interval is greater than 0.01 the row will be skipped
df = pd.read_csv(
         filename,
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)
df.shape

(1330380, 14)

In [52]:
X_meta = df

In [53]:
X_meta.shape

(1330380, 14)

In [54]:
X_meta.drop(columns=['product_id.1', 'Unnamed: 0'], inplace=True)

In [55]:
X_purch = X_meta[X_meta['event_type']=='purchase']

In [56]:
X_purch.head()

Unnamed: 0,product_id,event_type,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,metadata
18,28401176,purchase,2053013566209917945,accessories bag,karya,100.39,512478069,a06ea679-2769-4ac8-ac2e-89225ed5e34b,16.32,76.96,high,accessories bag karya high
113,18300389,purchase,2053013558945383017,accessories bag,continent,36.01,541455363,efeb5712-ff06-4975-a6bd-8d690c6fb217,16.32,76.96,medium,accessories bag continent medium
415,28400930,purchase,2053013566209917945,accessories bag,respect,82.11,539475687,99948b73-fd37-43e7-89a5-e3ab35e4bcf7,16.32,76.96,high,accessories bag respect high
431,18300163,purchase,2053013558945383017,accessories bag,sumdex,19.53,531550634,b9f71ac1-3e4b-40f0-b1e0-f4e33387202e,16.32,76.96,medium,accessories bag sumdex medium
778,28400774,purchase,2053013566209917945,accessories bag,baden,46.08,512594229,8682b196-9fc7-49a0-82c2-35096419d731,16.32,76.96,medium,accessories bag baden medium


In [57]:
def filter(X):
    df = pd.DataFrame(X.groupby('user_session').event_type.count())
    df_new = df[df['event_type']>1]
    return df_new

In [58]:
X_filter = filter(X_purch)

In [59]:
df_cross = X_purch[X_purch['user_session'].isin(X_filter.index)]

In [60]:
df_cross.head()

Unnamed: 0,product_id,event_type,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,metadata
3240,18300088,purchase,2053013558945383017,accessories bag,sumdex,22.63,515867009,a551d776-a1bf-4d1f-ac61-b8de424e9d88,16.32,76.96,medium,accessories bag sumdex medium
34034,28715723,purchase,2053013565782098913,apparel shoes,caprice,64.61,562669160,a82f9713-d470-4b5e-bb79-256b2ffdd78b,62.81,105.54,medium,apparel shoes caprice medium
44850,28716214,purchase,2053013565069067197,apparel shoes keds,respect,66.67,555685958,21fdafde-1fa4-4805-9783-2273e30ec9c2,51.22,100.72,medium,apparel shoes keds respect medium
44851,28716214,purchase,2053013565069067197,apparel shoes keds,respect,66.67,555685958,21fdafde-1fa4-4805-9783-2273e30ec9c2,51.22,100.72,medium,apparel shoes keds respect medium
73817,6200717,purchase,2053013552293216471,appliances environment air_heater,oasis,9.01,515577708,d466a233-fe39-4974-ab93-96bb49862654,23.91,58.94,low,appliances environment air_heater oasis low


# Counter

In [61]:
df_test = pd.DataFrame(df_cross.groupby('user_session')['product_id'].apply(list).tolist())
df_test.head()

Unnamed: 0,0,1,2,3
0,4804056,4804056,,
1,4804056,1004249,,
2,1005165,1005165,,
3,3800106,3800106,,
4,1004905,1004905,,


In [62]:
def concat(X):
    X = X[X[0]!=X[1]]
    X['conc'] = X[0].astype(str) + '_' + X[1].astype(str)
    
    lst = list(X['conc'])
    
    a = list(Counter(lst).keys()) # equals to list(X['conc'])
    b = list(Counter(lst).values()) # counts the elements' frequency
    
    dct_1 = {a[i]: b[i] for i in range(len(a))}
    dct_2 = dict(sorted(dct_1.items(), key=operator.itemgetter(1),reverse=True))

    return pd.DataFrame.from_dict(dct_2, orient='index').reset_index().rename(columns={'index': 'combo', 0: 'counts'})


In [63]:
X_sell = concat(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['conc'] = X[0].astype(str) + '_' + X[1].astype(str)


In [64]:
X_sell

Unnamed: 0,combo,counts
0,1004870_1004767,3
1,1004767_1004750,2
2,1004249_1003306,2
3,1004767_1004833,2
4,1004833_1004856,2
...,...,...
171,3801108_1004863,1
172,1004777_16200168,1
173,1802038_1801837,1
174,3200549_2900958,1


In [65]:
def x_seller(X):
    X["product_1"] = X["combo"].str.split("_",expand=True)[0].astype('category')
    X["product_2"] = X["combo"].str.split("_",expand=True)[1].astype('category')
    return X
    

In [66]:
new_dataframe = x_seller(X_sell)

new_dataframe.product_1 = new_dataframe.product_1.apply(lambda x: int(x))
new_dataframe.product_2 = new_dataframe.product_2.apply(lambda x: int(x))


In [67]:
new_df_1 = new_dataframe.merge(X_meta, how='left', left_on='product_1', right_on='product_id').rename(columns={'price': 'price_1', 'metadata': 'metadata_1'})

In [68]:
new_df_1.head()

Unnamed: 0,combo,counts,product_1,product_2,product_id,event_type,category_id,category_code,brand,price_1,user_id,user_session,25%,75%,price_category,metadata_1
0,1004870_1004767,3,1004870,1004767,1004870,view,2053013555631882655,electronics smartphone,samsung,286.86,555461801,5e840781-e6f7-496d-a523-f7ad05c585a3,179.21,721.34,medium,electronics smartphone samsung medium
1,1004870_1004767,3,1004870,1004767,1004870,view,2053013555631882655,electronics smartphone,samsung,286.86,555461801,5e840781-e6f7-496d-a523-f7ad05c585a3,179.21,721.34,medium,electronics smartphone samsung medium
2,1004870_1004767,3,1004870,1004767,1004870,view,2053013555631882655,electronics smartphone,samsung,286.86,544148702,bc59e177-17c3-4495-bd40-23c2d3366e86,179.21,721.34,medium,electronics smartphone samsung medium
3,1004870_1004767,3,1004870,1004767,1004870,view,2053013555631882655,electronics smartphone,samsung,286.86,549849367,a2d711f9-812d-40cf-bb15-761a9fd0e1b2,179.21,721.34,medium,electronics smartphone samsung medium
4,1004870_1004767,3,1004870,1004767,1004870,view,2053013555631882655,electronics smartphone,samsung,286.86,526595771,7448d4c3-9cbb-4111-b953-5224419069ba,179.21,721.34,medium,electronics smartphone samsung medium


In [69]:
new_df_1 = new_df_1.drop_duplicates(['product_1','product_2'])
new_df_1.shape

(176, 16)

In [70]:
new_df_1.head()

Unnamed: 0,combo,counts,product_1,product_2,product_id,event_type,category_id,category_code,brand,price_1,user_id,user_session,25%,75%,price_category,metadata_1
0,1004870_1004767,3,1004870,1004767,1004870,view,2053013555631882655,electronics smartphone,samsung,286.86,555461801,5e840781-e6f7-496d-a523-f7ad05c585a3,179.21,721.34,medium,electronics smartphone samsung medium
11122,1004767_1004750,2,1004767,1004750,1004767,view,2053013555631882655,electronics smartphone,samsung,254.82,512730342,4a34d086-4940-45e1-b70b-b04b1b8e96aa,179.21,721.34,medium,electronics smartphone samsung medium
33195,1004249_1003306,2,1004249,1003306,1004249,view,2053013555631882655,electronics smartphone,apple,739.81,516298680,350a08d5-db6a-4d66-a7a8-284a8520754e,179.21,721.34,high,electronics smartphone apple high
44580,1004767_1004833,2,1004767,1004833,1004767,view,2053013555631882655,electronics smartphone,samsung,254.82,512730342,4a34d086-4940-45e1-b70b-b04b1b8e96aa,179.21,721.34,medium,electronics smartphone samsung medium
66653,1004833_1004856,2,1004833,1004856,1004833,view,2053013555631882655,electronics smartphone,samsung,174.76,541414887,fcbaf678-f89e-4e2a-a5c4-b1c307640d31,179.21,721.34,low,electronics smartphone samsung low


In [71]:
df_short = new_df_1[['combo', 'counts', 'product_1', 'product_2', 'price_1', 'metadata_1']]
df_short.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,metadata_1
0,1004870_1004767,3,1004870,1004767,286.86,electronics smartphone samsung medium
11122,1004767_1004750,2,1004767,1004750,254.82,electronics smartphone samsung medium
33195,1004249_1003306,2,1004249,1003306,739.81,electronics smartphone apple high
44580,1004767_1004833,2,1004767,1004833,254.82,electronics smartphone samsung medium
66653,1004833_1004856,2,1004833,1004856,174.76,electronics smartphone samsung low


In [72]:
df_final = df_short.merge(X_meta, how='left', left_on='product_2', right_on='product_id').rename(columns={'price': 'price_2', 'metadata': 'metadata_2'}).drop_duplicates(['product_1','product_2'])

In [73]:
df_final

Unnamed: 0,combo,counts,product_1,product_2,price_1,metadata_1,product_id,event_type,category_id,category_code,brand,price_2,user_id,user_session,25%,75%,price_category,metadata_2
0,1004870_1004767,3,1004870,1004767,286.86,electronics smartphone samsung medium,1004767,view,2053013555631882655,electronics smartphone,samsung,254.82,512730342,4a34d086-4940-45e1-b70b-b04b1b8e96aa,179.2100,721.34,medium,electronics smartphone samsung medium
22073,1004767_1004750,2,1004767,1004750,254.82,electronics smartphone samsung medium,1004750,view,2053013555631882655,electronics smartphone,samsung,197.43,534079025,1f9ea674-626f-4ed8-a3e1-10afbf4f843d,179.2100,721.34,medium,electronics smartphone samsung medium
28218,1004249_1003306,2,1004249,1003306,739.81,electronics smartphone apple high,1003306,view,2053013555631882655,electronics smartphone,apple,588.77,513781008,ab1c76e1-94e1-47a2-9084-04e8be98e747,179.2100,721.34,medium,electronics smartphone apple medium
34074,1004767_1004833,2,1004767,1004833,254.82,electronics smartphone samsung medium,1004833,view,2053013555631882655,electronics smartphone,samsung,174.76,541414887,fcbaf678-f89e-4e2a-a5c4-b1c307640d31,179.2100,721.34,low,electronics smartphone samsung low
46068,1004833_1004856,2,1004833,1004856,174.76,electronics smartphone samsung low,1004856,view,2053013555631882655,electronics smartphone,samsung,130.76,552783882,0ac92940-754e-4cee-baaf-e6a1701a7062,179.2100,721.34,low,electronics smartphone samsung low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038203,3801108_1004863,1,3801108,1004863,386.08,appliances iron braun high,1004863,view,2053013555631882655,electronics smartphone,samsung,174.76,546953010,52438bd6-8549-4429-9819-2690bb891e57,179.2100,721.34,low,electronics smartphone samsung low
1039905,1004777_16200168,1,1004777,16200168,136.40,electronics smartphone xiaomi low,16200168,view,2053013556344914381,kids fmcg diapers,huggies,2.63,515535611,52ffe7a7-8976-4230-b480-751114c03e7d,9.0025,18.47,low,kids fmcg diapers huggies low
1039909,1802038_1801837,1,1802038,1801837,195.37,electronics video tv tcl low,1801837,view,2053013554415534427,electronics video tv,rebus,179.92,517672882,e96c4aa7-06fd-415c-a5d1-3087e5a95c0a,220.0100,496.48,low,electronics video tv rebus low
1039988,3200549_2900958,1,3200549,2900958,25.71,appliances kitchen meat_grinder dauscher low,2900958,view,2053013554776244595,appliances kitchen microwave,arg,56.60,555775937,bb8c29c8-c507-90a5-bc5f-4bae60d282a3,56.6000,115.58,medium,appliances kitchen microwave arg medium


# Final recommendation

In [74]:
df_final = df_final[['combo', 'counts', 'product_1', 'product_2', 'price_1', 'price_2', 'metadata_1', 'metadata_2']]

In [75]:
df_final

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2
0,1004870_1004767,3,1004870,1004767,286.86,254.82,electronics smartphone samsung medium,electronics smartphone samsung medium
22073,1004767_1004750,2,1004767,1004750,254.82,197.43,electronics smartphone samsung medium,electronics smartphone samsung medium
28218,1004249_1003306,2,1004249,1003306,739.81,588.77,electronics smartphone apple high,electronics smartphone apple medium
34074,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low
46068,1004833_1004856,2,1004833,1004856,174.76,130.76,electronics smartphone samsung low,electronics smartphone samsung low
...,...,...,...,...,...,...,...,...
1038203,3801108_1004863,1,3801108,1004863,386.08,174.76,appliances iron braun high,electronics smartphone samsung low
1039905,1004777_16200168,1,1004777,16200168,136.40,2.63,electronics smartphone xiaomi low,kids fmcg diapers huggies low
1039909,1802038_1801837,1,1802038,1801837,195.37,179.92,electronics video tv tcl low,electronics video tv rebus low
1039988,3200549_2900958,1,3200549,2900958,25.71,56.60,appliances kitchen meat_grinder dauscher low,appliances kitchen microwave arg medium


In [76]:
df_final = df_final[df_final['metadata_1'] != df_final['metadata_2']]

In [77]:
df_final.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2
28218,1004249_1003306,2,1004249,1003306,739.81,588.77,electronics smartphone apple high,electronics smartphone apple medium
34074,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low
71264,1004767_1004856,2,1004767,1004856,254.82,130.76,electronics smartphone samsung medium,electronics smartphone samsung low
96460,4804056_1005159,2,4804056,1005159,161.98,231.41,electronics audio headphone apple high,electronics smartphone xiaomi medium
99504,1004767_1005100,2,1004767,1005100,254.82,154.42,electronics smartphone samsung medium,electronics smartphone samsung low


In [78]:
df_final.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2
28218,1004249_1003306,2,1004249,1003306,739.81,588.77,electronics smartphone apple high,electronics smartphone apple medium
34074,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low
71264,1004767_1004856,2,1004767,1004856,254.82,130.76,electronics smartphone samsung medium,electronics smartphone samsung low
96460,4804056_1005159,2,4804056,1005159,161.98,231.41,electronics audio headphone apple high,electronics smartphone xiaomi medium
99504,1004767_1005100,2,1004767,1005100,254.82,154.42,electronics smartphone samsung medium,electronics smartphone samsung low


In [79]:
df_final["1.1"] = df_final["metadata_1"].str.split(" ",expand=True)[0].astype('category')
df_final["1.2"] = df_final["metadata_1"].str.split(" ",expand=True)[1].astype('category')
df_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["1.1"] = df_final["metadata_1"].str.split(" ",expand=True)[0].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["1.2"] = df_final["metadata_1"].str.split(" ",expand=True)[1].astype('category')


Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2,1.1,1.2
28218,1004249_1003306,2,1004249,1003306,739.81,588.77,electronics smartphone apple high,electronics smartphone apple medium,electronics,smartphone
34074,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone
71264,1004767_1004856,2,1004767,1004856,254.82,130.76,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone
96460,4804056_1005159,2,4804056,1005159,161.98,231.41,electronics audio headphone apple high,electronics smartphone xiaomi medium,electronics,audio
99504,1004767_1005100,2,1004767,1005100,254.82,154.42,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone


In [80]:
df_final["2.1"] = df_final["metadata_2"].str.split(" ",expand=True)[0].astype('category')
df_final["2.2"] = df_final["metadata_2"].str.split(" ",expand=True)[1].astype('category')
df_final.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["2.1"] = df_final["metadata_2"].str.split(" ",expand=True)[0].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["2.2"] = df_final["metadata_2"].str.split(" ",expand=True)[1].astype('category')


(125, 12)

In [81]:
df_final['test_1'] = df_final[['1.1', '1.2']].apply(lambda x: ' '.join(x), axis = 1)
df_final['test_2'] = df_final[['2.1', '2.2']].apply(lambda x: ' '.join(x), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['test_1'] = df_final[['1.1', '1.2']].apply(lambda x: ' '.join(x), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['test_2'] = df_final[['2.1', '2.2']].apply(lambda x: ' '.join(x), axis = 1)


In [82]:
df_final.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2,1.1,1.2,2.1,2.2,test_1,test_2
28218,1004249_1003306,2,1004249,1003306,739.81,588.77,electronics smartphone apple high,electronics smartphone apple medium,electronics,smartphone,electronics,smartphone,electronics smartphone,electronics smartphone
34074,1004767_1004833,2,1004767,1004833,254.82,174.76,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone,electronics,smartphone,electronics smartphone,electronics smartphone
71264,1004767_1004856,2,1004767,1004856,254.82,130.76,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone,electronics,smartphone,electronics smartphone,electronics smartphone
96460,4804056_1005159,2,4804056,1005159,161.98,231.41,electronics audio headphone apple high,electronics smartphone xiaomi medium,electronics,audio,electronics,smartphone,electronics audio,electronics smartphone
99504,1004767_1005100,2,1004767,1005100,254.82,154.42,electronics smartphone samsung medium,electronics smartphone samsung low,electronics,smartphone,electronics,smartphone,electronics smartphone,electronics smartphone


In [83]:
df_final_2 = df_final[df_final['test_1'] != df_final['test_2']]

In [91]:
df_final_2.head()

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2,1.1,1.2,2.1,2.2,test_1,test_2
96460,4804056_1005159,2,4804056,1005159,161.98,231.41,electronics audio headphone apple high,electronics smartphone xiaomi medium,electronics,audio,electronics,smartphone,electronics audio,electronics smartphone
105599,4804056_1004249,1,4804056,1004249,161.98,739.81,electronics audio headphone apple high,electronics smartphone apple high,electronics,audio,electronics,smartphone,electronics audio,electronics smartphone
116984,3600231_5100610,1,3600231,5100610,100.38,360.34,appliances kitchen washer midea low,electronics clocks apple medium,appliances,kitchen,electronics,clocks,appliances kitchen,electronics clocks
127443,2701735_1307406,1,2701735,1307406,1297.27,553.39,appliances kitchen refrigerators lg high,computers notebook lenovo medium,appliances,kitchen,computers,notebook,appliances kitchen,computers notebook
135330,3701104_1004238,1,3701104,1004238,175.01,1205.23,appliances environment vacuum lg medium,electronics smartphone apple high,appliances,environment,electronics,smartphone,appliances environment,electronics smartphone


In [85]:
def X_seller(X, product_id, n=5):
    X.sort_values(by='counts')
    X = X.loc[X['product_1']==product_id]
    X = X[['combo', 'counts', 'product_1', 'product_2', 'price_1', 'price_2', 'metadata_1', 'metadata_2']]
    return X

In [93]:
X_seller(df_final_2, 2701735)

Unnamed: 0,combo,counts,product_1,product_2,price_1,price_2,metadata_1,metadata_2
127443,2701735_1307406,1,2701735,1307406,1297.27,553.39,appliances kitchen refrigerators lg high,computers notebook lenovo medium


In [96]:
df_final_2.to_csv('x_seller.csv')

In [95]:
!pwd

/home/cjergen/code/sailormoonvicky/eCommerce/notebooks
