In [2]:
import csv
import pandas as pd
from sklearn import preprocessing

In [6]:
## Load the complete metadata file
# The parsed JSON is transposed right away, swapping its rows and columns.
meta_df = pd.read_json('meta.json').T

# process metadata to be merged with split data
def filter_prices(x):
    """Keep a price string only when it begins with a dollar sign.

    Args:
        x: The price text to inspect; must be a ``str``.

    Returns:
        ``x`` unchanged when it starts with ``'$'``, otherwise the empty
        string.
    """
    return x if x.startswith('$') else ''

# Discard the metadata columns listed below in a single call.
# A missing column still raises KeyError, as the individual deletions would.
discarded_columns = [
    "tech1",
    "fit",
    "also_buy",
    "tech2",
    "rank",
    "also_view",
    "main_cat",
    "similar_item",
    "imageURL",
    "imageURLHighRes",
    "details",
    "description",
    "feature",
    "title",
    "brand",
    "category",
]
meta_df = meta_df.drop(columns=discarded_columns)

# Cast date to text; cast price to text and blank out any value that does
# not start with '$' (filter_prices returns '' for those).
meta_df['date'] = meta_df['date'].astype(str)
meta_df['price'] = meta_df['price'].astype(str).apply(filter_prices)

# Replace the price strings with integer class codes.
# NOTE(review): LabelEncoder numbers the sorted unique strings, so the codes
# follow text order rather than numeric price order — confirm this is intended.
le = preprocessing.LabelEncoder()
new_price = le.fit_transform(meta_df["price"])
meta_df["price"] = new_price

# Same treatment for the date strings, using a fresh encoder.
le = preprocessing.LabelEncoder()
new_date = le.fit_transform(meta_df["date"])
meta_df["date"] = new_date

# Rename the "asin" column to "productID", the key used by the merges below.
asin_rename = {"asin": "productID"}
meta_df = meta_df.rename(columns=asin_rename)
meta_df.head()

Unnamed: 0,date,price,productID
0,0,0,42000742
1,0,0,78764343
2,0,16,276425316
3,0,0,324411812
4,0,0,439335310


In [5]:
# Load the synthetic and the real review tables from their CSV files.
syn_df = pd.read_csv('synthetic_review_data.csv')
real_df = pd.read_csv('reviews_categorical.csv')
# Overwrite the real table's header with the column names used in this notebook.
real_df.columns = ["userID", "productID", "vote", "rating", "verified"]

In [7]:
# Preview the first five real reviews.
real_df.head()

Unnamed: 0,userID,productID,vote,rating,verified
0,A1HP7NVNPFMA4N,700026657,449,1,1
1,A1JGAP0185YJI6,700026657,449,1,0
2,A1YJWEXHQBWK2B,700026657,449,0,1
3,A2204E1TH211HT,700026657,449,0,1
4,A2RF5B5H74JLPE,700026657,449,1,1


## 100% synthetic

In [20]:
# The synthetic table carries a column literally named "0"; call it "rating".
syn_df = syn_df.rename({"0": "rating"}, axis="columns")
syn_df.head()

Unnamed: 0,userID,productID,vote,verified,rating
0,A8J93B28Z3YDD,B00FLLFJOU,219,1,0
1,AWG2O9C42XW5G,B00W8FYFBA,434,1,1
2,ALXA8BMGZ2HTH,B00DHF39EO,61,1,0
3,A16VB612MQUC8R,B00ZJRHSZO,368,1,0
4,A2DF6L1W8YCCJS,B01GKGVI8U,152,1,1


In [21]:
# Attach metadata to every synthetic review, matching rows on productID.
result1 = meta_df.merge(syn_df, on='productID')

In [22]:
# Preview the first five merged synthetic rows.
result1.head(5)

Unnamed: 0,date,price,productID,userID,vote,verified,rating
0,0,3471,700026657,APDZU24PSMOYE,449,0,1
1,0,3471,700026657,A11V6ZJ2FVQY1D,449,1,1
2,0,3471,700026657,A1KXJ1ELZIU05C,449,0,0
3,0,0,700026398,A2GPRA9HHLOC4B,39,0,1
4,0,0,700099867,A2582KMXLK2P06,322,0,1


In [23]:
# Number of rows produced by the metadata/synthetic merge.
result1.shape[0]

107835

In [24]:
# Draw 100,000 merged rows without replacement (seed 35) and write them to
# CSV without the index column.
synth_100 = result1.sample(100000, random_state=35)
synth_100.to_csv("100_synth_data.csv", index=False)

## 75% real / 25% synthetic

In [25]:
# 75/25 mix: 25,000 synthetic and 75,000 real reviews, each drawn without
# replacement with its own fixed seed.
syn_df1 = syn_df.sample(25000, random_state=35)
real_df1 = real_df.sample(75000, random_state=3)

In [26]:
# First three rows of the real sample.
real_df1.iloc[:3]

Unnamed: 0,userID,productID,vote,rating,verified
411783,AP4LPGVNZAZ8E,B00ZQB28XK,28,1,0
447376,A3TEDIF9P1FUSW,B00002ST36,449,1,1
142,A2F4Q24VGS5U4H,7293000936,449,1,1


In [27]:
# First three rows of the synthetic sample.
syn_df1.iloc[:3]

Unnamed: 0,userID,productID,vote,verified,rating
50158,A2TCG2HV1VJP6V,B003O6E800,127,1,1
60666,A2WCQTBTIKLFPT,B0012N5EQE,368,1,1
41028,A2SERLGL7JIR5F,B003O6EB70,449,0,1


In [28]:
# Stack the real sample on top of the synthetic one; each row keeps the
# index label it had in its source frame.
frames_75_25 = [real_df1, syn_df1]
df_75_25 = pd.concat(frames_75_25)
df_75_25

Unnamed: 0,userID,productID,vote,rating,verified
411783,AP4LPGVNZAZ8E,B00ZQB28XK,28,1,0
447376,A3TEDIF9P1FUSW,B00002ST36,449,1,1
142,A2F4Q24VGS5U4H,7293000936,449,1,1
168250,ANJXPB6WDM85P,B001AZ7RJ6,449,0,0
46994,AWE0R9MSS9988,B00009VE6B,344,1,0
...,...,...,...,...,...
40788,A4M15FDF2LIAI,B00OM6SAK0,400,0,0
60828,ACVKFV8LD7HDH,B000ZKA0J6,28,0,0
94417,AMGGZCWY3VNZC,B00BMFIXT2,449,1,1
27083,A61HUNZS38YMB,B000040OEI,449,1,0


In [29]:
# Attach metadata to the 75/25 mix, matching rows on productID.
result2 = meta_df.merge(df_75_25, on='productID')

In [30]:
# First ten merged rows of the 75/25 mix.
result2.iloc[:10]

Unnamed: 0,date,price,productID,userID,vote,rating,verified
0,0,3471,700026657,A1EO9BFUHTGWKZ,449,1,1
1,0,3471,700026657,A1HP7NVNPFMA4N,449,1,1
2,0,3471,700026657,AV969NA4CBP10,449,1,1
3,0,3471,700026657,A2RF5B5H74JLPE,449,1,1
4,0,0,700026398,A1NQ759X8WPIVV,449,0,1
5,0,0,700026398,A2RGUDIF7VB7JZ,449,1,1
6,0,0,700099867,A361M14PU2GUEG,127,1,1
7,0,0,700099867,A20DRRKAN5Z9Q,449,1,0
8,0,0,700099867,A1BHRNLW2L8KLD,449,0,1
9,0,0,700099867,A1QJJU33VNC4S7,449,1,0


In [32]:
# Number of rows produced by the metadata/75-25 merge.
# NOTE(review): the recorded count exceeds the 100,000 rows that were
# concatenated, which suggests some productID values occur more than once in
# meta_df — confirm whether those duplicate matches are wanted.
result2.shape[0]

113843

In [33]:
# Draw 100,000 merged rows without replacement (seed 35) and write the 75/25
# dataset to CSV without the index column.
real_75 = result2.sample(100000, random_state=35)
real_75.to_csv("75_real_25_synth_data.csv", index=False)

## 50% real / 50% synthetic

In [35]:
# 50/50 mix: 50,000 synthetic + 50,000 real reviews, each sampled without
# replacement with a fixed seed, then joined with the metadata on productID.
syn_df2 = syn_df.sample(50000, random_state=35)
real_df2 = real_df.sample(50000, random_state=43)
df_50_50 = pd.concat([real_df2, syn_df2])
result3 = meta_df.merge(df_50_50, on='productID')
print(result3.shape[0])
# Keep 100,000 of the merged rows and write them out without the index.
synth_50 = result3.sample(100000, random_state=35)
synth_50.to_csv("50_real_50_synth_data.csv", index=False)

113400


In [36]:
# 90/10 mix: 10,000 synthetic + 90,000 real reviews, each sampled without
# replacement with a fixed seed, then joined with the metadata on productID.
syn_df3 = syn_df.sample(10000, random_state=35)
real_df3 = real_df.sample(90000, random_state=43)
df_90_10 = pd.concat([real_df3, syn_df3])
result4 = meta_df.merge(df_90_10, on='productID')
print(result4.shape[0])
# Keep 100,000 of the merged rows and write them out without the index.
synth_10 = result4.sample(100000, random_state=35)
synth_10.to_csv("90_real_10_synth_data.csv", index=False)

114250


In [None]:
# Rebind real_df to a seeded 100,000-row sample of itself (no replacement).
real_df = real_df.sample(100000, random_state=3)

In [None]:
# Join the metadata onto the real reviews, matching rows on productID.
# The original cell referenced `reviews_df`, which is not defined anywhere in
# the visible notebook and would raise NameError; the real-review frame here
# is `real_df`.
# NOTE(review): assumes the real-only dataset was the intent — confirm.
result = pd.merge(meta_df, real_df, on='productID')