## Data preparation for the Streamlit app

In [1]:
import numpy as np
import pandas as pd
import altair as alt

In [2]:
# Select Altair's 'default' renderer so charts display correctly in the notebook.
alt.renderers.enable('default')
# Switch the data transformer to 'json': chart data goes through intermediate
# JSON files, which speeds things up.
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [3]:
DATA_DIR = '/home/shiv/Documents/DataScience/Capstone/Data/' # change the path to the output csv files
# The user whose recommendations are loaded; every model writes its
# predictions for that user to a file named output_<USER_ID>.csv.
USER_ID = 8009
OUTPUT_FILE = f'output_{USER_ID}.csv'

# One prediction table per model (same files as before, now derived from USER_ID).
df_lightgbm = pd.read_csv(DATA_DIR + 'lightgbm/' + OUTPUT_FILE)
df_widedeep = pd.read_csv(DATA_DIR + 'wide_deep/Electronics/' + OUTPUT_FILE)
df_xdeepfm = pd.read_csv(DATA_DIR + 'xdeepfm/' + OUTPUT_FILE)
df_slirec = pd.read_csv(DATA_DIR + 'slirec/Electronics/' + OUTPUT_FILE)
df_sasrec = pd.read_csv(DATA_DIR + 'sasrec/' + OUTPUT_FILE)

### Sequential models: SLi-Rec, SASRec

In [4]:
# binary classification models: SLi-Rec, SASRec
# Compare (rows, columns) of the two sequential-model outputs side by side.
df_slirec.shape, df_sasrec.shape

((63417, 6), (63417, 6))

In [5]:
# Peek at the first rows of the SLi-Rec output.
df_slirec.head(4)

Unnamed: 0,itemID,category_x,score,asin,title,category_y
0,B00SNCLGL4,Computers,0.99819,B00SNCLGL4,ICY DOCK Dual 2.5 SSD 1 x 3.5 HDD Device Bay t...,Electronics|Computers & Accessories|Computer A...
1,B00LV8YZLK,Computers,0.99784,B00LV8YZLK,Ubiquiti Unifi Security Gateway (USG),Electronics|Computers & Accessories|Networking...
2,B01EO5A4TI,Computers,0.997828,B01EO5A4TI,TP-Link AV1200 Powerline Ethernet Adapter - Gi...,Electronics|Computers & Accessories|Networking...
3,B00J42AFUA,Computers,0.996984,B00J42AFUA,OWC In-Line Digital Thermal Sensor HDD Upgrade...,Electronics|Computers & Accessories|Data Stora...


In [6]:
# Peek at the first rows of the SASRec output.
df_sasrec.head(4) # note we have logits here as score!

Unnamed: 0,itemID,score,asin,title,main_cat,category
0,22576,4.999486,B00L0YLRUW,NETGEAR N300 WiFi Range Extender (EX2700),Computers,Electronics|Computers & Accessories|Networking...
1,40901,4.904821,B010OYASRG,OontZ Angle 3 Enhanced Stereo Edition IPX5 Spl...,Cell Phones & Accessories,Electronics|Portable Audio & Video|Portable Sp...
2,63295,4.463625,B00HFJWKWK,AmazonBasics 6-Sheet Cross-Cut Paper and Credi...,All Electronics,Office Products|Office Electronics|Other Offic...
3,52392,4.434051,B005NF5NTK,"Anker PowerCore+ Mini, 3350mAh Lipstick-Sized ...",Cell Phones & Accessories,Accessories|Batteries & Battery Packs|Portable...


In [7]:
# Sanity check: both sequential models should have scored the same ASINs.
sas_ids = set(df_sasrec['asin'])
sli_ids = set(df_slirec['asin'])
# Both differences are expected to be empty.
sas_ids.difference(sli_ids), sli_ids.difference(sas_ids)

(set(), set())

#### Average the results of the two models

- Note that SASRec outputs logits, so we first pass them through a sigmoid to put them on the same [0, 1] scale as the SLi-Rec scores before averaging

In [8]:
# Join the two sequential models on ASIN; overlapping columns get the
# _x (SLi-Rec) / _y (SASRec) suffixes.
merged_seq = df_slirec.merge(df_sasrec, on='asin')
# SASRec scores are logits: squash them with a sigmoid before averaging.
sasrec_prob = 1/(1 + np.exp(-merged_seq['score_y'].values))
df_seq = (
    merged_seq[['category_x', 'title_x', 'category']]
    .rename(columns={'category_x': 'main category', 'title_x': 'title'})
    # Ensemble score = mean of the SLi-Rec score and the SASRec probability.
    .assign(score=(merged_seq['score_x'] + sasrec_prob)/2.)
    .sort_values(['score'], ascending=False)
)
df_seq.head(10)

Unnamed: 0,main category,title,category,score
1390,Computers,Sabrent 4-Port USB 3.0 Hub with Individual LED...,Electronics|Computers & Accessories|Networking...,0.975179
678,Computers,SanDisk Ultra CZ48 32GB USB 3.0 Flash Drive Tr...,Electronics|Computers & Accessories|Data Stora...,0.972433
652,Home Audio & Theater,"AmazonBasics High-Speed HDMI Cable, 10 Feet, 2...",Electronics|Accessories & Supplies|Audio & Vid...,0.972169
735,Home Audio & Theater,AmazonBasics HL-007350 HDMI to DVI Output Ada...,Electronics|Computers & Accessories|Computer A...,0.970648
1154,Computers,Anker 4-Port USB 3.0 Ultra Slim Data Hub for M...,Electronics|Computers & Accessories|Networking...,0.970051
161,Computers,"WD Blue 6TB PC Hard Drive - 5400 RPM Class, SA...",Electronics|Computers & Accessories|Data Stora...,0.969016
1071,Computers,LG Electronics 8X USB 2.0 Super Multi Ultra Sl...,Electronics|Computers & Accessories|Computer C...,0.966508
1465,Computers,Samsung 32GB BAR (METAL) USB 3.0 Flash Drive (...,Electronics|Computers & Accessories|Data Stora...,0.966128
2466,Computers,AmazonBasics AC Powered Computer Speakers,Electronics|Computers & Accessories|Computer A...,0.963522
682,Computers,Seagate Expansion 1TB Portable External Hard D...,Electronics|Computers & Accessories|Data Stora...,0.962074


In [9]:
# Replace missing category strings with '' so that the later
# `.split('|')` works on every row. Assign the result back instead of calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place
# form emits a warning in pandas 2.x and does not modify `df_seq` at all once
# Copy-on-Write is enabled.
df_seq['category'] = df_seq['category'].fillna('')

### Regression models: LightGBM, Wide & Deep, xDeepFM

In [10]:
# regression based models
# Compare (rows, columns) of the three regression-model outputs.
df_lightgbm.shape, df_widedeep.shape, df_xdeepfm.shape # some missing items

((63478, 4), (63447, 4), (63478, 4))

In [11]:
# Peek at the first rows of the LightGBM output.
df_lightgbm.head(4)

Unnamed: 0,itemID,prediction,title,genre
0,45050,5.316779,Amazon Kindle 2 (2nd Generation) USB Car Charg...,Amazon Devices|Cell Phones & Accessories|Acces...
1,45055,5.294511,Zeikos ZE-SG26R 3 Pieces Ultra Clear Deluxe Sc...,Camera & Photo|Cell Phones & Accessories|Acces...
2,50146,5.285888,ANSMANN Individual cell battery charger Energy...,Home Audio & Theater|Cell Phones & Accessories...
3,44737,5.276212,"CAVN 2-Pack Compatible Fitbit Surge Charger, R...",Cell Phones & Accessories|Sports & Outdoors|Sp...


In [12]:
# Peek at the first rows of the Wide & Deep output.
df_widedeep.head(4)

Unnamed: 0,itemID,genre,prediction,title
0,40268,Camera & Photo|Electronics|Computers & Accesso...,4.932795,WT-AF-5v10w 802.3af PoE Splitter with 5 Volts ...
1,56373,Cell Phones & Accessories,4.904454,APPLE IPHONE 4 & 4S AT&T Verizon Sprint FAIRY ...
2,6231,Camera & Photo|Electronics,4.89804,Pelican 1400 Case With Foam (Black)
3,56842,Cell Phones & Accessories|Accessories,4.888552,Galaxy S9 Plus Screen Protector Loopilops [9H ...


In [13]:
# Peek at the first rows of the xDeepFM output.
df_xdeepfm.head(4)

Unnamed: 0,prediction,itemID,title,genre
0,4.844718,59970,"Scott 75143 Scott Shop Towels, Blue (3 Rolls, ...",Industrial & Scientific|Automotive
1,4.839595,58057,", ( ) - Masha and the Bear, moving arms an...",Cell Phones & Accessories
2,4.837111,59905,FW1 Cleaner With Carnauba Wax by RGS Labs (17....,Industrial & Scientific|Automotive
3,4.824506,41412,DROK Reusable 30 Pcs Adhesive Fastening Cable ...,Industrial & Scientific|Electronics|Accessorie...


In [14]:
# Sanity check: LightGBM and xDeepFM should have scored the same item ids.
lgb_ids = set(df_lightgbm['itemID'])
xd_ids = set(df_xdeepfm['itemID'])
# Both differences are expected to be empty.
lgb_ids.difference(xd_ids), xd_ids.difference(lgb_ids)

(set(), set())

In [15]:
# Items scored by LightGBM but absent from the Wide & Deep output.
wd_ids = set(df_widedeep['itemID'])
missing_ids = lgb_ids.difference(wd_ids)
# A few items are missing; for now they are dropped from the other models too.
len(missing_ids)

31

In [16]:
# Drop the items Wide & Deep did not score; print the shape before and after.
print(df_lightgbm.shape)
keep_lgb = ~df_lightgbm['itemID'].isin(missing_ids)
df_lightgbm = df_lightgbm.loc[keep_lgb]
print(df_lightgbm.shape)

(63478, 4)
(63447, 4)


In [17]:
# Drop the items Wide & Deep did not score; print the shape before and after.
print(df_xdeepfm.shape)
keep_xd = ~df_xdeepfm['itemID'].isin(missing_ids)
df_xdeepfm = df_xdeepfm.loc[keep_xd]
print(df_xdeepfm.shape)

(63478, 4)
(63447, 4)


#### Average the results of the three models

In [18]:
# Step 1: LightGBM + Wide & Deep. Title and genre are taken from the LightGBM
# side (the _x suffix).
lgb_wd = (
    df_lightgbm.merge(df_widedeep, on='itemID')
    [['itemID', 'prediction_x', 'title_x', 'genre_x', 'prediction_y']]
    .rename(columns={
        'prediction_x': 'pred_lgb',
        'title_x': 'title',
        'genre_x': 'genre',
        'prediction_y': 'pred_wd',
    })
)
# Step 2: add the xDeepFM prediction, again keeping the left-hand title/genre.
all_preds = (
    lgb_wd.merge(df_xdeepfm, on='itemID')
    [['itemID', 'pred_lgb', 'title_x', 'genre_x', 'pred_wd', 'prediction']]
    .rename(columns={'title_x': 'title', 'genre_x': 'genre', 'prediction': 'pred_xd'})
)
# Step 3: ensemble score = plain mean of the three predictions; keep only
# title / genre / score, ranked best first.
df_reg = (
    all_preds
    .assign(score=(all_preds['pred_lgb'] + all_preds['pred_wd'] + all_preds['pred_xd'])/3.)
    .drop(columns=['itemID', 'pred_lgb', 'pred_wd', 'pred_xd'])
    .sort_values('score', ascending=False)
    .reset_index(drop=True)
)
df_reg.head(10)

Unnamed: 0,title,genre,score
0,Piero Lorenzo for Apple Watch 44mm Screen Prot...,Cell Phones & Accessories|Accessories,4.813503
1,ThruNite MCC-2 Universal Charger for Charging ...,Cell Phones & Accessories|Accessories,4.779016
2,"Samsung Galaxy S5 Glass Screen Protector, Tech...",Cell Phones & Accessories|Accessories,4.766703
3,"5C Case, iPhone 5C Case, MagicSky [Shock Absor...",Cell Phones & Accessories,4.766676
4,"TUSITA Fitbit One Charging Cable, Replacement ...",Cell Phones & Accessories|Sports & Outdoors|Sp...,4.764348
5,Getwow 10-Pack Silicon Fastener Ring for Fitbi...,Cell Phones & Accessories|Sports & Outdoors|Sp...,4.760903
6,Nillkin Premium Matte Hard Cover Case + Guard ...,Cell Phones & Accessories,4.756411
7,Quad Lock Case for iPhone 5 / 5s / SE,Cell Phones & Accessories,4.746475
8,Speck Products CandyShell Grip Case for iPhone...,Cell Phones & Accessories,4.743813
9,Fire HD 6 Case - Poetic Fire HD 6 Case [Turtle...,All Electronics|Cell Phones & Accessories,4.742936


## Altair Charts for the dashboard

In [19]:
# Every distinct genre label that appears in the '|'-separated `genre` strings.
genres = {
    label
    for genre_str in df_reg['genre']
    for label in genre_str.split('|')
}
print(len(genres))
print(sorted(genres))

36
['Accessories', 'Accessories & Supplies', 'All Electronics', 'Amazon Devices', 'Apple Products', 'Audio & Video Accessories', 'Automotive', 'Camera & Photo', 'Car & Vehicle Electronics', 'Car Electronics', 'Cell Phones & Accessories', 'Clothing, Shoes & Jewelry', 'Computer Accessories & Peripherals', 'Computer Components', 'Computers', 'Computers & Accessories', 'Controllers', 'Electrical', 'Electronics', 'GPS & Navigation', 'Home & Kitchen', 'Home Audio', 'Home Audio & Theater', 'Industrial & Scientific', 'Laptop Accessories', 'Musical Instruments', 'Office & School Supplies', 'Office Electronics', 'Office Products', 'Portable Audio & Accessories', 'Portable Audio & Video', 'Sports & Fitness', 'Sports & Outdoors', 'Tools & Home Improvement', 'Toys & Games', 'Video Games']


In [20]:
# Add one 0-initialised indicator column per genre to both result tables.
# Iterate in sorted order: the iteration order of a set of strings depends on
# hash randomization and changes between kernel restarts, which made the
# column order of the exported CSV files non-reproducible.
for genre in sorted(genres):
    df_reg[genre] = 0
    df_seq[genre] = 0

In [21]:
def apply_cat_reg(row):
    """Set the indicator entry to 1 for every genre listed in ``row['genre']``.

    Parameters
    ----------
    row : pandas.Series
        One row of the regression table; ``row['genre']`` is a '|'-separated
        string of genre labels.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``row[label] = 1`` for each label in the genre
        string. The input row is left untouched.
    """
    # Work on a copy: pandas does not support UDFs that mutate the object
    # handed to them by ``DataFrame.apply``.
    row = row.copy()
    for genre in row['genre'].split('|'):
        row[genre] = 1
    return row
# One-hot encode the genres row by row, discard the raw genre string and
# export the table.
df_reg = df_reg.apply(apply_cat_reg, axis=1).drop(columns=['genre'])
df_reg.to_csv('./data/df_reg.csv', index=False)

In [22]:
def apply_cat_seq(row):
    """Set the indicator entries for a row of the sequential-model table.

    The entry named by ``row['main category']`` is always set to 1. Each label
    in the '|'-separated ``row['category']`` string is set to 1 only when it
    is one of the row's indicator columns, i.e. any label after the first four
    entries of the row.

    Parameters
    ----------
    row : pandas.Series
        One row whose first four entries are the non-indicator columns,
        followed by the genre indicator columns.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with the indicators set. The input row is left
        untouched.
    """
    # Work on a copy: pandas does not support UDFs that mutate the object
    # handed to them by ``DataFrame.apply``.
    row = row.copy()
    # Read the indicator labels from the row itself rather than from the
    # notebook-level `df_seq`, so the function has no hidden dependency on
    # global state. Captured before any assignment below, like the original.
    indicator_cols = set(row.index[4:])
    # NOTE(review): a main category that is not an existing indicator column
    # adds a new entry to the row (unchanged from the original behaviour).
    row[row['main category']] = 1
    for genre in row['category'].split('|'):
        if genre in indicator_cols:
            row[genre] = 1
    return row
# One-hot encode the categories row by row, discard the raw category columns,
# renumber the rows and export the table.
df_seq = (
    df_seq.apply(apply_cat_seq, axis=1)
    .drop(columns=['category', 'main category'])
    .reset_index(drop=True)
)
df_seq.to_csv('./data/df_seq.csv', index=False)

In [23]:
# Example category selection used by the filtering and chart cells below.
sel_cat = ['Amazon Devices', 'Apple Products']

In [24]:
# Boolean filter expressions (as strings): a row matches when at least one of
# the selected categories is flagged.
query_seq = ' | '.join(f'(df_seq["{c}"] == 1)' for c in sel_cat)
query_reg = ' | '.join(f'(df_reg["{c}"] == 1)' for c in sel_cat)
print(query_reg)
print(query_seq)

(df_reg["Amazon Devices"] == 1) | (df_reg["Apple Products"] == 1)
(df_seq["Amazon Devices"] == 1) | (df_seq["Apple Products"] == 1)


In [25]:
# Keep the rows flagged for at least one selected category, then take the
# first 10 (both tables are already sorted by score, best first).
# The masks are built with vectorised pandas operations instead of eval()-ing
# a string of generated Python code; the selected rows are the same.
in_sel_reg = df_reg[sel_cat].eq(1).any(axis=1)
sel_df_reg = (
    df_reg.loc[in_sel_reg, ["title", "score"] + sel_cat]
    .iloc[:10]
    .reset_index(drop=True)
)
in_sel_seq = df_seq[sel_cat].eq(1).any(axis=1)
sel_df_seq = (
    df_seq.loc[in_sel_seq, ["title", "score"] + sel_cat]
    .iloc[:10]
    .reset_index(drop=True)
)

In [26]:
# Turn the 0-based row number into a 1-based 'index' (rank) column.
sel_df_reg = sel_df_reg.reset_index()
sel_df_reg['index'] += 1

In [27]:
# Turn the 0-based row number into a 1-based 'index' (rank) column.
sel_df_seq = sel_df_seq.reset_index()
sel_df_seq['index'] += 1

In [28]:
# Long format for plotting: one row per (item, selected category) pair, with
# the 0/1 indicator in the default `value` column.
melt_kwargs = dict(id_vars=['index','title'], value_vars=sel_cat, var_name='category')
df_output_seq = sel_df_seq.melt(**melt_kwargs)
df_output_reg = sel_df_reg.melt(**melt_kwargs)

In [29]:
# Bubble grid for the sequential models: one row per recommended title, one
# column per selected category; circle size is driven by the 0/1 indicator in
# `value`.
circles_seq = alt.Chart(df_output_seq).mark_circle().encode(
    x=alt.X('category:N',axis=alt.Axis(title=None, labelColor='black')),
    # sort=None disables Altair's default sorting so titles keep the data (rank) order
    y=alt.Y('title:O', sort=None, axis=alt.Axis(title=None, grid=False)),#, axis=alt.Axis(title=None, grid=False, labels=False)),
    size=alt.Size('value:Q', legend=None),
    color='category:N'
).properties(
    width=50,
    height=300,
    title='Sequential Models')
# Last expression of the cell: renders the chart.
circles_seq

In [30]:
# Bubble grid for the regression models: one row per recommended title, one
# column per selected category; circle size is driven by the 0/1 indicator in
# `value`.
circles_reg = alt.Chart(df_output_reg).mark_circle().encode(
    x=alt.X('category:N',axis=alt.Axis(title=None, labelColor='black')),
    # sort=None disables Altair's default sorting so titles keep the data (rank) order
    y=alt.Y('title:O', sort=None, axis=alt.Axis(title=None, grid=False)),
    size=alt.Size('value:Q', legend=None),
    color=alt.Color('category:N')#, legend=None)
).properties(
    width=50,
    height=300,
    title='Regression Models')

# Last expression of the cell: renders the chart.
circles_reg

In [31]:
# Narrow text-only chart showing the 1-based rank numbers (the `index` column);
# it is placed between the two bubble grids in the combined chart.
middle = alt.Chart(sel_df_seq).encode(
    y=alt.Y('index:O', axis=None),
    text=alt.Text('index:Q')
).mark_text().properties(
    width=20,
    height=300
)

In [32]:
# Lay the three charts out side by side: regression | rank numbers | sequential.
(circles_reg | middle | circles_seq)

## Tables for the sidebar in the dashboard

In [33]:
# Headline dataset statistics, one labelled row per statistic and a single
# (still unnamed) column 0.
dict_ds = {'# reviews': 5_613_183, '# users': 830_668, '# categories': 36}
df = pd.Series(dict_ds).to_frame()

In [34]:
# Name the single column and display the statistics table.
df.columns = ['count']
df

Unnamed: 0,count
# reviews,5613183
# users,830668
# categories,36


In [35]:
# Ranking metrics per model, written as one record per model:
# (type, name, ndcg@10, hit@10).
metric_columns = ['type', 'name', 'ndcg@10', 'hit@10']
metric_rows = [
    ('Collaborative', 'SLi-Rec', 0.404, 0.6654),
    ('Collaborative', 'SASRec', 0.392, 0.628),
    ('Content-based', 'LightGBM', 0.0725, 0.1631),
    ('Hybrid', 'Wide & Deep', 0.1256, 0.2781),
    ('Hybrid', 'xDeepFM', 0.1881, 0.3497),
]
# Pivot the records into the column -> list-of-values mapping.
dict_ds = {
    column: list(values)
    for column, values in zip(metric_columns, zip(*metric_rows))
}
df = pd.DataFrame(dict_ds)

In [36]:
# Display the metrics table.
df

Unnamed: 0,type,name,ndcg@10,hit@10
0,Collaborative,SLi-Rec,0.404,0.6654
1,Collaborative,SASRec,0.392,0.628
2,Content-based,LightGBM,0.0725,0.1631
3,Hybrid,Wide & Deep,0.1256,0.2781
4,Hybrid,xDeepFM,0.1881,0.3497


In [37]:
# Use (type, name) as a two-level row index for display.
df = df.set_index(['type', 'name'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ndcg@10,hit@10
type,name,Unnamed: 2_level_1,Unnamed: 3_level_1
Collaborative,SLi-Rec,0.404,0.6654
Collaborative,SASRec,0.392,0.628
Content-based,LightGBM,0.0725,0.1631
Hybrid,Wide & Deep,0.1256,0.2781
Hybrid,xDeepFM,0.1881,0.3497


## Data visualizations of the reviews data for the chosen user

In [38]:
# Load the header-less ratings file and label its nine columns.
DATA_DIR = '/home/shiv/Documents/DataScience/Capstone/Data/wide_deep/Electronics/'
ratings_path = DATA_DIR + 'wide_deep_amzn_e_20.csv'
ratings_df = pd.read_csv(ratings_path, header=None, low_memory=False)
ratings_df.columns = [
    'userID',
    'itemID',
    'rating',
    'genre',
    'unixTimeStamp',
    'title',
    'price',
    'main_cat',
    'category',
]

In [39]:
# Reviews written by the chosen user (id 8009), copied so later edits do not
# touch `ratings_df`.
is_chosen_user = ratings_df['userID'] == 8009
user_df = ratings_df.loc[is_chosen_user].copy()
print(user_df.shape)

(324, 9)


In [40]:
# Keep the columns needed for the visualisations, newest review first.
user_df = (
    user_df[['rating', 'genre', 'title', 'main_cat', 'unixTimeStamp']]
    .sort_values('unixTimeStamp', ascending=False)
    .reset_index(drop=True)
)
user_df.head(10)

Unnamed: 0,rating,genre,title,main_cat,unixTimeStamp
0,5.0,Home Audio & Theater|Electronics|Accessories &...,BlueRigger High Speed MicroBlueRigger High Spe...,Home Audio & Theater,1521936000
1,5.0,All Electronics|Electronics|Computers & Access...,Corsair CMSA8GX3M2A1066C7 Apple 8 GB Dual Chan...,All Electronics,1521936000
2,5.0,All Electronics|Electronics|Computers & Access...,D-Link 8 Port 10/100 Unmanaged Metal Desktop S...,All Electronics,1521936000
3,5.0,Computers|Electronics|Computers & Accessories,"New iPad 9.7"" (2018 & 2017) / iPad Pro 9.7 / i...",Computers,1521936000
4,5.0,Computers|Electronics|Computers & Accessories,StarTech.com CABSHELF Black Standard Universal...,Computers,1521936000
5,5.0,All Electronics|Electronics|Accessories & Supp...,ESYNIC DAC Digital to Analog Audio Converter O...,All Electronics,1521936000
6,5.0,All Electronics|Office Products|Office Electro...,HP Laserjet Pro M402dw Wireless Monochrome Pri...,All Electronics,1520899200
7,4.0,All Electronics|Electronics|Accessories & Supp...,VCE 4K x 2K Mini HDMI Male to HDMI Female Conv...,All Electronics,1520899200
8,5.0,Computers|Electronics|Computers & Accessories|...,Timetec Hynix IC 4GB DDR3L 1600MHz PC3L-12800 ...,Computers,1520899200
9,4.0,Home Audio & Theater|Home & Kitchen,VIVO Universal LCD LED Flat Screen TV Table To...,Home Audio & Theater,1504828800


In [41]:
from collections import defaultdict

# Per-genre tallies over the user's reviews: number of reviews and, after the
# second loop, the mean rating rounded to 2 decimals.
genre_count = defaultdict(int)
genre_rating = defaultdict(float)
for rating, genre_str in zip(user_df['rating'], user_df['genre']):
    # Uses `genre_name` rather than rebinding the notebook-level `genres` set
    # from the charts section; the original loop overwrote that set with a
    # list, breaking any later re-run of the cells that depend on it.
    for genre_name in genre_str.split('|'):
        genre_count[genre_name] += 1
        genre_rating[genre_name] += rating

# Convert the rating sums into averages (in place; the key set is unchanged).
for genre_name in genre_rating:
    genre_rating[genre_name] = round(genre_rating[genre_name]/genre_count[genre_name], 2)

# Order genres by (count, name) descending; computed once and shared by both
# dicts so they are guaranteed to have the same key order.
ranked_genres = sorted(genre_count.items(), key=lambda item: (item[1], item[0]), reverse=True)
count_dict = dict(ranked_genres)
rating_dict = {name: genre_rating[name] for name, _ in ranked_genres}

In [54]:
# One row per genre with its review count and average rating.
# Built directly from the two dicts: ratings are looked up by genre name
# rather than paired by position, and there are no intermediate frames with
# duplicate column labels.
df_tmp = pd.DataFrame({
    'category': list(count_dict),
    'count': list(count_dict.values()),
    'avg rating': [rating_dict[name] for name in count_dict],
})

In [55]:
# Export the per-genre statistics (all genres, before any category filtering).
df_tmp.to_csv('./data/df_user.csv', index=False)

In [62]:
# Keep only the genres in `sel_cat`.
# NOTE(review): this overwrites `df_tmp`, so the full table is only available
# from the CSV written in the previous cell.
df_tmp = df_tmp[df_tmp['category'].isin(sel_cat)]

In [68]:
# Horizontal bar chart of review counts per selected category, each bar
# labelled with the category's average rating.
# Upper bound of the x domain: 10 above the largest count (presumably head-room
# for the text labels drawn to the right of the bars).
m = df_tmp['count'].max() + 10
bars = alt.Chart(df_tmp).mark_bar().encode(
    x=alt.X('count:Q', axis=alt.Axis(title='Count', grid=False), scale=alt.Scale(domain=[0, m])),
    # sort='-x' orders the categories by descending count
    y=alt.Y('category:O', sort='-x', axis=alt.Axis(title=None, grid=False)),
    color=alt.Color('category:N')
)

# Text layer derived from `bars`, so it shares the same x/y/color encodings.
text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='avg rating:Q'
)

# (bars + text).properties(height=900)
# Layer the labels on top of the bars.
(bars + text)

In [69]:
# Display the user's review history (pandas abbreviates the rendered table).
user_df

Unnamed: 0,rating,genre,title,main_cat,unixTimeStamp
0,5.0,Home Audio & Theater|Electronics|Accessories &...,BlueRigger High Speed MicroBlueRigger High Spe...,Home Audio & Theater,1521936000
1,5.0,All Electronics|Electronics|Computers & Access...,Corsair CMSA8GX3M2A1066C7 Apple 8 GB Dual Chan...,All Electronics,1521936000
2,5.0,All Electronics|Electronics|Computers & Access...,D-Link 8 Port 10/100 Unmanaged Metal Desktop S...,All Electronics,1521936000
3,5.0,Computers|Electronics|Computers & Accessories,"New iPad 9.7"" (2018 & 2017) / iPad Pro 9.7 / i...",Computers,1521936000
4,5.0,Computers|Electronics|Computers & Accessories,StarTech.com CABSHELF Black Standard Universal...,Computers,1521936000
...,...,...,...,...,...
319,5.0,Apple Products,Apple AirPort Extreme,Apple Products,1435276800
320,5.0,Cell Phones & Accessories|Accessories,MediaDevil Apple iPhone 3G / 3GS Screen Protec...,Cell Phones & Accessories,1435276800
321,5.0,All Electronics|Electronics|Computers & Access...,TP-Link 16-Port Gigabit Ethernet Unmanaged Swi...,All Electronics,1435276800
322,5.0,Computers|Electronics|Computers & Accessories|...,"Rain Design mStand Laptop Stand, Silver (Paten...",Computers,1435276800


In [70]:
# Convert the Unix timestamps (in seconds) into a datetime column.
review_dates = pd.to_datetime(user_df['unixTimeStamp'], unit='s')
user_df = user_df.assign(date=review_dates)
user_df

Unnamed: 0,rating,genre,title,main_cat,unixTimeStamp,date
0,5.0,Home Audio & Theater|Electronics|Accessories &...,BlueRigger High Speed MicroBlueRigger High Spe...,Home Audio & Theater,1521936000,2018-03-25
1,5.0,All Electronics|Electronics|Computers & Access...,Corsair CMSA8GX3M2A1066C7 Apple 8 GB Dual Chan...,All Electronics,1521936000,2018-03-25
2,5.0,All Electronics|Electronics|Computers & Access...,D-Link 8 Port 10/100 Unmanaged Metal Desktop S...,All Electronics,1521936000,2018-03-25
3,5.0,Computers|Electronics|Computers & Accessories,"New iPad 9.7"" (2018 & 2017) / iPad Pro 9.7 / i...",Computers,1521936000,2018-03-25
4,5.0,Computers|Electronics|Computers & Accessories,StarTech.com CABSHELF Black Standard Universal...,Computers,1521936000,2018-03-25
...,...,...,...,...,...,...
319,5.0,Apple Products,Apple AirPort Extreme,Apple Products,1435276800,2015-06-26
320,5.0,Cell Phones & Accessories|Accessories,MediaDevil Apple iPhone 3G / 3GS Screen Protec...,Cell Phones & Accessories,1435276800,2015-06-26
321,5.0,All Electronics|Electronics|Computers & Access...,TP-Link 16-Port Gigabit Ethernet Unmanaged Swi...,All Electronics,1435276800,2015-06-26
322,5.0,Computers|Electronics|Computers & Accessories|...,"Rain Design mStand Laptop Stand, Silver (Paten...",Computers,1435276800,2015-06-26


In [81]:
# Keep only the review date and main category, then export the history.
# NOTE(review): this narrows `user_df` in place of the full table, so the cell
# cannot be re-run without re-running the cells that build `user_df`.
user_df = user_df[['date','main_cat']]
user_df.to_csv('./data/user_hist.csv', index=False)

In [82]:
# Read the exported history back from disk; the chart below is built from
# this reloaded frame.
user_df = pd.read_csv('./data/user_hist.csv')

In [88]:
# Timeline of the user's reviews: one circle per review, positioned by date
# (x) and main category (y), coloured by main category without a legend.
alt.Chart(user_df).mark_circle(
).encode(
    # 'date:T' makes Altair treat the column as temporal; labels use "Mon, YYYY"
    alt.X('date:T', axis=alt.Axis(labelAngle=0, format='%b, %Y')),
    alt.Y('main_cat:N', axis=alt.Axis(title=None, grid=False)),
    alt.Color('main_cat:N', legend=None)
)#.properties(width=900)

In [89]:
from PIL import Image