In [8]:
import json
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm

import gcpy
import fasttext.util

# Create the titles dataframe

In [2]:
# Read every product-metadata record from the gzipped JSON-lines dump.
# The file is opened in gzip's default binary mode; json.loads accepts the
# resulting bytes lines directly.
with gzip.open('../data/meta_Books.json.gz') as f:
    data = [json.loads(line.strip()) for line in tqdm(f)]

# Pre-Processing: https://colab.research.google.com/drive/1Zv6MARGQcrBbLHyjPVVMZVnRWsRnVMpV#scrollTo=LgWrDtZ94w89
# Convert list into pandas dataframe
df = pd.DataFrame(data)

# Flag unformatted rows, i.e. titles containing the literal text 'getTime'.
# na=False maps missing / non-string titles to False instead of NaN, so the
# boolean inversion below cannot fail on them; missing titles are removed by
# dropna() at the end of the chain. regex=False because the pattern is a plain
# substring, not a regular expression.
is_unformatted = df['title'].str.contains('getTime', regex=False, na=False)

df = (
    df[~is_unformatted]                                        # Filter unformatted rows
    .loc[lambda frame: frame['main_cat'] == 'Books']           # Restrict to just 'Books'
    .reset_index(drop=True)                                    # Reset index
    [['title']]                                                # Only keep the title column
    .replace(r'^\s*$', np.nan, regex=True)                     # Empty / whitespace-only titles -> NaN
    .dropna()                                                  # Drop na
)
# NOTE: the index is reset before blank titles are dropped, so the final index
# can contain gaps; this ordering is kept so downstream output stays unchanged.

# Check the df
print(df.shape)
df.head()

2934949it [01:58, 24714.50it/s]


(2850218, 1)


Unnamed: 0,title
0,Biology Gods Living Creation Third Edition 10 ...
1,Mksap 16 Audio Companion: Medical Knowledge Se...
2,"Flex! Discography of North American Punk, Hard..."
3,Heavenly Highway Hymns: Shaped-Note Hymnal
4,Georgina Goodman Nelson Womens Size 8.5 Purple...


In [3]:
# Persist the cleaned titles locally as CSV. to_csv is called with its default
# settings, so the dataframe index is written as a leading column as well.
csv_path = '../data/amazon_books.csv'
df.to_csv(csv_path)

# Upload to bigquery

In [5]:
# Push the titles dataframe to BigQuery (dataset 'amazon_products', table
# 'books') through the gcpy helper, then report whatever the helper returns.
upload_kwargs = {
    'source_df': df,
    'target_dataset': 'amazon_products',
    'target_tablename': 'books',
}
result_table = gcpy.pd_to_bq(**upload_kwargs)
print(f'Source Dataframe uploaded to BQ table: {result_table}')

2021-07-03 18:31:11.366 INFO:	<google.cloud.bigquery.job.load.LoadJob object at 0x7f26647e70a0>


Source Dataframe uploaded to BQ table: leo-gcp-sanbox.amazon_products.books


# Download the fastText model

In [9]:
# Fetch the pre-trained fastText model for the given language code.
# if_exists='ignore' is passed so an already-present model file is not an error.
# The call is the cell's last expression, so its return value is displayed.
model_lang = 'en'
fasttext.util.download_model(model_lang, if_exists='ignore')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz


'cc.en.300.bin'

# Upload the fastText model to Google Cloud Storage (GCS)

In [10]:
# Copy the downloaded model binary from the working directory to the
# leo-models bucket using the gsutil CLI (IPython shell escape).
# NOTE(review): the destination has no trailing slash; per gsutil's naming rules
# the object lands under fasttext/ only if that prefix already exists as a
# folder, otherwise the object itself is named "fasttext" — confirm the path.
!gsutil cp cc.en.300.bin gs://leo-models/fasttext

Copying file://cc.en.300.bin [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1 files][  6.7 GiB/  6.7 GiB]   13.8 MiB/s                                   
Operation completed over 1 objects/6.7 GiB.                                      
