In [1]:
import pandas as pd
import os
import json
from datetime import datetime

In [2]:
# Make the project's local modules (e.g. `process`) importable from the notebook.
# `sys` is imported explicitly instead of being reached through the undocumented
# `os.sys` attribute, and the membership check keeps re-running this cell from
# piling up duplicate entries in `sys.path`.
import sys

local_modules_path = '../src'
if local_modules_path not in sys.path:
    sys.path.append(local_modules_path)

In [3]:
from process import *

This notebook was created with the aim of visualizing Tennis demand and building a simple ML model.

First, how can we get this data? We can pull it from Amazon's public API via RapidAPI. To do so, we need to find which product category contains the Tennis products. We will likely need to pull both the Women's and the Men's Tennis categories. Let's start by taking a look at the Women's Tennis category.

In [None]:
# Request the first page of products for category 2478868012 and persist the
# raw response to disk.
amazon_api_client = api_fetcher.ApiClient()
amazon_api_client.http_request(
    method="GET",
    request_url="/products-by-category?category_id=2478868012&page=1&country=US&sort_by=RELEVANCE&product_condition=ALL&is_prime=false&deals_and_discounts=NONE"
)
# The following cell loads `../data/api-calls/us_product_list.json`, so the
# response is saved under that same name (previously `product_list_us`, which
# the notebook never reads back).
# NOTE(review): assumes `save_response` writes `<filename>.json` under
# `../data/api-calls/`, as it appears to do for `tenis_products_all` — confirm.
amazon_api_client.save_response(filename='us_product_list')

In [16]:
with open('../data/api-calls/us_product_list.json', 'r') as file:
    product_list_json = json.load(file)

In [17]:
# The stored response is text that came from an external API: parse it with
# `json.loads` (as the rest of this notebook does) rather than `eval`, which
# would execute arbitrary Python embedded in the payload.
api_call_response = json.loads(product_list_json['response'])

In [18]:
api_call_data = api_call_response['data']

In [None]:
pd.DataFrame(api_call_data).sort_values(by="name")

It seems we need to delve deeper into both `fashion-mens` and `fashion-womens`.

Let's try out with the path parameter `product-details`

In [None]:
amazon_api_client.http_request(
    method="GET",
    request_url="/search?query=Tenis&country=US&sort_by=RELEVANCE&product_condition=ALL&page=100"
)

In [None]:
amazon_api_client.save_response(filename="tenis_products_all")

In [21]:
with open(
    "../data/api-calls/tenis_products_all.json",
    'r'
) as file:
    prod_details_json = json.load(file)

In [None]:
json.loads(prod_details_json['response'])['data']

In [5]:
product_data = json.loads(prod_details_json['response'])['data']

In [8]:
tenis_data = pd.DataFrame(product_data['products'])

In [None]:
tenis_data.shape

In [None]:
tenis_data.iloc[0]['sales_volume']

In [None]:
tenis_data['sales_volume'].unique()

In [19]:
# Today's date as an ISO `YYYY-MM-DD` string; used to stamp exported file names.
todays_datestr = datetime.today().date().isoformat()

In [None]:
tenis_data.sales_volume.unique()

In [None]:
tenis_data

Once we have a data sample, it is natural to ask "what can we model?". As a first proposal, we might leverage the `sales_volume` field to forecast demand. Since we only have lower-bound estimates of the units sold (e.g. "100+ bought in past month") and this exercise is exclusively for learning purposes, we can map these estimates to float numbers by adding Gaussian noise.

# Data Transformation and Cleaning

In [22]:
import numpy as np

In [None]:
tenis_data[tenis_data['currency']!='USD']

In [None]:
tenis_data.head(20)

In [25]:
import re

In [26]:
# Columns kept for modelling, grouped by theme. The order matters downstream:
# the target (`sales_volume`) stays last.
filtered_columns = (
    # listed and original price
    ["product_price", "product_original_price"]
    # customer feedback
    + ["product_star_rating", "product_num_ratings"]
    # cheapest available offer
    + ["product_minimum_offer_price"]
    # boolean product flags
    + ["is_prime", "climate_pledge_friendly", "has_variations"]
    # free-text coupon label (parsed into a number later on)
    + ["coupon_text"]
    # target
    + ["sales_volume"]
)

In [27]:
tenis_data_filtered = tenis_data[filtered_columns]

We have kept only the columns we found useful for estimating `sales_volume`, such as:

- `product_price`
- `product_star_rating`
- `product_num_ratings`

Now we proceed to clean and transform the columns.

In [107]:
tenis_data_processed = tenis_data_filtered.copy()

In [None]:
list(tenis_data_processed['sales_volume'].unique())

In [None]:
tenis_data_processed[tenis_data_processed['sales_volume']=='List: ']

In [None]:
tenis_data_processed.head(5)

In [None]:
tenis_data_processed.info()

In [112]:
# Process price format


def _parse_price(price_raw):
    """Convert a raw price string such as ``"$1,299.99"`` into a float.

    The first character (the currency symbol) is dropped and thousands
    separators are removed before the conversion. Anything that is not a
    string (``None``, ``NaN``, an already-parsed number) is returned
    unchanged, so missing prices stay missing instead of raising.
    """
    if not isinstance(price_raw, str):
        return price_raw
    return float(price_raw[1:].replace(",", ""))


# All three price columns share the same raw format, so parse them in one pass.
price_columns = [
    'product_price',
    'product_original_price',
    'product_minimum_offer_price',
]
tenis_data_processed[price_columns] = tenis_data_processed[price_columns].map(_parse_price)

In [113]:
# Convert to float

tenis_data_processed['product_star_rating'] = tenis_data_processed['product_star_rating'].astype(float);

In [114]:
def _coupon_discount(coupon_txt):
    """Return the first number found in a coupon label as a float.

    Non-string values (e.g. ``NaN``/``None`` when a product has no coupon)
    and labels that contain no number at all yield ``0.0`` instead of
    raising.
    """
    if not isinstance(coupon_txt, str):
        return 0.0
    # Raw string avoids the invalid-escape warning for `\d`. The pattern takes
    # the whole leading number (integer or decimal) rather than at most two
    # digits, so "100" is no longer truncated to "10".
    match = re.search(r"\d+(?:\.\d+)?", coupon_txt)
    return float(match.group()) if match else 0.0


tenis_data_processed["coupon_discount"] = tenis_data_processed["coupon_text"].map(
    _coupon_discount
)

# `DataFrame.drop` returns a new frame: assign it back, otherwise the raw
# `coupon_text` column is never actually removed.
tenis_data_processed = tenis_data_processed.drop(columns=["coupon_text"])

In [115]:
# Process categorical data
#
# Encode each boolean flag as 1.0 (True) / 0.0 (anything else, including
# missing values). Comparing against True gives the same values as picking the
# `True` column of `pd.get_dummies`, but does not raise a KeyError when a
# column happens to contain no True value at all.
for flag_column in ("is_prime", "climate_pledge_friendly", "has_variations"):
    tenis_data_processed[flag_column] = (
        tenis_data_processed[flag_column].eq(True).astype(float)
    )

In [None]:
tenis_data_processed.columns

In [117]:
# Keep the predictors first and move the variable to predict
# (`sales_volume`) to the last position.
tenis_data_processed = tenis_data_processed.loc[
    :,
    [
        "product_price",
        "product_original_price",
        "product_star_rating",
        "product_num_ratings",
        "product_minimum_offer_price",
        "is_prime",
        "climate_pledge_friendly",
        "has_variations",
        "coupon_discount",
        "sales_volume",
    ],
]

### Add Gaussian Noise

In [None]:
tenis_data_processed.head(5)

In [None]:
tenis_data_processed.sales_volume.unique()

In [120]:
from scipy.stats import norm

In [None]:
tenis_data_processed.head(5)

In [None]:
tenis_data_processed.head(5)

In [176]:
from process import gaussian_noise

In [182]:
# Pass a copy so the cleaned frame itself is not handed to the noise step
# (one copy is enough for that).
tenis_data_processed_test = gaussian_noise(
    target_column="sales_volume",
    df=tenis_data_processed.copy(),
)

In [None]:
tenis_data_processed_test

In [None]:
todays_datestr

In [None]:
tenis_data_processed_test.to_csv(f"../data/processed/tennis_{todays_datestr}.csv", index=False)

In [119]:
# Dump the raw product table, stamped with today's date.
tenis_data.to_csv(f"../data/raw/tenis_products_all_{todays_datestr}.csv")

In [None]:
product_data

In [100]:
raw_tenis_data = str(product_data)

In [101]:
import re

In [102]:
regexp_search = re.search(pattern="Under", string=raw_tenis_data)

In [None]:
regexp_search

In [104]:
prod_details_df = pd.DataFrame(
    data=product_data
)

In [None]:
prod_details_df

In [None]:
import http.client

# Direct call to the `product-details` endpoint for a single ASIN.
#
# The RapidAPI key is a credential and must not live in the notebook: it is
# read from the RAPIDAPI_KEY environment variable, which raises a KeyError if
# unset. The key that was previously hardcoded here should be treated as
# leaked and rotated.
rapidapi_host = "real-time-amazon-data.p.rapidapi.com"

headers = {
    'x-rapidapi-key': os.environ["RAPIDAPI_KEY"],
    'x-rapidapi-host': rapidapi_host
}

# The timeout stops a stalled request from hanging the kernel; the connection
# is always closed, even when the request fails.
conn = http.client.HTTPSConnection(rapidapi_host, timeout=30)
try:
    conn.request("GET", "/product-details?asin=B07ZPKBL9V&country=US", headers=headers)

    res = conn.getresponse()
    data = res.read()
finally:
    conn.close()

print(data.decode("utf-8"))

In [109]:
a = json.loads(data.decode("utf-8"))

In [None]:
a

#### Pull multiple pages and consolidate into a single df

In [46]:
# Fetch 50 result pages of the "Tenis" search (page numbers 0..49) and store
# each response in its own `tenis_products_<page>` file.
for page_num in range(50):
    page_label = str(page_num)
    search_url = (
        "/search?query=Tenis&country=US&sort_by=RELEVANCE&product_condition=ALL&page="
        + page_label
    )
    amazon_api_client.http_request(method="GET", request_url=search_url)
    amazon_api_client.save_response("tenis_products_" + page_label)

In [40]:
with open('../data/api-calls/tenis_products_0.json', 'r') as f:
    d = json.load(f)

In [13]:
with open("../data/api-calls/tenis_products_" + str(8) + ".json", "r") as f:
    a = json.load(f)

In [None]:
json.loads(a["response"])['data']['products']

In [None]:
[str(i) for i in range(1)]

In [4]:
# Consolidate the saved responses for pages 1 through 7 into a single frame.
merged_df = extract_json_df(
    [f"../data/api-calls/tenis_products_{page}.json" for page in range(1, 8)]
)

In [10]:
merged_df.head()

Unnamed: 0,product_price,product_original_price,product_star_rating,product_num_ratings,product_minimum_offer_price,is_prime,climate_pledge_friendly,has_variations,coupon_discount,sales_volume
0,44.52,80.0,4.6,709,44.52,1.0,0.0,1.0,0.0,400+ bought in past month
1,42.5,75.0,4.7,2028,42.5,1.0,0.0,1.0,0.0,50+ bought in past month
2,38.5,70.0,4.5,502,38.5,1.0,0.0,1.0,0.0,100+ bought in past month
3,30.69,50.0,4.5,40996,30.69,1.0,0.0,1.0,0.0,50+ bought in past month
4,60.0,80.0,4.6,397,60.0,1.0,0.0,1.0,0.0,100+ bought in past month


In [6]:
merged_df = clean_data(df=merged_df.copy())

In [7]:
del gaussian_noise

In [8]:
from process import gaussian_noise

In [None]:
help(gaussian_noise)

In [None]:
# The original cell held an unfinished expression (`merged_df.`), which is a
# SyntaxError and stops "Run All". Show a preview of the cleaned frame instead.
# NOTE(review): the intended attribute/method is unknown — adjust if needed.
merged_df.head()

In [9]:
print(merged_df.sales_volume.unique())

['400+ bought in past month' '50+ bought in past month'
 '100+ bought in past month' 'List: ' '200+ bought in past month' None
 '300+ bought in past month' 'List Price: ' '900+ bought in past month'
 '800+ bought in past month' '500+ bought in past month'
 'No featured offers available' '700+ bought in past month'
 '1K+ bought in past month' 'Typical: ' 'Typical price: '
 'Shop products from small business brands sold in Amazon’s store. Discover more about the small businesses partnering with Amazon and Amazon’s commitment to empowering them.'
 'Typical price ']


In [12]:
merged_df = gaussian_noise(df=merged_df.copy(), target_column="sales_volume")

  df[[target_column_numerical]] = df[[target_column_cleaned]].replace(target_column_vals)


ParameterBindError: Error binding parameters for function 'gaussian_noise': missing a required argument: 'target_column'.
Function 'gaussian_noise' has signature 'df: pandas.core.frame.DataFrame, target_column: str) -> pandas.core.frame.DataFrame' but received args: (    product_price  product_original_price  product_star_rating  \
0           44.52                   80.00                  4.6   
1           42.50                   75.00                  4.7   
2           38.50                   70.00                  4.5   
3           30.69                   50.00                  4.5   
4           60.00                   80.00                  4.6   
..            ...                     ...                  ...   
13          42.95                   65.00                  4.6   
14          21.59                   26.99                  4.2   
15          42.99                   69.95                  4.4   
16          46.99                     NaN                  4.1   
17          49.00                   75.00                  4.4   

    product_num_ratings  product_minimum_offer_price  is_prime  \
0                   709                        44.52       1.0   
1                  2028                        42.50       1.0   
2                   502                        38.50       1.0   
3                 40996                        30.69       1.0   
4                   397                        60.00       1.0   
..                  ...                          ...       ...   
13                   38                        42.95       1.0   
14                  151                        21.59       1.0   
15                 8518                        42.99       1.0   
16                  768                        46.99       1.0   
17                 4750                        49.00       1.0   

    climate_pledge_friendly  has_variations  coupon_discount  \
0                       0.0             1.0              0.0   
1                       0.0             1.0              0.0   
2                       0.0             1.0              0.0   
3                       0.0             1.0              0.0   
4                       0.0             1.0              0.0   
..                      ...             ...              ...   
13                      0.0             1.0              0.0   
14                      0.0             1.0              0.0   
15                      0.0             1.0              0.0   
16                      0.0             1.0              0.0   
17                      0.0             1.0              0.0   

                 sales_volume       sales_volume_cleaned  \
0   400+ bought in past month  400+ bought in past month   
1    50+ bought in past month   50+ bought in past month   
2   100+ bought in past month  100+ bought in past month   
3    50+ bought in past month   50+ bought in past month   
4   100+ bought in past month  100+ bought in past month   
..                        ...                        ...   
13                     List:                           0   
14            Typical price:                           0   
15  100+ bought in past month  100+ bought in past month   
16                       None                          0   
17                     List:                           0   

    sales_volume_numerical  
0                      400  
1                       50  
2                      100  
3                       50  
4                      100  
..                     ...  
13                       0  
14                       0  
15                     100  
16                       0  
17                       0  

[310 rows x 12 columns],) and kwargs: [].