# **Task 3: Recommendation system**

In [13]:
# Data processing
import pandas as pd
import numpy as np

In [14]:
# Display tuning: allow up to 500 columns and 1000-character-wide output so
# wide frames are not truncated when rendered.
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# Switch off pandas' chained-assignment check (no warning, no error).
pd.set_option('mode.chained_assignment', None)

## **Data importing and cleaning**

In [11]:
# Column groups by the dtype they are read with. Identifier and text columns
# are forced to str; counts to int; measurements and coordinates to float.
string_columns = ['order_id', 'customer_id', 'seller_id', 'product_id',
                  'product_category_name',
                  'customer_city', 'customer_state',
                  'seller_city', 'seller_state']
integer_columns = ['order_item_id',
                   'product_name_lenght',
                   'product_description_lenght']
float_columns = ['price', 'freight_value',
                 'product_photos_qty', 'product_weight_g', 'product_volume',
                 'customer_lat', 'customer_lng',
                 'seller_lat', 'seller_lng']

# Flatten the three groups into the {column: dtype} mapping read_csv expects.
merged_dtypes = {
    **dict.fromkeys(string_columns, str),
    **dict.fromkeys(integer_columns, int),
    **dict.fromkeys(float_columns, float),
}

# Timestamp columns that read_csv should parse into datetimes.
timestamp_columns = ['order_purchase_timestamp',
                     'order_approved_at',
                     'shipping_limit_date',
                     'order_delivered_carrier_date',
                     'order_delivered_customer_date',
                     'order_estimated_delivery_date']

# The first CSV column is used as the frame's index.
data_merged = pd.read_csv(
    'data_raw/data_merged.csv',
    index_col=0,
    dtype=merged_dtypes,
    parse_dates=timestamp_columns,
)

In [12]:
# Print the column list with non-null counts and dtypes of the merged frame.
data_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108106 entries, 0 to 109660
Data columns (total 29 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       108106 non-null  object        
 1   customer_id                    108106 non-null  object        
 2   order_purchase_timestamp       108106 non-null  datetime64[ns]
 3   order_approved_at              108106 non-null  datetime64[ns]
 4   order_delivered_carrier_date   108106 non-null  datetime64[ns]
 5   order_delivered_customer_date  108106 non-null  datetime64[ns]
 6   order_estimated_delivery_date  108106 non-null  datetime64[ns]
 7   customer_city                  108106 non-null  object        
 8   customer_state                 108106 non-null  object        
 9   customer_lat                   108106 non-null  float64       
 10  customer_lng                   108106 non-null  float64       
 11  

In [44]:
# One row per product with the number of order lines it appears in
# ('total_sale'), ordered from best seller downwards.
rows_per_product = data_merged.groupby('product_id').size()
data_product = (
    rows_per_product
    .reset_index(name='total_sale')
    .sort_values(by=['total_sale'], ascending=False)
)
data_product.head()

Unnamed: 0,product_id,total_sale
21175,aca2eb7d00ea1a7b8ebd4e68314663af,518
8249,422879e10f46682990de24d770e7f83d,483
18889,99a4788cb24856965c36a24e339b6058,476
7049,389d119b48cf3043d311335e499d9c6b,389
6774,368c6c730842d78016ad823897a372db,385


In [62]:
# Index the table by product_id so rows can be addressed per product.
# Guarded so that re-running the cell is a no-op instead of a KeyError:
# after the first run 'product_id' is the index and no longer a column.
if 'product_id' in data_product.columns:
    data_product = data_product.set_index('product_id', drop=True)
data_product.head(2)

Unnamed: 0_level_0,total_sale,unique_customers
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
aca2eb7d00ea1a7b8ebd4e68314663af,518,0
422879e10f46682990de24d770e7f83d,483,0


In [45]:
# Start every product at zero distinct customers; the real counts are
# written into this column by a later cell.
data_product['unique_customers'] = 0

In [63]:
# Calculate the unique customers for each product.
# A single grouped pass replaces the former per-product loop, which filtered
# the whole of data_merged once for every product.
# dropna=False mirrors `.unique().shape[0]`, which counts NaN as a value.
customers_per_product = (
    data_merged
    .groupby('product_id')['customer_id']
    .nunique(dropna=False)
)
# Assignment aligns on the index, so data_product must be indexed by product_id.
data_product['unique_customers'] = customers_per_product

In [65]:
# Check the result: the first two rows should now show non-zero
# 'unique_customers' values next to 'total_sale'.
data_product.head(2)

Unnamed: 0_level_0,total_sale,unique_customers
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
aca2eb7d00ea1a7b8ebd4e68314663af,518,423
422879e10f46682990de24d770e7f83d,483,351


In [66]:
# Persist the per-product table (index included) as CSV.
data_product.to_csv('data_raw/data_products.csv')

In [68]:
# Create a dictionary with order_id as keys and, as values, the list of
# distinct product_ids bought in that order. It starts empty here and is
# populated in a later cell.
dict_order_item = {}

In [94]:
# Map each order_id to the list of distinct product_ids it contains.
# One grouped pass replaces the former loop, which re-filtered the whole of
# data_merged once per order.
# sort=False keeps orders in first-appearance order, matching iteration over
# `data_merged.order_id.unique()`; `.unique()` keeps products in appearance order.
dict_order_item = (
    data_merged
    .groupby('order_id', sort=False)['product_id']
    .unique()
    .map(lambda product_ids: product_ids.tolist())
    .to_dict()
)

In [97]:
def save_dict_to_file(dic, path='dict.txt'):
    """Write the ``str()`` representation of ``dic`` to a text file.

    Parameters
    ----------
    dic : object
        Value to persist (a dict in this notebook); serialised with ``str(dic)``.
    path : str, optional
        Destination file, overwritten if it already exists.
        Defaults to ``'dict.txt'``, the location used before this parameter existed.

    Returns
    -------
    None
    """
    # The context manager closes the handle even if the write raises.
    with open(path, 'w') as f:
        f.write(str(dic))

In [1]:
def load_dict_from_file(path='dict.txt'):
    """Read back a Python literal previously written with ``str(obj)``.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``'dict.txt'``.

    Returns
    -------
    object
        The literal stored in the file (a dict when written by
        ``save_dict_to_file`` with a dict argument).

    Raises
    ------
    ValueError
        If the file contains anything other than a Python literal.
    SyntaxError
        If the file content is not valid Python syntax.
    """
    # Imported locally so this cell works on its own in a fresh kernel.
    import ast

    # The context manager closes the handle even if reading raises.
    with open(path, 'r') as f:
        data = f.read()
    # literal_eval only accepts literals (dict/list/str/numbers/...), so file
    # content can never execute code the way a bare eval() would.
    return ast.literal_eval(data)

In [6]:
# Reload the mapping stored in dict.txt.
# NOTE(review): in file order this cell comes before the cell that calls
# save_dict_to_file, so on a fresh run dict.txt must already exist on disk.
dict_temp = load_dict_from_file()

In [99]:
# Persist the order -> products mapping as text in dict.txt.
save_dict_to_file(dict_order_item)

In [100]:
import json

In [101]:
# Serialize data into file:
# the context manager flushes and closes the handle deterministically instead
# of leaving an anonymous open() for the garbage collector.
with open("file_name.json", 'w') as json_file:
    json.dump(dict_order_item, json_file)

In [None]:
# Read data from file:
# the context manager closes the handle as soon as parsing is done.
with open("file_name.json") as json_file:
    data = json.load(json_file)

In [9]:
# Preview only the first few (order_id, product_ids) pairs; displaying the
# whole mapping would flood the cell output with one entry per order.
list(dict_temp.items())[:5]

dict