# Data Preparation
### In this notebook we are going to prepare our data for our search function to use.
### Currently our data is stored in two different CSV files: (1) Order11.csv and (2) Order_cost21.csv

In [34]:
### import the important modules
import json
from os import getcwd

import numpy as np
import pandas as pd

# Define paths to files.

In [35]:
# Both CSV paths point into the "dataSet" folder of the current working directory.
_dataset_dir = f"{getcwd()}/dataSet"
PATH_Order_details = f"{_dataset_dir}/Order11.csv"
PATH_Profit = f"{_dataset_dir}/Order_cost21.csv"

# Data Engineering
###  Get data in dataframes
###  Convert data into single dictionary

In [36]:
# Load the order-details CSV into a dataframe and show which columns it provides.
# NOTE(review): this reads a bare relative filename rather than PATH_Order_details
# (different folder and capitalisation) -- confirm which location is intended.
order_details = pd.read_csv('order11.csv')
details_columns = order_details.columns.to_list()
print(f"COLUMNS : {details_columns}")

COLUMNS : ['Region', 'Country', 'Item Type', 'Order Date', 'Order ID', 'Ship Date', 'Index']


In [37]:
# Load the order-cost CSV into a dataframe and show which columns it provides.
# NOTE(review): this reads a bare relative filename rather than PATH_Profit --
# confirm which location is intended.
order_cost = pd.read_csv('Order_cost21.csv')
cost_columns = order_cost.columns.to_list()
print(f"COLUMNS : {cost_columns}")

COLUMNS : ['Order ID', 'Item Type', 'NoOfUnits', 'Unit Price', 'Total Cost']


### Order ID is common to both files, so we are going to use it as the primary search key.
###  A user will always look up their order details by Order ID, so we will create a global secondary index to support searching our datastore.
### In addition, it will make our searching faster and more efficient, which makes it a worthwhile addition.


In [38]:
# Report, for each dataframe, whether "Order ID" holds a distinct value on every row.
details_ids_unique = order_details['Order ID'].is_unique
print(f"It is {details_ids_unique}  that the column 'Order ID' has unique values for all entries in order_details dataframe.")
cost_ids_unique = order_cost['Order ID'].is_unique
print(f"It is {cost_ids_unique}  that the column 'Order Id' has unique values for all entries in order_cost dataframe.")

# Order the rows of both dataframes by Order ID; the originals are left untouched
# because sort_values returns a new dataframe.
order_details_sorted = order_details.sort_values('Order ID')
order_cost_sorted = order_cost.sort_values('Order ID')

It is True  that the column 'Order ID' has unique values for all entries in order_details dataframe.
It is True  that the column 'Order Id' has unique values for all entries in order_cost dataframe.


In [43]:
# Pair every order with its cost row by joining on "Order ID".
# Taking the columns of the two sorted dataframes independently only lines up
# when both files contain exactly the same Order IDs; otherwise costs would be
# attached to the wrong order (or the lists would differ in length).
# The inner join keeps the row order of order_details_sorted, and
# validate="one_to_one" makes pandas raise if an Order ID is duplicated in either frame.
cost_fields = ["NoOfUnits", "Unit Price", "Total Cost"]
orders_merged = order_details_sorted.merge(
    order_cost_sorted[["Order ID", *cost_fields]],
    on="Order ID",
    how="inner",
    validate="one_to_one",
)

# Fail loudly instead of silently dropping orders that have no cost information.
if len(orders_merged) != len(order_details_sorted):
    missing_ids = sorted(
        set(order_details_sorted["Order ID"]) - set(order_cost_sorted["Order ID"])
    )
    raise ValueError(
        f"{len(missing_ids)} order(s) have no matching row in order_cost, "
        f"e.g. Order ID {missing_ids[:5]}"
    )

# Columns originating from the order_details dataframe...
Order_ID = orders_merged["Order ID"].tolist()
Region = orders_merged["Region"].tolist()
Country = orders_merged["Country"].tolist()
Item_type = orders_merged["Item Type"].tolist()
Order_date = orders_merged["Order Date"].tolist()
Ship_date = orders_merged["Ship Date"].tolist()

# Columns originating from the order_cost dataframe, now aligned row-for-row with Order_ID...
Number = orders_merged["NoOfUnits"].tolist()
Unit_price = orders_merged["Unit Price"].tolist()
Total_cost = orders_merged["Total Cost"].tolist()



In [44]:
# Build three structures from the per-column lists:
#   detailslist           - every Order ID, in list order
#   detailsdict           - Order ID -> record holding the order fields plus a nested "costs" dict
#   global_secondaryIndex - position in the Order_ID list -> Order ID
detailslist = []
detailsdict = {}
global_secondaryIndex = {}

for position in range(len(Order_ID)):
    order_id = Order_ID[position]
    detailslist.append(order_id)

    # All lists are read at the same position, so they must be aligned row-for-row.
    detailsdict[order_id] = {
        "Order_ID": order_id,
        "Region": Region[position],
        "Country": Country[position],
        "Itemtype": Item_type[position],
        "Orderdate": Order_date[position],
        "Shipdate": Ship_date[position],
        "costs": {
            "Number": Number[position],
            "Price": Unit_price[position],
            "Totalcost": Total_cost[position],
        },
    }

    global_secondaryIndex[position] = order_id


In [45]:
# Drop the per-column lists now that they are no longer needed, so the kernel
# does not keep holding them in memory.
del Order_ID, Region, Country, Item_type, Order_date, Ship_date
del Number, Unit_price, Total_cost


In [46]:
# Persist both lookup structures as JSON files:
#   detailsdict           -> Order_cost1.json
#   global_secondaryIndex -> newdataFinal_GIS.json
# The integer keys of global_secondaryIndex are written as JSON strings by json.dump.
# sort_keys and indent keep the files stable between runs and readable by hand.
print("[INFO] Writing order data into the disk...")
with open('Order_cost1.json', 'w') as fp:
    json.dump(detailsdict, fp, sort_keys=True, indent=4)
print("[INFO] Writing Global Secondary Index Data into the disk...")
with open('newdataFinal_GIS.json', 'w') as fp:
    json.dump(global_secondaryIndex, fp, sort_keys=True, indent=4)

[INFO] Writing movie Data into the disk...
[INFO] Writing Global Secondary Index Data into the disk...


### At this point, our database is ready and it can handle a high inflow of requests.