In [52]:
import pandas as pd
import urllib
from sqlalchemy import create_engine, Table, MetaData
from sqlalchemy.orm import sessionmaker

# Custom upload with connection string
from engine_info import server_info
# From normalized_tables.py
import normalized_tables

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Creating a connection to MS SQL SERVER
params = urllib.parse.quote_plus(server_info)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
connection = engine.connect()

In [3]:
# Check what is in the database
engine.table_names()

['Capetown_Fresh_produce_market',
 'Capetown_Fresh_produce_market_cleaned',
 'Durban_Fresh_produce_market',
 'Durban_Fresh_produce_market_cleaned',
 'Joburg_Fresh_produce_combined_cleaned',
 'Joburg_Fresh_produce_commodity_cleaned',
 'Joburg_Fresh_produce_commodity_raw',
 'Joburg_Fresh_produce_container_cleaned',
 'Joburg_Fresh_produce_container_raw',
 'Joburg_Fresh_produce_product_combination_raw',
 'Joburg_Fresh_produce_scrapping_date',
 'PickNPay_Prices',
 'PickNPay_Prices_cleaned',
 'Shoprite_Prices',
 'Shoprite_Prices_cleaned',
 'woolworths_Prices',
 'Woolworths_Prices_cleaned']

In [4]:
metadata = MetaData(bind=engine)

## 1. Commodity (JHB)

In [20]:
commodity_df = pd.read_sql_table('Joburg_Fresh_produce_commodity_cleaned', con=engine)

In [22]:
commodity_df.head()

Unnamed: 0,index,rowid,date,commodity,qty_available,MTD_total_value_sold_(R),total_value_sold_(R),Total_quatity_sold,MTD_Total_quatity_sold,Total_kg_sold,MTD_total_kg_sold
0,0,1,20 August 2020,AMADUMBE,2,39870.0,0.0,0.0,97.0,0.0,1940.0
1,1,2,20 August 2020,APPLES,91755,22664221.0,1205932.0,13799.0,261296.0,157462.0,3163863.0
2,2,3,20 August 2020,ARTICHOKES,1,53100.0,600.0,4.0,439.0,3.0,522.0
3,3,4,20 August 2020,ASPARAGUS,8,258975.0,34000.0,50.0,359.0,250.0,1795.0
4,4,5,20 August 2020,ATCHARA,207,1351.2,0.0,0.0,23.0,0.0,65.0


In [23]:
# Check the data type so that it's suited to be inserted in a normalized database
commodity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1133 entries, 0 to 1132
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     1133 non-null   int64  
 1   rowid                     1133 non-null   int64  
 2   date                      1133 non-null   object 
 3   commodity                 1133 non-null   object 
 4   qty_available             1133 non-null   int64  
 5   MTD_total_value_sold_(R)  1133 non-null   float64
 6   total_value_sold_(R)      1133 non-null   float64
 7   Total_quatity_sold        1133 non-null   float64
 8   MTD_Total_quatity_sold    1133 non-null   float64
 9   Total_kg_sold             1133 non-null   float64
 10  MTD_total_kg_sold         1133 non-null   float64
dtypes: float64(6), int64(3), object(2)
memory usage: 97.5+ KB


In [29]:
# Change the quantity sold to int
commodity_df[['Total_quatity_sold', 'MTD_Total_quatity_sold']] = commodity_df[['Total_quatity_sold', 'MTD_Total_quatity_sold']].astype('int64')

In [39]:
# Convert date column from string to datetime
commodity_df['date'] = pd.to_datetime(commodity_df['date'])

In [40]:
commodity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1133 entries, 0 to 1132
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   index                     1133 non-null   int64         
 1   rowid                     1133 non-null   int64         
 2   date                      1133 non-null   datetime64[ns]
 3   commodity                 1133 non-null   object        
 4   qty_available             1133 non-null   int64         
 5   MTD_total_value_sold_(R)  1133 non-null   float64       
 6   total_value_sold_(R)      1133 non-null   float64       
 7   Total_quatity_sold        1133 non-null   int64         
 8   MTD_Total_quatity_sold    1133 non-null   int64         
 9   Total_kg_sold             1133 non-null   float64       
 10  MTD_total_kg_sold         1133 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(5), object(1)
memory usage: 97.5+ KB


## 2. Container (JHB)

In [31]:
container_df = pd.read_sql_table('Joburg_Fresh_produce_container_cleaned', con=engine)

In [32]:
container_df.head()

Unnamed: 0,index,rowid,date,commodity,container,qty_available,average_price_per_kg,MTD_total_value_sold_(R),total_value_sold_(R),Total_quatity_sold,MTD_Total_quatity_sold,Total_kg_sold,MTD_total_kg_sold
0,0,1,20 August 2020,AMADUMBE,20KG POCKET,2,0.0,39870.0,0.0,0.0,97.0,0.0,1940.0
1,1,2,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,17,10.0,41932.0,100.0,1.0,547.0,10.0,5470.0
2,2,3,20 August 2020,APPLES,11KG JUMBLE CARTON,343,6.36,218914.0,1190.0,17.0,3170.0,187.0,34870.0
3,3,4,20 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,3233,7.86,1454572.0,40738.0,432.0,17353.0,5184.0,208236.0
4,4,5,20 August 2020,APPLES,12.5KG M6 CARTON,12,0.0,16903.0,0.0,0.0,286.0,0.0,3575.0


In [33]:
container_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3987 entries, 0 to 3986
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     3987 non-null   int64  
 1   rowid                     3987 non-null   int64  
 2   date                      3987 non-null   object 
 3   commodity                 3987 non-null   object 
 4   container                 3987 non-null   object 
 5   qty_available             3987 non-null   int64  
 6   average_price_per_kg      3987 non-null   float64
 7   MTD_total_value_sold_(R)  3987 non-null   float64
 8   total_value_sold_(R)      3987 non-null   float64
 9   Total_quatity_sold        3987 non-null   float64
 10  MTD_Total_quatity_sold    3987 non-null   float64
 11  Total_kg_sold             3987 non-null   float64
 12  MTD_total_kg_sold         3987 non-null   float64
dtypes: float64(7), int64(3), object(3)
memory usage: 405.1+ KB


In [34]:
# Change the quantity sold to int
container_df[['Total_quatity_sold', 'MTD_Total_quatity_sold']] = container_df[['Total_quatity_sold', 'MTD_Total_quatity_sold']].astype('int64')

In [42]:
# Convert date column from string to datetime
container_df['date'] = pd.to_datetime(container_df['date'])

In [43]:
container_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3987 entries, 0 to 3986
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   index                     3987 non-null   int64         
 1   rowid                     3987 non-null   int64         
 2   date                      3987 non-null   datetime64[ns]
 3   commodity                 3987 non-null   object        
 4   container                 3987 non-null   object        
 5   qty_available             3987 non-null   int64         
 6   average_price_per_kg      3987 non-null   float64       
 7   MTD_total_value_sold_(R)  3987 non-null   float64       
 8   total_value_sold_(R)      3987 non-null   float64       
 9   Total_quatity_sold        3987 non-null   int64         
 10  MTD_Total_quatity_sold    3987 non-null   int64         
 11  Total_kg_sold             3987 non-null   float64       
 12  MTD_total_kg_sold   

## 3. Combination (JHB)

In [36]:
combo_df = pd.read_sql_table('Joburg_Fresh_produce_combined_cleaned', con=engine, index_col='rowid')

In [37]:
combo_df.head()

Unnamed: 0_level_0,index,date,commodity,container,unit_mass,product_combination,total_value_sold,total_qty_sold,total_kg_sold,average,highest_price,ave_per_kg,highest_price_per_kg
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
13803,0,2020-08-31,ONIONS,2KG POCKET,2,"SWEET,CL 1,L/M,*,NORTHERN CAPE",0.0,0,0.0,0.0,0.0,0.0,0.0
13804,1,2020-08-31,ONIONS,2KG POCKET,2,"WHITE,CL 1,L/M,*,NORTHERN CAPE",0.0,0,0.0,0.0,0.0,0.0,0.0
13805,2,2020-08-31,ONIONS,2KG POCKET,2,"WHITE,CL 1,M,*,NORTHERN CAPE",0.0,0,0.0,0.0,0.0,0.0,0.0
13806,3,2020-08-31,ONIONS,3KG POCKET,3,"BROWN,CL 1,L/M,*,NORTHERN CAPE",6138.0,361,1083.0,17.0,18.0,5.67,6.0
13807,4,2020-08-31,ONIONS,3KG POCKET,3,"BROWN,CL 1,L/M,*,WESTERN CAPE",0.0,0,0.0,0.0,0.0,0.0,0.0


In [38]:
combo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14634 entries, 13803 to 13802
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   index                 14634 non-null  int64         
 1   date                  14634 non-null  datetime64[ns]
 2   commodity             14634 non-null  object        
 3   container             14634 non-null  object        
 4   unit_mass             14634 non-null  int64         
 5   product_combination   14634 non-null  object        
 6   total_value_sold      14634 non-null  float64       
 7   total_qty_sold        14634 non-null  int64         
 8   total_kg_sold         14634 non-null  float64       
 9   average               14634 non-null  float64       
 10  highest_price         14634 non-null  float64       
 11  ave_per_kg            14634 non-null  float64       
 12  highest_price_per_kg  14634 non-null  float64       
dtypes: datetime6

## 4. Add data to normalized tables

In [59]:
Session = sessionmaker(bind=engine)

In [60]:
session = Session()

### 4.1 product

In [58]:
products = commodity_df['commodity'].unique()

In [61]:
# Add unique values of products to the product sql table
for product in products:
    commodity = normalized_tables.Product(name=product)
    session.add(commodity)
    session.commit()

### 4.2 container

In [78]:
# Check if different commodities can be packaged in similar container name
container_df[(container_df['container'].duplicate) & (container_df['commodity'] != container_df['commodity'])]

AttributeError: 'Series' object has no attribute 'duplicate'

In [72]:
containers = container_df['container'].unique()

In [75]:
container_df[container_df['container'] == '20KG POCKET']['commodity'].unique()

array(['AMADUMBE', 'BUTTERNUT', 'SWEET POTATO', 'GREEN PEPPERS'],
      dtype=object)

In [73]:
containers[:10]

array(['20KG POCKET', '10 X 1KG ECONO PACK CARTON', '11KG JUMBLE CARTON',
       '12 X 1KG ECONO PACK CARTON', '12.5KG M6 CARTON',
       '12.5KG M6 CONTAINER', '18.50KG CARTON', '3 KG ECONO PACK BOX',
       '300KG BULK BIN', '3KG ECONO PACK'], dtype=object)

### 4.3 inventory

### 4.4 product_combination

In [69]:
# Check if product_combinations can have different containers, or commodity names
combo_df[(combo_df['product_combination'] == combo_df['product_combination']) & (combo_df['container'] != combo_df['container']) | (combo_df['commodity'] != combo_df['commodity'])]

Unnamed: 0_level_0,index,date,commodity,container,unit_mass,product_combination,total_value_sold,total_qty_sold,total_kg_sold,average,highest_price,ave_per_kg,highest_price_per_kg
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


### 4.5 sales

In [80]:
session.close()

In [81]:
connection.close()