In [3]:
import urllib
import urllib.parse

import pandas as pd
from sqlalchemy import create_engine, inspect, MetaData, select
from sqlalchemy.orm import sessionmaker

# Custom upload with connection string
from engine_info import server_info
# From normalized_tables.py
from normalized_tables import Product, Container, Inventory, Sales, ProductCombination, db

import warnings
warnings.filterwarnings('ignore')

In [40]:
# Creating a connection to MS SQL SERVER
# `urllib.parse` must be imported explicitly (see the import cell): a bare
# `import urllib` does not guarantee that the `parse` submodule is loaded.
# The ODBC connection string is URL-encoded so it can be passed through the
# `odbc_connect` query parameter of the SQLAlchemy URL.
params = urllib.parse.quote_plus(server_info)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
# This connection stays open until it is closed explicitly in the last cell.
connection = engine.connect()

In [41]:
# Check what is in the database
# Uses the Inspector API; Engine.table_names() is deprecated in SQLAlchemy 1.4
# and removed in 2.0, while inspect(engine).get_table_names() works on both.
inspect(engine).get_table_names()

['Capetown_Fresh_produce_market',
 'Capetown_Fresh_produce_market_cleaned',
 'container',
 'Durban_Fresh_produce_market',
 'Durban_Fresh_produce_market_cleaned',
 'inventory',
 'Joburg_Fresh_produce_combined_cleaned',
 'Joburg_Fresh_produce_commodity_cleaned',
 'Joburg_Fresh_produce_commodity_raw',
 'Joburg_Fresh_produce_container_cleaned',
 'Joburg_Fresh_produce_container_raw',
 'Joburg_Fresh_produce_product_combination_raw',
 'Joburg_Fresh_produce_scrapping_date',
 'PickNPay_Prices',
 'PickNPay_Prices_cleaned',
 'product',
 'product_combination',
 'sales',
 'Shoprite_Prices',
 'Shoprite_Prices_cleaned',
 'sysdiagrams',
 'woolworths_Prices',
 'Woolworths_Prices_cleaned']

In [42]:
# MetaData object bound to the SQL Server engine.
# NOTE(review): `metadata` is not referenced by any other cell visible in this notebook.
metadata = MetaData(bind=engine)

## 1. Commodity (JHB)

In [7]:
# Load the cleaned Joburg commodity table from the database into a DataFrame
commodity_df = pd.read_sql_table('Joburg_Fresh_produce_commodity_cleaned', con=engine)

In [8]:
# Preview the first rows of the commodity data
commodity_df.head()

Unnamed: 0,index,rowid,date,commodity,qty_available,MTD_total_value_sold_(R),total_value_sold_(R),Total_quatity_sold,MTD_Total_quatity_sold,Total_kg_sold,MTD_total_kg_sold
0,0,1,20 August 2020,AMADUMBE,2,39870.0,0.0,0.0,97.0,0.0,1940.0
1,1,2,20 August 2020,APPLES,91755,22664221.0,1205932.0,13799.0,261296.0,157462.0,3163863.0
2,2,3,20 August 2020,ARTICHOKES,1,53100.0,600.0,4.0,439.0,3.0,522.0
3,3,4,20 August 2020,ASPARAGUS,8,258975.0,34000.0,50.0,359.0,250.0,1795.0
4,4,5,20 August 2020,ATCHARA,207,1351.2,0.0,0.0,23.0,0.0,65.0


In [9]:
# Check the column data types so that they suit the normalized database tables
commodity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1133 entries, 0 to 1132
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     1133 non-null   int64  
 1   rowid                     1133 non-null   int64  
 2   date                      1133 non-null   object 
 3   commodity                 1133 non-null   object 
 4   qty_available             1133 non-null   int64  
 5   MTD_total_value_sold_(R)  1133 non-null   float64
 6   total_value_sold_(R)      1133 non-null   float64
 7   Total_quatity_sold        1133 non-null   float64
 8   MTD_Total_quatity_sold    1133 non-null   float64
 9   Total_kg_sold             1133 non-null   float64
 10  MTD_total_kg_sold         1133 non-null   float64
dtypes: float64(6), int64(3), object(2)
memory usage: 97.5+ KB


In [10]:
# Quantities sold are whole units, so store both quantity columns as int64
commodity_qty_cols = ['Total_quatity_sold', 'MTD_Total_quatity_sold']
commodity_df[commodity_qty_cols] = commodity_df[commodity_qty_cols].astype('int64')

In [11]:
# Parse the textual dates (e.g. "20 August 2020") into datetime values
commodity_dates = pd.to_datetime(commodity_df['date'])
commodity_df['date'] = commodity_dates

In [12]:
# Confirm the dtype changes: quantity columns as int64, date as datetime64[ns]
commodity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1133 entries, 0 to 1132
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   index                     1133 non-null   int64         
 1   rowid                     1133 non-null   int64         
 2   date                      1133 non-null   datetime64[ns]
 3   commodity                 1133 non-null   object        
 4   qty_available             1133 non-null   int64         
 5   MTD_total_value_sold_(R)  1133 non-null   float64       
 6   total_value_sold_(R)      1133 non-null   float64       
 7   Total_quatity_sold        1133 non-null   int64         
 8   MTD_Total_quatity_sold    1133 non-null   int64         
 9   Total_kg_sold             1133 non-null   float64       
 10  MTD_total_kg_sold         1133 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(5), object(1)
memory usage: 97.5+ KB


## 2. Container (JHB)

In [44]:
# Load the cleaned Joburg container table from the database into a DataFrame
container_df = pd.read_sql_table('Joburg_Fresh_produce_container_cleaned', con=engine)

In [45]:
# Preview the first rows of the container data
container_df.head()

Unnamed: 0,index,rowid,date,commodity,container,qty_available,average_price_per_kg,MTD_total_value_sold_(R),total_value_sold_(R),Total_quatity_sold,MTD_Total_quatity_sold,Total_kg_sold,MTD_total_kg_sold
0,0,1,20 August 2020,AMADUMBE,20KG POCKET,2,0.0,39870.0,0.0,0.0,97.0,0.0,1940.0
1,1,2,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,17,10.0,41932.0,100.0,1.0,547.0,10.0,5470.0
2,2,3,20 August 2020,APPLES,11KG JUMBLE CARTON,343,6.36,218914.0,1190.0,17.0,3170.0,187.0,34870.0
3,3,4,20 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,3233,7.86,1454572.0,40738.0,432.0,17353.0,5184.0,208236.0
4,4,5,20 August 2020,APPLES,12.5KG M6 CARTON,12,0.0,16903.0,0.0,0.0,286.0,0.0,3575.0


In [15]:
# Inspect the column data types before converting them
container_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3987 entries, 0 to 3986
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     3987 non-null   int64  
 1   rowid                     3987 non-null   int64  
 2   date                      3987 non-null   object 
 3   commodity                 3987 non-null   object 
 4   container                 3987 non-null   object 
 5   qty_available             3987 non-null   int64  
 6   average_price_per_kg      3987 non-null   float64
 7   MTD_total_value_sold_(R)  3987 non-null   float64
 8   total_value_sold_(R)      3987 non-null   float64
 9   Total_quatity_sold        3987 non-null   float64
 10  MTD_Total_quatity_sold    3987 non-null   float64
 11  Total_kg_sold             3987 non-null   float64
 12  MTD_total_kg_sold         3987 non-null   float64
dtypes: float64(7), int64(3), object(3)
memory usage: 405.1+ KB


In [16]:
# Quantities sold are whole units, so store both quantity columns as int64
container_qty_cols = ['Total_quatity_sold', 'MTD_Total_quatity_sold']
container_df[container_qty_cols] = container_df[container_qty_cols].astype('int64')

In [17]:
# Parse the textual dates (e.g. "20 August 2020") into datetime values
container_dates = pd.to_datetime(container_df['date'])
container_df['date'] = container_dates

In [18]:
# Confirm the dtype changes: quantity columns as int64, date as datetime64[ns]
container_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3987 entries, 0 to 3986
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   index                     3987 non-null   int64         
 1   rowid                     3987 non-null   int64         
 2   date                      3987 non-null   datetime64[ns]
 3   commodity                 3987 non-null   object        
 4   container                 3987 non-null   object        
 5   qty_available             3987 non-null   int64         
 6   average_price_per_kg      3987 non-null   float64       
 7   MTD_total_value_sold_(R)  3987 non-null   float64       
 8   total_value_sold_(R)      3987 non-null   float64       
 9   Total_quatity_sold        3987 non-null   int64         
 10  MTD_Total_quatity_sold    3987 non-null   int64         
 11  Total_kg_sold             3987 non-null   float64       
 12  MTD_total_kg_sold   

## 3. Combination (JHB)

In [19]:
# Load the cleaned combined table, using its `rowid` column as the DataFrame index
combo_df = pd.read_sql_table('Joburg_Fresh_produce_combined_cleaned', con=engine, index_col='rowid')

In [20]:
# Preview the first rows of the combined data
combo_df.head()

Unnamed: 0_level_0,index,date,commodity,container,unit_mass,product_combination,total_value_sold,total_qty_sold,total_kg_sold,average,highest_price,ave_per_kg,highest_price_per_kg
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
13803,0,2020-08-31,ONIONS,2KG POCKET,2,"SWEET,CL 1,L/M,*,NORTHERN CAPE",0.0,0,0.0,0.0,0.0,0.0,0.0
13804,1,2020-08-31,ONIONS,2KG POCKET,2,"WHITE,CL 1,L/M,*,NORTHERN CAPE",0.0,0,0.0,0.0,0.0,0.0,0.0
13805,2,2020-08-31,ONIONS,2KG POCKET,2,"WHITE,CL 1,M,*,NORTHERN CAPE",0.0,0,0.0,0.0,0.0,0.0,0.0
13806,3,2020-08-31,ONIONS,3KG POCKET,3,"BROWN,CL 1,L/M,*,NORTHERN CAPE",6138.0,361,1083.0,17.0,18.0,5.67,6.0
13807,4,2020-08-31,ONIONS,3KG POCKET,3,"BROWN,CL 1,L/M,*,WESTERN CAPE",0.0,0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Inspect the column data types of the combined data
combo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14634 entries, 13803 to 13802
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   index                 14634 non-null  int64         
 1   date                  14634 non-null  datetime64[ns]
 2   commodity             14634 non-null  object        
 3   container             14634 non-null  object        
 4   unit_mass             14634 non-null  int64         
 5   product_combination   14634 non-null  object        
 6   total_value_sold      14634 non-null  float64       
 7   total_qty_sold        14634 non-null  int64         
 8   total_kg_sold         14634 non-null  float64       
 9   average               14634 non-null  float64       
 10  highest_price         14634 non-null  float64       
 11  ave_per_kg            14634 non-null  float64       
 12  highest_price_per_kg  14634 non-null  float64       
dtypes: datetime6

In [22]:
# Number of distinct commodities that appear with each container, largest first
commodities_per_container = combo_df.groupby('container')['commodity'].nunique()
commodities_per_container.sort_values(ascending=False)

container
200G PUNNET PACK BOX    37
3KG POCKET              31
500G PUNNET PACK BOX    29
5KG BOX                 26
4KG BOX                 26
                        ..
400G PACKET              1
4.5KG TRAY               1
4.50KG BOX               1
3KG ECONO PACK           1
.125G BOTTLE             1
Name: commodity, Length: 216, dtype: int64

In [23]:
# First ten distinct commodities that appear with the '3KG POCKET' container
in_3kg_pocket = combo_df['container'] == '3KG POCKET'
combo_df.loc[in_3kg_pocket, 'commodity'].unique()[:10]

array(['ONIONS', 'ORANGES', 'PAPRI', 'PEARS', 'PEAS', 'APPLES',
       'AVOCADOS', 'BABY BUTTERNUT', 'BABY GEM SQUASH', 'BEANS'],
      dtype=object)

There is a many-to-many relationship between commodities and containers: one commodity can come in multiple containers, and one container type can be used for several different commodities.

In [24]:
# Number of distinct containers that appear with each product combination, largest first
containers_per_combination = combo_df.groupby('product_combination')['container'].nunique()
containers_per_combination.sort_values(ascending=False)

product_combination
*,*,*,*,*          93
*,CL 1,L,*,*       23
*,CL 1,M,*,*       19
*,*,*,10,*         19
*,CL 2,M,*,*       17
                   ..
ORRI,CL 2,2,*,*     1
OYSTER,*,*,*,*      1
OYSTER,*,*,16,*     1
OYSTER,*,*,20,*     1
*,*, 28,5,*         1
Name: container, Length: 1970, dtype: int64

In [25]:
# First ten distinct containers that appear with the all-wildcard combination
is_wildcard_combo = combo_df['product_combination'] == '*,*,*,*,*'
combo_df.loc[is_wildcard_combo, 'container'].unique()[:10]

array(['11KG BOX', '2KG BOX', '4.5KG TRAY', '8KG BOX', 'EIGHT PACK',
       'QUAD PACK', 'RAV-PACK', 'SIX PACK', 'TAKE-4 PACK', '3KG BOX'],
      dtype=object)

In [26]:
# First ten distinct commodities that appear with the all-wildcard combination
is_wildcard_combo = combo_df['product_combination'] == '*,*,*,*,*'
combo_df.loc[is_wildcard_combo, 'commodity'].unique()[:10]

array(['PAPINO', 'PAPRI', 'PARSLEY', 'PEANUTS RAW / UNSHELLED', 'PEAS',
       'PECAN NUT (SHELL)', 'PKS BLUE QUEEN', 'POT PLANTS', 'ATCHARA',
       'BABY BUTTERNUT'], dtype=object)

Similarly, a product combination can be associated with multiple containers as well as multiple commodities.

## 4. Add data to normalized tables

In [None]:
# Disabled: the inserts below go through `db.session` from normalized_tables instead.
#Session = sessionmaker(bind=engine)


In [None]:
#session = Session()

### 4.1 product

In [27]:
# Distinct commodity names found in the combined data; these become the rows of the product table.
# NOTE(review): container_df also has a `commodity` column that the inventory cell looks up by
# name - confirm every commodity in container_df also occurs in combo_df.
products = combo_df['commodity'].unique()

In [28]:
# Add unique values of products to the product sql table.
# All rows are staged first and committed in a single transaction instead of
# one commit (one server round trip) per product; if anything fails the
# session is rolled back so it stays usable, and the error is re-raised.
try:
    db.session.add_all([Product(name=item) for item in products])
    db.session.commit()
except Exception:
    db.session.rollback()
    raise

### 4.2 container

In [29]:
# Distinct container names found in the container data; these become the rows of the container table.
# NOTE(review): the sales cell looks up containers by the names in combo_df - confirm every
# container in combo_df also occurs in container_df.
containers = container_df['container'].unique()

In [30]:
# Add unique values of containers to the container sql table.
# All rows are staged first and committed in a single transaction instead of
# one commit (one server round trip) per container; if anything fails the
# session is rolled back so it stays usable, and the error is re-raised.
try:
    db.session.add_all([Container(name=package) for package in containers])
    db.session.commit()
except Exception:
    db.session.rollback()
    raise

### 4.3 product_combination

In [31]:
# Distinct product-combination strings found in the combined data;
# these become the rows of the product_combination table.
combinations = combo_df['product_combination'].unique()

In [32]:
# Add unique values of product combinations to the product combinations sql table.
# All rows are staged first and committed in a single transaction instead of
# one commit (one server round trip) per combination; if anything fails the
# session is rolled back so it stays usable, and the error is re-raised.
try:
    db.session.add_all([ProductCombination(name=combo) for combo in combinations])
    db.session.commit()
except Exception:
    db.session.rollback()
    raise

### 4.4 inventory

In [33]:
# The inventory table will consist of the products not sold for that day
# (the insert cell below stores `qty_available` per date, commodity and container).
# Re-check the container_df column types before inserting.
container_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3987 entries, 0 to 3986
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   index                     3987 non-null   int64         
 1   rowid                     3987 non-null   int64         
 2   date                      3987 non-null   datetime64[ns]
 3   commodity                 3987 non-null   object        
 4   container                 3987 non-null   object        
 5   qty_available             3987 non-null   int64         
 6   average_price_per_kg      3987 non-null   float64       
 7   MTD_total_value_sold_(R)  3987 non-null   float64       
 8   total_value_sold_(R)      3987 non-null   float64       
 9   Total_quatity_sold        3987 non-null   int64         
 10  MTD_Total_quatity_sold    3987 non-null   int64         
 11  Total_kg_sold             3987 non-null   float64       
 12  MTD_total_kg_sold   

In [37]:
# Populate the inventory table: one Inventory row per container_df row, linked
# to its Product and Container, which are looked up by name.
#
# The original loop ran two SELECTs and one COMMIT for every row. Here each
# distinct name is looked up once and cached (including a None result when no
# match exists), and the inserts are committed in batches. On any failure the
# session is rolled back so it stays usable, and the error is re-raised;
# batches committed before the failure remain in the database.
INVENTORY_BATCH_SIZE = 1000  # rows staged between commits

product_by_name = {}
container_by_name = {}

try:
    for position, (_, row) in enumerate(container_df.iterrows(), start=1):
        commodity_name = row['commodity']
        if commodity_name not in product_by_name:
            product_by_name[commodity_name] = Product.query.filter_by(name=commodity_name).first()

        container_name = row['container']
        if container_name not in container_by_name:
            container_by_name[container_name] = Container.query.filter_by(name=container_name).first()

        inventory = Inventory(
            date=row['date'],
            available=row['qty_available'],
            product_inventory=product_by_name[commodity_name],
            container_inventory=container_by_name[container_name]
        )
        db.session.add(inventory)

        if position % INVENTORY_BATCH_SIZE == 0:
            db.session.commit()

    # Commit whatever is left over from the last, partial batch.
    db.session.commit()
except Exception:
    db.session.rollback()
    raise

### 4.5 sales

In [38]:
# Re-check the combo_df column types before inserting the sales rows
combo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14634 entries, 13803 to 13802
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   index                 14634 non-null  int64         
 1   date                  14634 non-null  datetime64[ns]
 2   commodity             14634 non-null  object        
 3   container             14634 non-null  object        
 4   unit_mass             14634 non-null  int64         
 5   product_combination   14634 non-null  object        
 6   total_value_sold      14634 non-null  float64       
 7   total_qty_sold        14634 non-null  int64         
 8   total_kg_sold         14634 non-null  float64       
 9   average               14634 non-null  float64       
 10  highest_price         14634 non-null  float64       
 11  ave_per_kg            14634 non-null  float64       
 12  highest_price_per_kg  14634 non-null  float64       
dtypes: datetime6

In [None]:
# Keep only the rows with a non-zero quantity sold
was_sold = combo_df['total_qty_sold'] != 0
filtered_df = combo_df[was_sold]

In [49]:
# Populate the sales table: one Sales row per filtered_df row, linked to its
# Product, Container and ProductCombination, which are looked up by name.
#
# The original loop ran three SELECTs and one COMMIT for every row, and the
# recorded run ended in a "Communication link failure". Here each distinct
# name is looked up once and cached (including a None result when no match
# exists), and the inserts are committed in batches, which greatly reduces the
# number of server round trips. On any failure the session is rolled back so
# it stays usable, and the error is re-raised; batches committed before the
# failure remain in the database.
SALES_BATCH_SIZE = 1000  # rows staged between commits

sale_product_by_name = {}
sale_container_by_name = {}
sale_combo_by_name = {}

try:
    for position, (_, row) in enumerate(filtered_df.iterrows(), start=1):
        commodity_name = row['commodity']
        if commodity_name not in sale_product_by_name:
            sale_product_by_name[commodity_name] = Product.query.filter_by(name=commodity_name).first()

        container_name = row['container']
        if container_name not in sale_container_by_name:
            sale_container_by_name[container_name] = Container.query.filter_by(name=container_name).first()

        combo_name = row['product_combination']
        if combo_name not in sale_combo_by_name:
            sale_combo_by_name[combo_name] = ProductCombination.query.filter_by(name=combo_name).first()

        invoice = Sales(
            date=row['date'],
            quantity_sold=row['total_qty_sold'],
            kg_sold=row['total_kg_sold'],
            value=row['total_value_sold'],
            average_price=row['average'],
            highest_price=row['highest_price'],
            combination_sale=sale_combo_by_name[combo_name],
            container_sale=sale_container_by_name[container_name],
            product_sale=sale_product_by_name[commodity_name]
        )
        db.session.add(invoice)

        if position % SALES_BATCH_SIZE == 0:
            db.session.commit()

    # Commit whatever is left over from the last, partial batch.
    db.session.commit()
except Exception:
    db.session.rollback()
    raise

OperationalError: (pyodbc.OperationalError) ('08S01', '[08S01] [Microsoft][ODBC SQL Server Driver]Communication link failure (0) (SQLEndTran)')
(Background on this error at: http://sqlalche.me/e/13/e3q8)

In [53]:
# Close the ORM session used for the inserts above
db.session.close()

In [54]:
# Close the connection opened with engine.connect() at the top of the notebook
connection.close()