In [1]:
import urllib
import urllib.parse
import warnings

import numpy as np
import pandas as pd
import requests
from scrapy.http import HtmlResponse
from sqlalchemy import create_engine, insert, Table, MetaData, select
from sqlalchemy import inspect as sa_inspect

# Custom upload with connection string
from engine_info import server_info
# From tables.py
import tables

# Suppress every Python warning for the rest of the session.
# NOTE(review): with no category given this hides all warnings, including
# deprecation notices from pandas and SQLAlchemy - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Open a connection to MS SQL Server.
# The ODBC connection string is URL-encoded so it can travel inside the
# SQLAlchemy URL as the ``odbc_connect`` query parameter.
params = urllib.parse.quote_plus(server_info)
engine = create_engine('mssql+pyodbc:///?odbc_connect=' + params)
connection = engine.connect()

In [3]:
# List the tables reachable through the engine.
# ``Engine.table_names()`` was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported way to get the same list of names.
sa_inspect(engine).get_table_names()

['Capetown_Fresh_produce_market',
 'container',
 'Durban_Fresh_produce_market',
 'inventory',
 'Joburg_Fresh_produce_commodity_raw',
 'Joburg_Fresh_produce_container_raw',
 'Joburg_Fresh_produce_product_combination_raw',
 'Joburg_Fresh_produce_scrapping_date',
 'master_date',
 'PickNPay_Prices',
 'product',
 'product_combination',
 'sales',
 'Shoprite_Prices',
 'sysdiagrams',
 'woolworths_Prices']

In [4]:
# Read the three raw Johannesburg tables (commodity, container and
# product-combination) into dataframes over the open connection.
jhb_comm_df = pd.read_sql_query(
    sql='SELECT * FROM Joburg_Fresh_produce_commodity_raw',
    con=connection,
)
jhb_con_df = pd.read_sql_query(
    sql='SELECT * FROM Joburg_Fresh_produce_container_raw',
    con=connection,
)
jhb_comb_df = pd.read_sql_query(
    sql='SELECT * FROM Joburg_Fresh_produce_product_combination_raw',
    con=connection,
)

In [5]:
def rand_value(df, column):
    """
    Split a combined rand-value text column into daily and Month-To-Date values.

    Every entry of ``column`` must be a string. The Month-To-Date amount is the
    text after the last ``"R"``; the daily amount is the text between the first
    ``"R"`` and the next ``"M"``. Thousands separators (``","``) are removed and
    both amounts are rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``. It is left unmodified.
    column : str
        Name of the text column to parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with the float columns
        ``"MTD_total_value_sold_(R)"`` and ``"total_value_sold_(R)"``.

    Raises
    ------
    AttributeError
        If an entry is not a string (for example a missing value).
    IndexError
        If an entry contains no ``"R"``.
    ValueError
        If the extracted text is not a valid number.
    """

    def _mtd_amount(text):
        # Month-To-Date amount: everything after the last "R".
        return round(float(text.split("R")[-1].replace(",", "")), 2)

    def _daily_amount(text):
        # Daily amount: between the first "R" and the following "M".
        return round(float(text.split("R")[1].split("M")[0].replace(",", "")), 2)

    raw_values = df[column]

    # Work on a new frame so the caller's dataframe is never altered, and assign
    # whole columns: element-wise chained assignment (df[col][i] = ...) is not
    # guaranteed to write back and looks rows up by label rather than position.
    cleaned = df.drop(columns=[column])
    cleaned["MTD_total_value_sold_(R)"] = raw_values.map(_mtd_amount).astype(float)
    cleaned["total_value_sold_(R)"] = raw_values.map(_daily_amount).astype(float)
    return cleaned

In [6]:
# Split the combined rand-value column of the commodity table into daily and
# Month-To-Date columns; the result is kept in a new dataframe.
jhb_comm_df_cleaned = rand_value(df=jhb_comm_df, column="total_value_sold")
# Same treatment for the rand-value column of the container table.
jhb_con_df_cleaned = rand_value(df=jhb_con_df, column="value_sold")

In [7]:
# Preview the first rows of the cleaned commodity dataframe
jhb_comm_df_cleaned.head()

Unnamed: 0,rowid,date,commodity,total_qty_sold,total_kg_sold,qty_available,MTD_total_value_sold_(R),total_value_sold_(R)
0,1,20 August 2020,AMADUMBE,0MTD: 97,"0MTD: 1,940",2,39870.0,0.0
1,2,20 August 2020,APPLES,"13,799MTD: 261,296","157,462MTD: 3,163,863",91755,22664221.0,1205932.0
2,3,20 August 2020,ARTICHOKES,4MTD: 439,3MTD: 522,1,53100.0,600.0
3,4,20 August 2020,ASPARAGUS,50MTD: 359,"250MTD: 1,795",8,258975.0,34000.0
4,5,20 August 2020,ATCHARA,0MTD: 23,0MTD: 65,207,1351.2,0.0


In [8]:
# Preview the first rows of the cleaned container dataframe
jhb_con_df_cleaned.head()

Unnamed: 0,rowid,date,commodity,container,qty_available,qty_sold,kg_sold,average_price_per_kg,MTD_total_value_sold_(R),total_value_sold_(R)
0,1,20 August 2020,AMADUMBE,20KG POCKET,2,0MTD: 97,"0MTD: 1,940",R0,39870.0,0.0
1,2,20 August 2020,APPLES,10 X 1KG ECONO PACK CARTON,17,1MTD: 547,"10MTD: 5,470",R10,41932.0,100.0
2,3,20 August 2020,APPLES,11KG JUMBLE CARTON,343,"17MTD: 3,170","187MTD: 34,870",R6.36,218914.0,1190.0
3,4,20 August 2020,APPLES,12 X 1KG ECONO PACK CARTON,3233,"432MTD: 17,353","5184MTD: 208,236",R7.86,1454572.0,40738.0
4,5,20 August 2020,APPLES,12.5KG M6 CARTON,12,0MTD: 286,"0MTD: 3,575",R0,16903.0,0.0


In [9]:
def rand_value(df, column):
    """
    Split a combined rand-value text column into daily and Month-To-Date values.

    NOTE(review): this cell re-defines ``rand_value``, which an earlier cell
    already defines; this later definition is the one in effect from here on.
    Consider keeping a single definition.

    Every entry of ``column`` must be a string. The Month-To-Date amount is the
    text after the last ``"R"``; the daily amount is the text between the first
    ``"R"`` and the next ``"M"``. Thousands separators (``","``) are removed and
    both amounts are rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``. It is left unmodified.
    column : str
        Name of the text column to parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with the float columns
        ``"MTD_total_value_sold_(R)"`` and ``"total_value_sold_(R)"``.

    Raises
    ------
    AttributeError
        If an entry is not a string (for example a missing value).
    IndexError
        If an entry contains no ``"R"``.
    ValueError
        If the extracted text is not a valid number.
    """

    def _parse_amount(text):
        # Strip thousands separators, then round to cents.
        return round(float(text.replace(",", "")), 2)

    source = df[column]
    # Month-To-Date amount: everything after the last "R".
    month_to_date = source.map(lambda text: _parse_amount(text.split("R")[-1]))
    # Daily amount: between the first "R" and the following "M".
    daily = source.map(lambda text: _parse_amount(text.split("R")[1].split("M")[0]))

    # Build the result on a new frame with whole-column assignment. Element-wise
    # chained assignment (df[col][i] = ...) is not guaranteed to write back,
    # looks rows up by label rather than position, and would alter the caller's
    # dataframe.
    result = df.drop(columns=[column])
    result["MTD_total_value_sold_(R)"] = month_to_date.astype(float)
    result["total_value_sold_(R)"] = daily.astype(float)
    return result

In [10]:
# Split the combined rand-value column of the commodity table into daily and
# Month-To-Date columns; the result is kept in a new dataframe.
# NOTE(review): an earlier cell already performs these same two calls.
jhb_comm_df_cleaned = rand_value(df=jhb_comm_df, column="total_value_sold")
# Same treatment for the rand-value column of the container table.
jhb_con_df_cleaned = rand_value(df=jhb_con_df, column="value_sold")

In [11]:
def sold_qty_kg(df, quatity_sold, weight_sold):
    """
    Split combined "daily + Month-To-Date" quantity and weight text columns.

    Every entry of the two source columns must be a string. The daily figure is
    the text before the first ``"M"``; the Month-To-Date figure is the text
    after the last ``":"``. Thousands separators (``","``) are removed.
    Quantities are parsed as integers and weights as floats; all four result
    columns are stored as floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both source columns. It is left unmodified.
    quatity_sold : str
        Name of the text column holding the quantity sold.
    weight_sold : str
        Name of the text column holding the weight sold.

    Returns
    -------
    pandas.DataFrame
        A new frame without the two source columns and with the columns
        ``"Total_quatity_sold"``, ``"MTD_Total_quatity_sold"``,
        ``"Total_kg_sold"`` and ``"MTD_total_kg_sold"``.

    Raises
    ------
    AttributeError
        If an entry is not a string (for example a missing value).
    ValueError
        If the extracted text is not a valid number.
    """

    def _daily_text(text):
        # Daily figure: everything before the first "M".
        return text.split("M")[0].replace(",", "")

    def _mtd_text(text):
        # Month-To-Date figure: everything after the last ":".
        return text.split(":")[-1].replace(",", "")

    quantity_text = df[quatity_sold]
    weight_text = df[weight_sold]

    # Build the result on a new frame with whole-column assignment. Element-wise
    # chained assignment (df[col][i] = ...) is not guaranteed to write back,
    # looks rows up by label rather than position, and would alter the caller's
    # dataframe.
    cleaned = df.drop(columns=[quatity_sold, weight_sold])
    cleaned["Total_quatity_sold"] = quantity_text.map(
        lambda text: int(_daily_text(text))
    ).astype(float)
    cleaned["MTD_Total_quatity_sold"] = quantity_text.map(
        lambda text: int(_mtd_text(text))
    ).astype(float)
    cleaned["Total_kg_sold"] = weight_text.map(
        lambda text: float(_daily_text(text))
    ).astype(float)
    cleaned["MTD_total_kg_sold"] = weight_text.map(
        lambda text: float(_mtd_text(text))
    ).astype(float)
    return cleaned

In [12]:
# Split the combined quantity / weight text columns of both cleaned frames
# into separate daily and Month-To-Date columns.
jhb_comm_df_cleaned = sold_qty_kg(
    df=jhb_comm_df_cleaned,
    quatity_sold="total_qty_sold",
    weight_sold="total_kg_sold",
)
jhb_con_df_cleaned = sold_qty_kg(
    df=jhb_con_df_cleaned,
    quatity_sold="qty_sold",
    weight_sold="kg_sold",
)

In [13]:
def to_numeric(df, column):
    """
    Convert a text column holding formatted numbers into a numeric column.

    Every ``"R"`` and ``","`` character is removed from the string entries of
    ``column`` and the result is passed to ``pandas.to_numeric``. Entries that
    are not strings (numbers, missing values) are passed through unchanged, so
    calling the function again on an already converted column is harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``. The column is replaced in place.
    column : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` converted to a numeric dtype.

    Raises
    ------
    ValueError
        If a cleaned entry cannot be parsed as a number.
    """

    def _strip_formatting(value):
        # Only text carries the "R" prefix and thousands separators.
        if isinstance(value, str):
            return value.replace("R", "").replace(",", "")
        return value

    # Assign the whole column at once: element-wise chained assignment
    # (df[column][i] = ...) is not guaranteed to write back and looks rows up
    # by label rather than position.
    df[column] = pd.to_numeric(df[column].map(_strip_formatting))
    return df

In [14]:
# Clean the Johannesburg product-combination dataframe:
# parse the date column, then convert each formatted price/value text column
# to a numeric column.
jhb_comb_df["date"] = pd.to_datetime(jhb_comb_df["date"])
for money_column in (
    "total_value_sold",
    "average",
    "highest_price",
    "ave_per_kg",
    "highest_price_per_kg",
):
    jhb_comb_df = to_numeric(jhb_comb_df, money_column)

In [15]:
# Convert the container frame's average price per kg from text to a number
jhb_con_df_cleaned = to_numeric(df=jhb_con_df_cleaned, column="average_price_per_kg")

In [16]:
# Append the cleaned commodity rows to their SQL Server table (index not written)
jhb_comm_df_cleaned.to_sql(
    name='Joburg_Fresh_produce_commodity_cleaned',
    con=engine,
    if_exists='append',
    index=False,
)

In [17]:
# Append the cleaned container rows to their SQL Server table (index not written)
jhb_con_df_cleaned.to_sql(
    name='Joburg_Fresh_produce_container_cleaned',
    con=engine,
    if_exists='append',
    index=False,
)

In [18]:
# Append the cleaned product-combination rows to their SQL Server table (index not written)
jhb_comb_df.to_sql(
    name='Joburg_Fresh_produce_combined_cleaned',
    con=engine,
    if_exists='append',
    index=False,
)

In [19]:
# Release the database connection opened at the start of the notebook
connection.close()