Library

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import datetime as dt

Symbol

In [3]:
# Load the SymbolID -> Symbol lookup table and discard its 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved - confirm)
sql_symbol = pd.read_csv("symbol_id.csv").drop(columns=['Unnamed: 0'])

# Same lookup as a plain dict: key = SymbolID, value = Symbol
dict_sql_symbol = {
    symbol_id: symbol
    for symbol_id, symbol in zip(sql_symbol['SymbolID'], sql_symbol['Symbol'])
}

File paths

In [4]:
# Folder that holds the bank income-statement workbooks. It is the only
# machine-specific part of the three paths below, so change it here once
# instead of editing every path.
BANK_DATA_DIR = r"/Users/rainmeteror/Desktop/BSC_Algo_Database_UAT/OneDrive_1_5-29-2023/bank"

# Assign pathlink: Income Statement Bank
algo_bis_pathlink = BANK_DATA_DIR + "/algo_bis.xlsx"    # data read into df_algo
sql_bis_pathlink = BANK_DATA_DIR + "/sql_bis.xlsx"      # data read into df_sql
same_field_data = BANK_DATA_DIR + "/bis_datafield.xlsx"  # algo <-> sql field-name mapping

Algo Database

In [5]:
# Read data from algo database and keep only what the comparison needs.
# Everything is done in one chain so each step returns a fresh frame: the
# earlier version filtered with .loc and then assigned new columns onto that
# filtered result, which can raise SettingWithCopyWarning.
df_algo = (
    pd.read_excel(algo_bis_pathlink)
    # Columns that take no part in the comparison
    .drop(columns=['Unnamed: 0', 'SCN', 'REPORTED_DATE', 'AUDITED', 'MONTH_IN_PERIOD', 'CREATED_DATE', 'UPDATED_DATE'])
    # Discard rows whose FREQ_CODE is 'Y' (presumably the yearly reports - confirm)
    .loc[lambda frame: frame['FREQ_CODE'] != 'Y']
    # Derive the YEAR / QUARTER columns from REPORT_DATE
    .assign(
        YEAR=lambda frame: frame['REPORT_DATE'].dt.year,
        QUARTER=lambda frame: frame['REPORT_DATE'].dt.quarter,
    )
    .drop(columns=['REPORT_DATE', 'FREQ_CODE'])
    # Sort Values
    .sort_values(by=['YEAR', 'QUARTER', 'SECURITY_CODE'], ascending=[True, True, True])
)

# Print how many rows are used
print("ALGO DATABASE BANKS: There are: " + str(len(df_algo)) + " rows")

ALGO DATABASE BANKS: There are: 400 rows


In [6]:
# Preview the first row of the prepared algo table
df_algo.head(n=1)

Unnamed: 0,SECURITY_CODE,BIS_1,BIS_2,BIS_3,BIS_4,BIS_5,BIS_6,BIS_7,BIS_8,BIS_9,...,BIS_17,BIS_18,BIS_19,BIS_20,BIS_22,BIS_22A,BIS_23,BIS_21,YEAR,QUARTER
473,ABB,1708669895628,-1176205512866,532464382762,72715267319,-46188566079,26526701240,143424678052,0,167082584092,...,380793754580,-3877223508,0,-3877223508,0,376916531072,0.0,,2020,1


Stockbiz Database

In [7]:
# Read data from sql database and shape it like the algo table.
# Written as one chain so no step mutates a filtered slice in place (the
# earlier version sorted and assigned onto the result of .loc, which can
# raise SettingWithCopyWarning).
df_sql = (
    pd.read_excel(sql_bis_pathlink)
    # Columns that take no part in the comparison
    .drop(columns=['Unnamed: 0', 'ReportID', 'LastUpdated', 'ApprovedBy', 'Currency', 'CreatedBy', 'LastUpdatedBy',
                   'ReportStatus', 'AssignedTo', 'CreatedAt'])
    # Keep rows with a non-zero Quarter from years after 2019
    # (Quarter 0 presumably marks full-year reports - confirm)
    .loc[lambda frame: (frame['Quarter'] != 0) & (frame['Year'] > 2019)]
    .sort_values(by=['Year', 'Quarter'], ascending=[True, True])
    # Change the name of tickers: numeric SymbolID -> ticker symbol.
    # IDs missing from dict_sql_symbol become NaN.
    .assign(SymbolID=lambda frame: frame['SymbolID'].map(dict_sql_symbol))
    # Use the same key column names as df_algo
    .rename(
        columns={"SymbolID": "SECURITY_CODE",
                 "Year": "YEAR",
                 "Quarter": "QUARTER"}
    )
)

# Print how many rows are used
print("STOCKBIZ DATABASE BANKS: There are: " + str(len(df_sql)) + " rows")

STOCKBIZ DATABASE BANKS: There are: 373 rows


In [8]:
# Preview the first row of the prepared sql table
df_sql.head(n=1)

Unnamed: 0,SECURITY_CODE,QUARTER,YEAR,InterestAndSimilarIncome,InterestExpenseAndSimilarCharges,NetInterestIncome,ServiceIncome,ServiceExpense,NetServiceIncomeExpense,ForeignExchangeIncomeExpense,...,PresentCorporateIncomeTaxExpenses,DeferredIncomeTaxesExpenses,CorporateIncomeTaxExpenses,NetProfitAfterTaxes,ProfitAttributableToMinorityInterestsAndFavourDevidends,ProfitAfterCorporateIncomeTaxes,Shares,EPS,DilutedEPS,Dividend
33,STB,1,2020,8486713000000.0,-5646836000000.0,2839877000000.0,1126417000000.0,-405121000000.0,721296000000.0,232683000000.0,...,-202142000000.0,0.0,-202142000000.0,785741000000.0,0.0,785741000000.0,,,,


In [9]:
# Distinct tickers found in the sql table; used later to restrict df_algo
# to the same set of symbols
banking_symbol = pd.unique(df_sql['SECURITY_CODE'])

Process the two tables

In [10]:
# Load the field-name mapping sheet; rows containing any missing cell are dropped
same_column_name = pd.read_excel(same_field_data).dropna(axis=0)

# Lookup keyed by the algo field name ('bis_algo') whose value is the matching
# sql field name ('bis_sql')
dict_column_name = {
    algo_name: sql_name
    for algo_name, sql_name in zip(same_column_name['bis_algo'], same_column_name['bis_sql'])
}

In [11]:
# Preview the first row of the field-name mapping
same_column_name.head(n=1)

Unnamed: 0,bis_algo,bis_sql
0,BIS_1,InterestAndSimilarIncome


In [12]:
# Change the name of df_sql columns to the name of the similar data field in df_algo.
# dict_column_name is keyed by the algo names, so passing it to rename() matches
# no column of df_sql and silently renames nothing. The sql -> algo mapping is
# therefore built here, straight from the mapping sheet.
sql_to_algo_name = dict(zip(same_column_name['bis_sql'], same_column_name['bis_algo']))
df_sql = df_sql.rename(columns=sql_to_algo_name)

# Fill NA values in df_sql
df_sql = df_sql.fillna(0)

# Change the type of data which is same between two tables: the shared fields
# are cast to the nullable integer dtype (the renamed columns are cast directly
# instead of being copied into additional columns)
shared_algo_fields = list(same_column_name['bis_algo'])
df_sql[shared_algo_fields] = df_sql[shared_algo_fields].astype('Int64')

In [13]:
def _to_numeric_if_possible(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if that fails.

    Explicit replacement for ``pd.to_numeric(..., errors="ignore")``, which is
    deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Change the type of data in df_algo
# Just get the same data between two tables
shared_algo_fields = list(same_column_name['bis_algo'])
df_algo[shared_algo_fields] = df_algo[shared_algo_fields].apply(_to_numeric_if_possible)

# Change the type of YEAR and QUARTER to the nullable integer dtype
df_algo = df_algo.astype(
    {"YEAR": "Int64",
     "QUARTER": "Int64"},
)

# Keep only the symbols that also appear in the sql table (banking sector)
df_algo = df_algo.loc[df_algo['SECURITY_CODE'].isin(banking_symbol)]

In [14]:
# Cast BIS_21 to pandas' nullable integer dtype, which can hold missing values
df_algo = df_algo.astype({"BIS_21": pd.Int64Dtype()})

In [15]:
# Columns kept in both tables: the three key columns first, then every algo
# field listed in the mapping sheet
final_column = ["SECURITY_CODE", "YEAR", "QUARTER"] + list(same_column_name['bis_algo'])

In [16]:
# Reduce both tables to the same columns, in the same order
df_sql = df_sql.loc[:, final_column]
df_algo = df_algo.loc[:, final_column]

In [17]:
def compare_table(df1: pd.DataFrame, df2: pd.DataFrame, columns) -> pd.DataFrame:
    """Return the rows that are not shared by ``df1`` and ``df2``.

    The two frames are outer-merged on ``columns``; rows found on both sides
    are discarded and everything else is returned.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Frames to compare. Both must contain every label in ``columns``.
    columns : list
        Column labels used as the merge key.

    Returns
    -------
    pd.DataFrame
        The unmatched rows, with an extra ``_merge`` column holding
        ``'left_only'`` for rows found only in ``df1`` and ``'right_only'``
        for rows found only in ``df2``.
    """
    # indicator=True adds the `_merge` column that the query below filters on
    merged = pd.merge(df1, df2, on=columns, how='outer', indicator=True)
    return merged.query("_merge != 'both'")

In [18]:
# For every compared field, record the percentage of rows that do not match
# between the two tables.
final_result = {"field": [], "value_not_same": []}

# The denominator is the combined row count of both tables; it does not change
# inside the loop
total_rows = len(df_algo) + len(df_sql)

# The first three entries of final_column are the key columns; the rest are
# the fields to compare one at a time
for field in final_column[3:]:
    check_columns = ["SECURITY_CODE", 'YEAR', 'QUARTER', field]
    unmatched = compare_table(
        df1=df_sql[check_columns],
        df2=df_algo[check_columns],
        columns=check_columns,
    )

    final_result["field"].append(field)
    final_result["value_not_same"].append(len(unmatched) / total_rows * 100)

In [19]:
# Write the per-field mismatch percentages to an Excel workbook
result_pathlink = r"/Users/rainmeteror/Desktop/BSC_Algo_Database_UAT/OneDrive_1_5-29-2023/result/check_bis.xlsx"
result_table = pd.DataFrame(final_result)
result_table.to_excel(result_pathlink)