In [428]:
import datetime
from os import listdir
from os.path import isdir, isfile, join
from unicodedata import normalize

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [429]:
# test code
# Scratch check: print the position of every table in one saved page that
# exposes a 'Security' column.
file = 'D:/MyDocuments/Logs/HL/Investnents/2020-11-07/Artemis Global Income _ Security details _ My accounts & dealing _ Hargreaves Lansdown.html'
tables_raw = pd.read_html(file)
tables = list(tables_raw)

# enumerate() ties the printed number to the table's real position. A counter
# that only advances on non-matching tables reports a wrong index for every
# match after the first one.
for idx, table in enumerate(tables):
    df_temp = pd.DataFrame(table)
    if 'Security' in df_temp.columns:
        print(idx)

# df_temp = pd.DataFrame(tables[8])
# df_temp


7


In [430]:
def validate_file(file: str, header_rows: int, cols_required: list) -> int:
    """Find the first table in an HTML file that has every required column.

    Args:
        file: Path (or anything ``pd.read_html`` accepts) of the page to scan.
        header_rows: Accepted for interface compatibility; not used here.
        cols_required: Column names that must all be present in the table.

    Returns:
        Zero-based position of the first qualifying table, or -1 when no
        table in the file contains all of the required columns.
    """
    for position, candidate in enumerate(pd.read_html(file)):
        available = pd.DataFrame(candidate).columns
        # The first table that carries every required column wins.
        if all(name in available for name in cols_required):
            return position

    return -1

In [439]:
def validate_file_set(files_in: list, header_rows: int, cols_required: list) -> list:
    """Locate the table of interest in every file of a set.

    Args:
        files_in: File descriptors (dicts). Each must provide 'Date' and
            'Full_File_Path'; 'File' and 'Account' are optional.
        header_rows: Forwarded unchanged to ``validate_file``.
        cols_required: Column names the wanted table must contain.

    Returns:
        One dict per input file holding 'Date', 'Full_File_Path' and
        'Table_Number' (the value returned by ``validate_file``; -1 marks a
        file without a qualifying table), plus 'File' and 'Account' whenever
        the input descriptor has them.
    """
    validity = []

    for file in files_in:
        full_file_path = file['Full_File_Path']
        result = {
            'Date': file['Date'],
            'Full_File_Path': full_file_path,
            'Table_Number': validate_file(full_file_path, header_rows, cols_required),
        }
        # Pass the optional descriptive keys through. Callers report invalid
        # files via file['File'], which used to be dropped here.
        for optional_key in ('File', 'Account'):
            if optional_key in file:
                result[optional_key] = file[optional_key]
        validity.append(result)

    return validity

In [440]:
def process_file_set(files_in: list, header_rows: int, cols_required: list) -> pd.DataFrame:
    """Extract the located table from every file and stack the results.

    Args:
        files_in: Dicts as produced by ``validate_file_set``; each must
            provide 'Full_File_Path', 'Date' and 'Table_Number', and may
            provide 'Account'.
        header_rows: Accepted for interface compatibility; not used here.
        cols_required: Columns to keep from each extracted table.

    Returns:
        The rows of all files concatenated (each file keeps its own row
        index), with columns ``cols_required`` + 'Date' (+ 'Account' when
        supplied). An empty frame with ``cols_required`` + 'Date' columns is
        returned when ``files_in`` is empty.

    Raises:
        ValueError: If a file carries a negative 'Table_Number', i.e. no
            matching table was found for it during validation.
    """
    frames = []

    for file in files_in:
        full_file_path = file['Full_File_Path']
        table_number = file['Table_Number']

        # A negative number is the "no matching table" marker. Using it as a
        # list index would silently select a table from the end of the page.
        if table_number < 0:
            raise ValueError(
                'No table with the required columns was located in ' + str(full_file_path)
            )

        tables = pd.read_html(full_file_path)
        # assign() returns a new frame, so nothing is written to a slice of
        # the parsed table.
        df_out_file = tables[table_number][cols_required].assign(Date=file['Date'])
        if 'Account' in file:
            df_out_file = df_out_file.assign(Account=file['Account'])
        frames.append(df_out_file)

    if not frames:
        return pd.DataFrame(columns=list(cols_required) + ['Date'])

    # One concat at the end instead of re-copying the accumulated frame on
    # every iteration.
    return pd.concat(frames, axis=0)

In [441]:
pd.options.display.max_colwidth = 100
root_path = 'D:/MyDocuments/Logs/HL/'

# Read the three metadata sheets in a single pass over the workbook; passing a
# list of sheet names makes read_excel return a dict keyed by sheet name.
metadata_sheets = pd.read_excel(
    root_path + 'InvestmentMetadata.xlsx',
    sheet_name=['FundMetadata', 'StockFundMapping', 'FileFundMapping'],
)
df_fund_metadata = metadata_sheets['FundMetadata']
df_stock_fund_mapping = metadata_sheets['StockFundMapping']
df_file_fund_mapping = metadata_sheets['FileFundMapping']

In [442]:
# Build one record per saved investment page. Each sub-folder of inv_path is
# named after the snapshot date and holds the pages saved on that day.
inv_path = root_path + 'Investnents/'
inv_files = []
onlyfiles = [f for f in listdir(inv_path) if isfile(join(inv_path, f))]
# Keep directories only: a loose file here would break both the date parsing
# and the nested listdir below.
onlyfolders = [f for f in listdir(inv_path) if isdir(join(inv_path, f))]

for folder in onlyfolders:
    # Parsing the name fails fast when a folder is not named after a date.
    date = pd.to_datetime(folder)
    inv_path_date = inv_path + folder
    inv_file_names = [f for f in listdir(inv_path_date) if isfile(join(inv_path_date, f))]

    for file in inv_file_names:
        # The path must use the loop variable `file`; the previous name
        # `inv_file_name` was never defined in this notebook.
        inv_file = {'Date': folder, 'File': file, 'Full_File_Path': inv_path + folder + '/' + file}
        inv_files.append(inv_file)

# Explicit columns keep the merge key present even when no file was found.
df_inv = pd.DataFrame(inv_files, columns=['Date', 'File', 'Full_File_Path'])
df_inv = df_inv.merge(df_file_fund_mapping, left_on='File', right_on='File', how='left')

In [443]:
# file name validation
# 'File' comes from the directory listing, so it is never null after the left
# merge. A file with no row in the file-to-fund mapping shows up as a missing
# 'Fund' instead, so that is the column to test.
df_inv.loc[df_inv['Fund'].isnull(), ['Date', 'File']]

Unnamed: 0,Date,File


In [444]:
# Attach fund metadata to each file row (inner join on 'Fund') and keep only
# the columns needed downstream.
cols = ['Date', 'Fund', 'Return_Type', 'Currency']
df_inv = pd.merge(df_inv, df_fund_metadata, how='inner', on='Fund').loc[:, cols]
df_inv.head()

Unnamed: 0,Date,Fund,Return_Type,Currency
0,2020-11-07,Aberdeen Standard Global Innovation Equit Class R2 - Accumulation (GBP),Accumulation,GBP
1,2020-11-21,Aberdeen Standard Global Innovation Equit Class R2 - Accumulation (GBP),Accumulation,GBP
2,2020-11-28,Aberdeen Standard Global Innovation Equit Class R2 - Accumulation (GBP),Accumulation,GBP
3,2020-12-05,Aberdeen Standard Global Innovation Equit Class R2 - Accumulation (GBP),Accumulation,GBP
4,2020-11-07,Artemis Global Income Class I - Income (GBP),Income,GBP


In [445]:
# security table validation
# Locate the table holding 'Security' and 'Weight' in every investment page
# and list the pages where no such table exists (Table_Number == -1).
cols_required = ['Security', 'Weight']

df_security_table_nos = validate_file_set(inv_files, 0, cols_required)
for file in df_security_table_nos:
    if file['Table_Number'] == -1:
        # The result dicts are not guaranteed to carry a 'File' key; fall back
        # to the full path instead of raising KeyError on the first bad file.
        print(file.get('File', file['Full_File_Path']))

In [446]:
# sector table validation
# Locate the table holding 'Sector' and 'Weight' in every investment page and
# list the pages where no such table exists (Table_Number == -1).
cols_required = ['Sector', 'Weight']

df_sector_table_nos = validate_file_set(inv_files, 0, cols_required)
for file in df_sector_table_nos:
    if file['Table_Number'] == -1:
        # The result dicts are not guaranteed to carry a 'File' key; fall back
        # to the full path instead of raising KeyError on the first bad file.
        print(file.get('File', file['Full_File_Path']))

In [447]:
# country table validation
# Locate the table holding 'Country' and 'Weight' in every investment page and
# list the pages where no such table exists (Table_Number == -1).
cols_required = ['Country', 'Weight']

df_country_table_nos = validate_file_set(inv_files, 0, cols_required)
for file in df_country_table_nos:
    if file['Table_Number'] == -1:
        # The result dicts are not guaranteed to carry a 'File' key; fall back
        # to the full path instead of raising KeyError on the first bad file.
        print(file.get('File', file['Full_File_Path']))

In [448]:
# security df extraction
# For every validated investment page, take the table located above, keep its
# 'Security' and 'Weight' columns and stack all pages into one frame.
cols_required = ['Security', 'Weight']
df_security = process_file_set(df_security_table_nos, 0, cols_required)

In [449]:
# sector df extraction
# For every validated investment page, take the table located above, keep its
# 'Sector' and 'Weight' columns and stack all pages into one frame.
cols_required = ['Sector', 'Weight']
df_sector = process_file_set(df_sector_table_nos, 0, cols_required)

In [450]:
# country df extraction
# For every validated investment page, take the table located above, keep its
# 'Country' and 'Weight' columns and stack all pages into one frame.
cols_required = ['Country', 'Weight']
df_country = process_file_set(df_country_table_nos, 0, cols_required)

In [460]:
# read position file names
# Build one record per saved account-summary page, tagging it with the
# snapshot date and the account type found in the file name.
position_path = root_path + 'Account-Summary/'
position_files = []
contents = [f for f in listdir(position_path) if isfile(join(position_path, f))]

for file in contents:
    # ISA is tested before SIPP, matching the original if/elif order.
    account = next((name for name in ('ISA', 'SIPP') if name in file), None)
    if account is None:
        # Without this guard `account` would be undefined for the first such
        # file, or silently keep the value assigned for the previous file.
        print('Skipping file with unrecognised account type: ' + file)
        continue

    # NOTE(review): assumes characters 18-27 of the file name hold the
    # snapshot date; confirm against the file naming convention.
    date = pd.to_datetime(file[18:28])
    position_file = {'Date': date, 'File': file, 'Full_File_Path': position_path + file, 'Account': account}
    position_files.append(position_file)

In [461]:
# account summary table validation
# Locate the holdings table in every account-summary page and list the pages
# where no table has all required columns (Table_Number == -1).
cols_required = ['Stock', 'Unitsheld', 'Price(pence)', 'Value(£)']

df_position_table_nos = validate_file_set(position_files, 0, cols_required)
for file in df_position_table_nos:
    if file['Table_Number'] == -1:
        # The result dicts are not guaranteed to carry a 'File' key; fall back
        # to the full path instead of raising KeyError on the first bad file.
        print(file.get('File', file['Full_File_Path']))

In [462]:
# account summary df extraction
# For every validated account-summary page, take the table located above,
# keep the holdings columns and stack all pages into one frame.
cols_required = ['Stock', 'Unitsheld', 'Price(pence)', 'Value(£)']
df_position = process_file_set(df_position_table_nos, 0, cols_required)

In [463]:
# Preview the first rows of the combined account-summary frame.
df_position.head()

Unnamed: 0,Stock,Unitsheld,Price(pence),Value(£),Date,Account
0,ASI Global Smaller Companies Class S - Accumulation (GBP),3641.661,142.5,5189.37,2020-11-07,ISA
1,Baillie Gifford Managed Class B - Accumulation (GBP),334.448,1491.0,4986.62,2020-11-07,ISA
2,BlackRock Gold & General Class D1 - Accumulation (GBP),240.24,1744.0,4189.79,2020-11-07,ISA
3,Fidelity Special Situations Class W - Accumulation (GBP),175.75,2897.0,5091.48,2020-11-07,ISA
4,Jupiter Income Class Z - Income (GBP),1282.31,404.52,5187.2,2020-11-07,ISA


In [464]:
# Tidy the raw holdings: shorter column names, real datetimes, no 'Total'
# summary rows, and prices divided by 100 (source column is 'Price(pence)').
rename = {'Unitsheld': 'Units', 'Price(pence)': 'Price', 'Value(£)': 'Value'}

# One chain that builds a new frame at each step: nothing is renamed in place
# and nothing is assigned onto a filtered slice, which is what triggers
# SettingWithCopyWarning.
df_position = (
    df_position
    .rename(columns=rename)
    .assign(Date=lambda df: pd.to_datetime(df['Date']))
    .loc[lambda df: df['Stock'] != 'Total', :]
    .assign(Price=lambda df: df['Price'] / 100)
)

In [467]:
# Display the holdings after renaming, removing 'Total' rows and rescaling Price.
df_position

Unnamed: 0,Stock,Units,Price,Value,Date,Account
0,ASI Global Smaller Companies Class S - Accumulation (GBP),3641.661,1.4250,5189.37,2020-11-07,ISA
1,Baillie Gifford Managed Class B - Accumulation (GBP),334.448,14.9100,4986.62,2020-11-07,ISA
2,BlackRock Gold & General Class D1 - Accumulation (GBP),240.240,17.4400,4189.79,2020-11-07,ISA
3,Fidelity Special Situations Class W - Accumulation (GBP),175.750,28.9700,5091.48,2020-11-07,ISA
4,Jupiter Income Class Z - Income (GBP),1282.310,4.0452,5187.20,2020-11-07,ISA
...,...,...,...,...,...,...
11,Legal & General UK Mid Cap Index Class I - Accumulation (GBP),9885.330,0.5844,5776.99,2020-12-05,SIPP
12,LF Majedie UK Equity Class X - Income (GBP),3955.696,1.3980,5530.06,2020-12-05,SIPP
13,Man GLG Japan CoreAlpha Professional Class - Income (GBP),2804.262,1.9800,5552.44,2020-12-05,SIPP
14,Ninety One Global Gold Class I - Accumulation (GBP),1707.067,2.0583,3513.66,2020-12-05,SIPP


In [466]:
# Preview the stock-to-fund mapping loaded from the 'StockFundMapping' sheet.
df_stock_fund_mapping.head()

Unnamed: 0,Fund,Stock
0,Aberdeen Standard Global Innovation Equit Class R2 - Accumulation (GBP),Aberdeen Standard Global Innovation Equit Class R2 - Accumulation (GBP)
1,Artemis Global Income Class I - Income (GBP),Artemis Global Income Class I - Income (GBP)
2,ASI Asia Pacific Equity Class I - Accumulation (GBP),ASI Asia Pacific Equity Class I - Accumulation (GBP)
3,ASI Global Smaller Companies Class S - Accumulation (GBP),ASI Global Smaller Companies Class S - Accumulation (GBP)
4,Baillie Gifford Managed Class B - Accumulation (GBP),Baillie Gifford Managed Class B - Accumulation (GBP)


In [468]:
# Add the 'Fund' name for each holding; the left join keeps every holding
# row even when the mapping has no entry for its 'Stock'.
df_position = pd.merge(df_position, df_stock_fund_mapping, how='left', on='Stock')

In [470]:
# Display the holdings with the mapped 'Fund' column attached.
df_position

Unnamed: 0,Stock,Units,Price,Value,Date,Account,Fund
0,ASI Global Smaller Companies Class S - Accumulation (GBP),3641.661,1.4250,5189.37,2020-11-07,ISA,ASI Global Smaller Companies Class S - Accumulation (GBP)
1,Baillie Gifford Managed Class B - Accumulation (GBP),334.448,14.9100,4986.62,2020-11-07,ISA,Baillie Gifford Managed Class B - Accumulation (GBP)
2,BlackRock Gold & General Class D1 - Accumulation (GBP),240.240,17.4400,4189.79,2020-11-07,ISA,BlackRock Gold & General Class D1 - Accumulation (GBP)
3,Fidelity Special Situations Class W - Accumulation (GBP),175.750,28.9700,5091.48,2020-11-07,ISA,Fidelity Special Situations Class W - Accumulation (GBP)
4,Jupiter Income Class Z - Income (GBP),1282.310,4.0452,5187.20,2020-11-07,ISA,Jupiter Income Class Z - Income (GBP)
...,...,...,...,...,...,...,...
115,Legal & General UK Mid Cap Index Class I - Accumulation (GBP),9885.330,0.5844,5776.99,2020-12-05,SIPP,Legal & General UK Mid Cap Index Class I - Accumulation (GBP)
116,LF Majedie UK Equity Class X - Income (GBP),3955.696,1.3980,5530.06,2020-12-05,SIPP,LF Majedie UK Equity Class X - Income (GBP)
117,Man GLG Japan CoreAlpha Professional Class - Income (GBP),2804.262,1.9800,5552.44,2020-12-05,SIPP,Man GLG Japan CoreAlpha Professional Class - Income (GBP)
118,Ninety One Global Gold Class I - Accumulation (GBP),1707.067,2.0583,3513.66,2020-12-05,SIPP,Ninety One Global Gold Class I - Accumulation (GBP)


In [471]:
# Add the fund metadata columns to each holding; the left join keeps every
# holding row even when the metadata has no entry for its 'Fund'.
df_position = pd.merge(df_position, df_fund_metadata, how='left', on='Fund')

In [472]:
# Display the holdings with the fund metadata columns attached.
df_position

Unnamed: 0,Stock,Units,Price,Value,Date,Account,Fund,Return_Type,Currency
0,ASI Global Smaller Companies Class S - Accumulation (GBP),3641.661,1.4250,5189.37,2020-11-07,ISA,ASI Global Smaller Companies Class S - Accumulation (GBP),Accumulation,GBP
1,Baillie Gifford Managed Class B - Accumulation (GBP),334.448,14.9100,4986.62,2020-11-07,ISA,Baillie Gifford Managed Class B - Accumulation (GBP),Accumulation,GBP
2,BlackRock Gold & General Class D1 - Accumulation (GBP),240.240,17.4400,4189.79,2020-11-07,ISA,BlackRock Gold & General Class D1 - Accumulation (GBP),Accumulation,GBP
3,Fidelity Special Situations Class W - Accumulation (GBP),175.750,28.9700,5091.48,2020-11-07,ISA,Fidelity Special Situations Class W - Accumulation (GBP),Accumulation,GBP
4,Jupiter Income Class Z - Income (GBP),1282.310,4.0452,5187.20,2020-11-07,ISA,Jupiter Income Class Z - Income (GBP),Income,GBP
...,...,...,...,...,...,...,...,...,...
115,Legal & General UK Mid Cap Index Class I - Accumulation (GBP),9885.330,0.5844,5776.99,2020-12-05,SIPP,Legal & General UK Mid Cap Index Class I - Accumulation (GBP),Accumulation,GBP
116,LF Majedie UK Equity Class X - Income (GBP),3955.696,1.3980,5530.06,2020-12-05,SIPP,LF Majedie UK Equity Class X - Income (GBP),Income,GBP
117,Man GLG Japan CoreAlpha Professional Class - Income (GBP),2804.262,1.9800,5552.44,2020-12-05,SIPP,Man GLG Japan CoreAlpha Professional Class - Income (GBP),Income,GBP
118,Ninety One Global Gold Class I - Accumulation (GBP),1707.067,2.0583,3513.66,2020-12-05,SIPP,Ninety One Global Gold Class I - Accumulation (GBP),Accumulation,GBP


In [473]:
# Keep only the reporting columns, in presentation order, and preview them.
cols = ['Date', 'Account', 'Fund', 'Return_Type', 'Currency', 'Units', 'Price', 'Value']
df_position = df_position.loc[:, cols]
df_position.head()

Unnamed: 0,Date,Account,Fund,Return_Type,Currency,Units,Price,Value
0,2020-11-07,ISA,ASI Global Smaller Companies Class S - Accumulation (GBP),Accumulation,GBP,3641.661,1.425,5189.37
1,2020-11-07,ISA,Baillie Gifford Managed Class B - Accumulation (GBP),Accumulation,GBP,334.448,14.91,4986.62
2,2020-11-07,ISA,BlackRock Gold & General Class D1 - Accumulation (GBP),Accumulation,GBP,240.24,17.44,4189.79
3,2020-11-07,ISA,Fidelity Special Situations Class W - Accumulation (GBP),Accumulation,GBP,175.75,28.97,5091.48
4,2020-11-07,ISA,Jupiter Income Class Z - Income (GBP),Income,GBP,1282.31,4.0452,5187.2
