In [1]:
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

True

In [2]:
# Get paths from environment variables
base_path = os.getenv('HTML_PATH')
download_path = os.getenv('DOWNLOAD_PATH')

# Construct full path to HTML file
path = os.path.join(base_path, "3.htm")

In [None]:
# Check if file exists before reading
if not os.path.exists(path):
    print(f"Error: HTML file not found at {path}")
else:
    # Read HTML file
    tables = pd.read_html(path)
    
    # Check if tables are found
    if not tables:
        print("No tables found in the HTML file.")
    else:
        print(f"Found {len(tables)} tables in the HTML file.")
        for i, table in enumerate(tables, 1):
            print(f"Table {i}: {table.shape[0]} rows x {table.shape[1]} columns")

Found 1 tables in the HTML file.
Table 1: 673 rows x 14 columns


In [None]:
def html_to_excel(html_file, excel_file):
    try:
        # Read HTML tables from the file
        tables = pd.read_html(html_file)
        if not tables:
            print("No tables found in the HTML file.")
            return
        
        # Save each table to a separate sheet in Excel
        with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
            for idx, table in enumerate(tables, start=1):
                sheet_name = f"Sheet{idx}"
                table.to_excel(writer, sheet_name=sheet_name, index=False)
        
        print(f"Conversion successful!")
    
    except FileNotFoundError:
        print(f"Error: File '{html_file}' not found.")
    except ValueError as e:
        print(f"Error reading HTML: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

# Convert the HTML file to Excel
if os.path.exists(path):
    excel_output = os.path.join(download_path, "converted_tables.xlsx")
    html_to_excel(path, excel_output)
else:
    print("Cannot convert: HTML file not found.")

Conversion successful!


In [35]:
# read converted csv file
df = pd.read_excel(excel_output, sheet_name='Sheet1')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Account: 55016647,Account: 55016647,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Currency: USD,Currency: USD,Leverage:,Leverage:,"2025 November 2, 10:46","2025 November 2, 10:46","2025 November 2, 10:46"
1,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,
2,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price,Commission,Taxes,Swap,Profit
3,3891066,2025.10.31 03:00:01,buy,0.03,ndx100,26032.18,25794.45,26278.24,2025.10.31 19:58:51,25794.23,0.00,0.00,0.00,-71.39
4,,,,,,,,,,223041,223041[sl],223041[sl],223041[sl],


In [36]:
"""conditions always valid"""

# keep rows from row 2
df = df.iloc[2:].reset_index(drop=True)

# RangeIndex have the corrected column names
df.columns = df.iloc[0]
df = df[1:]

# trim whitespace from column names
df.columns = df.columns.str.strip()

In [39]:
df.tail(20)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price.1,Commission,Taxes,Swap,Profit
650,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Market Price,Market Price,,,,
651,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,
652,,,,,,,,,,,,,,
654,Deposit/Withdrawal:,Deposit/Withdrawal:,98 605.21,98 605.21,Credit Facility:,Credit Facility:,Credit Facility:,Credit Facility:,0.00,,,,,
655,Closed Trade P/L:,Closed Trade P/L:,1 124.80,1 124.80,Floating P/L:,Floating P/L:,Floating P/L:,Floating P/L:,27.20,Margin:,Margin:,Margin:,1 543.67,1 543.67
656,Balance:,Balance:,99 730.01,99 730.01,Equity:,Equity:,Equity:,Equity:,99 757.21,Free Margin:,Free Margin:,Free Margin:,98 213.54,98 213.54
657,,,,,,,,,,,,,,
658,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:
659,,,,,,,,,,,,,,
660,Gross Profit:,Gross Profit:,4 128.53,4 128.53,Gross Loss:,Gross Loss:,Gross Loss:,Gross Loss:,3 003.73,Total Net Profit:,Total Net Profit:,Total Net Profit:,1 124.80,1 124.80


In [38]:
"""conditions depending on data, check data length first"""
# drop rows from RangeIndex 653
if len(df) >= 653:
    df = df.drop(index=653)

In [28]:
"""conditions always valid"""

"""
drop rows where 'Open Time' and 'Close Time' have the following values:
Closed P/L:
Open Trades:
Open Time
Working Orders:
No transactions
0.00
"""
# drop strings in 'Open Time' and 'Close Time' columns
strings_to_drop = ['Closed P/L:', 'Open Trades:', 'Open Time', 'Working Orders:', 'No transactions', '0.00']
df = df[~df['Open Time'].isin(strings_to_drop)]
df = df[~df['Close Time'].isin(strings_to_drop)]

"""
drop rows where 'Type' have the following values:
balance
"""
# drop strings in 'Type' column
df = df[~df['Type'].isin(['balance'])]

In [None]:
"""
drop rows where 'Commission' contains the following values:
cancelled
Floating P/L:
"from #"
"Portfolio"
"folio"
"to #"
"""

# drop strings in 'Commission' column
strings_to_drop_commission = ['cancelled', 'Floating P/L:', 'from #', 'Portfolio', 'to #']
df = df[~df['Commission'].isin(strings_to_drop_commission)]

In [30]:
# print df.Commission.unique()
print(df.Commission.unique())

['0.00' '223041[sl]' '223017' 'tfolio [0.5.9.16.17.23.24][sl]'
 'Portfolio [0.9.16.20.23][sl]' 'Portfolio [0.1.10.13.16.23][sl]'
 'Portfolio [0.1.9.16.20.23][sl]' '223068[tp]' '223039[tp]' '223049[tp]'
 'Portfolio [0.1.9.16.20.23][tp]' '223043[sl]' '206131' '223011' '223057'
 '206002' '206121' '223006[sl]' '223046' '223044' '206027' '223028[sl]'
 '206018' '206028' '206016' '206088' '223057[sl]' '223006[tp]' '223012'
 '223015' '206102' '207159' '207100' '207006[sl]' '207075' '207086'
 '206091' '206090' '207031[sl]' '206083' 'from #3470085' 'to #3486164'
 'from #3465405[sl]' 'to #3469517' '207112' '-0.05' '179089' '178073'
 '179058' '179003' '179063' '178086' '178080' '178050' '178049' '178075'
 '178106' '178125' '179143[tp]' '178103' '178163' '178061' nan 'Margin:'
 'Free Margin:' 'Details:' 'Total Net Profit:' 'Relative Drawdown:'
 'Long Positions (won %):' 'Loss trades (% of total):' 'loss trade:'
 'consecutive losses ($):' 'consecutive loss (count):'
 'consecutive losses:']


In [8]:
# convert Open Time and Close Time to datetime
df['Open Time'] = pd.to_datetime(df['Open Time'])
df['Close Time'] = pd.to_datetime(df['Close Time'])

ValueError: time data "Summary:" doesn't match format "%Y.%m.%d %H:%M:%S", at position 182. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [13]:
df.head()

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price.1,Commission,Taxes,Swap,Profit
1,3891066.0,2025.10.31 03:00:01,buy,0.03,ndx100,26032.18,25794.45,26278.24,2025.10.31 19:58:51,25794.23,0.00,0.00,0.00,-71.39
2,,,,,,,,,,223041.0,223041[sl],223041[sl],223041[sl],
3,3891482.0,2025.10.31 05:00:00,buy stop,0.04,ndx100,26235.72,26078.31,0.0,2025.10.31 15:00:00,26080.06,cancelled,cancelled,cancelled,cancelled
4,,,,,,,,,,149.0,cancelled,cancelled,cancelled,
5,3891485.0,2025.10.31 05:00:00,buy stop,0.04,ndx100,26235.72,26078.31,0.0,2025.10.31 09:00:00,26024.31,cancelled,cancelled,cancelled,cancelled


In [19]:
df.tail(20)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price.1,Commission,Taxes,Swap,Profit
646,,,,,,,,,,223017,223017,223017,223017,
647,,,,,,,,,,,0.00,0.00,-7.45,34.65
648,,,,,,,,,,,Floating P/L:,Floating P/L:,27.20,27.20
652,,,,,,,,,,,,,,
655,Closed Trade P/L:,Closed Trade P/L:,1 124.80,1 124.80,Floating P/L:,Floating P/L:,Floating P/L:,Floating P/L:,27.20,Margin:,Margin:,Margin:,1 543.67,1 543.67
656,Balance:,Balance:,99 730.01,99 730.01,Equity:,Equity:,Equity:,Equity:,99 757.21,Free Margin:,Free Margin:,Free Margin:,98 213.54,98 213.54
657,,,,,,,,,,,,,,
658,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:
659,,,,,,,,,,,,,,
660,Gross Profit:,Gross Profit:,4 128.53,4 128.53,Gross Loss:,Gross Loss:,Gross Loss:,Gross Loss:,3 003.73,Total Net Profit:,Total Net Profit:,Total Net Profit:,1 124.80,1 124.80


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 670 entries, 1 to 670
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Ticket      351 non-null    object
 1   Open Time   351 non-null    object
 2   Type        351 non-null    object
 3   Size        351 non-null    object
 4   Item        351 non-null    object
 5   Price       351 non-null    object
 6   S / L       351 non-null    object
 7   T / P       351 non-null    object
 8   Close Time  349 non-null    object
 9   Price       661 non-null    object
 10  Commission  663 non-null    object
 11  Taxes       663 non-null    object
 12  Swap        663 non-null    object
 13  Profit      350 non-null    object
dtypes: object(14)
memory usage: 73.4+ KB
