In [106]:
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv so that the os.getenv lookups
# for HTML_PATH and DOWNLOAD_PATH in the next cell can find their values.
load_dotenv()

True

In [107]:
# Read the input and output directories from the environment.
base_path = os.getenv('HTML_PATH')
download_path = os.getenv('DOWNLOAD_PATH')

# os.getenv returns None for an unset variable, and os.path.join(None, ...)
# would then fail with an opaque TypeError. Fail fast and name what is missing.
missing_vars = [
    name
    for name, value in (('HTML_PATH', base_path), ('DOWNLOAD_PATH', download_path))
    if value is None
]
if missing_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

# Construct full path to HTML file
path = os.path.join(base_path, "3.htm")

In [108]:
# Summarise the tables contained in the HTML file, if the file is present.
if os.path.exists(path):
    # Parse every <table> in the document into a DataFrame.
    tables = pd.read_html(path)

    if tables:
        print(f"Found {len(tables)} tables in the HTML file.")
        # One line per table, numbered from 1, with its dimensions.
        for i, table in enumerate(tables, start=1):
            print(f"Table {i}: {table.shape[0]} rows x {table.shape[1]} columns")
    else:
        print("No tables found in the HTML file.")
else:
    print(f"Error: HTML file not found at {path}")

Found 1 tables in the HTML file.
Table 1: 673 rows x 14 columns


In [109]:
def html_to_excel(html_file, excel_file):
    """Write every table found in an HTML file to one Excel workbook.

    Each table becomes its own sheet, named ``Sheet1``, ``Sheet2``, ... in the
    order the tables are returned by ``pd.read_html``. Row indexes are not
    written.

    Args:
        html_file: Path of the HTML document to read.
        excel_file: Path of the ``.xlsx`` workbook to create.

    Returns:
        bool: True when the workbook was written, False when reading or
        writing failed (a message describing the problem is printed).
    """
    # Read phase: handled separately so that a write-side error is never
    # reported as a problem with the HTML input.
    try:
        tables = pd.read_html(html_file)
    except FileNotFoundError:
        print(f"Error: File '{html_file}' not found.")
        return False
    except ValueError as e:
        print(f"Error reading HTML: {e}")
        return False
    except Exception as e:
        # Best-effort: report anything else (e.g. a missing parser) and give up.
        print(f"Unexpected error: {e}")
        return False

    # Defensive guard in case an empty list is returned instead of an error.
    if not tables:
        print("No tables found in the HTML file.")
        return False

    # Write phase: one sheet per table.
    try:
        with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
            for idx, table in enumerate(tables, start=1):
                table.to_excel(writer, sheet_name=f"Sheet{idx}", index=False)
    except Exception as e:
        print(f"Error writing Excel file '{excel_file}': {e}")
        return False

    print("Conversion successful!")
    return True

# Run the conversion only when the source HTML file is present on disk.
if not os.path.exists(path):
    print("Cannot convert: HTML file not found.")
else:
    # The workbook is written into the configured download directory.
    excel_output = os.path.join(download_path, "converted_tables.xlsx")
    html_to_excel(path, excel_output)

Conversion successful!


In [110]:
# Load the first sheet of the converted Excel workbook and preview its top rows.
df = pd.read_excel(io=excel_output, sheet_name='Sheet1')
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Account: 55016647,Account: 55016647,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Currency: USD,Currency: USD,Leverage:,Leverage:,"2025 November 2, 10:46","2025 November 2, 10:46","2025 November 2, 10:46"
1,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,
2,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price,Commission,Taxes,Swap,Profit
3,3891066,2025.10.31 03:00:01,buy,0.03,ndx100,26032.18,25794.45,26278.24,2025.10.31 19:58:51,25794.23,0.00,0.00,0.00,-71.39
4,,,,,,,,,,223041,223041[sl],223041[sl],223041[sl],


In [111]:
"""conditions always valid"""

# Discard the two rows above the real header and renumber the rest from 0.
df = df.iloc[2:].reset_index(drop=True)

# The first remaining row carries the column labels: strip surrounding
# whitespace from them, install them as the header, and keep the rows below.
header_labels = df.iloc[0].str.strip()
df = df.iloc[1:].set_axis(header_labels, axis=1)

In [112]:
# Inspect the last 20 rows to see what follows the trade rows at the end of the
# table before deciding where to truncate it.
df.tail(20)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price.1,Commission,Taxes,Swap,Profit
651,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,
652,,,,,,,,,,,,,,
653,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:
654,Deposit/Withdrawal:,Deposit/Withdrawal:,98 605.21,98 605.21,Credit Facility:,Credit Facility:,Credit Facility:,Credit Facility:,0.00,,,,,
655,Closed Trade P/L:,Closed Trade P/L:,1 124.80,1 124.80,Floating P/L:,Floating P/L:,Floating P/L:,Floating P/L:,27.20,Margin:,Margin:,Margin:,1 543.67,1 543.67
656,Balance:,Balance:,99 730.01,99 730.01,Equity:,Equity:,Equity:,Equity:,99 757.21,Free Margin:,Free Margin:,Free Margin:,98 213.54,98 213.54
657,,,,,,,,,,,,,,
658,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:
659,,,,,,,,,,,,,,
660,Gross Profit:,Gross Profit:,4 128.53,4 128.53,Gross Loss:,Gross Loss:,Gross Loss:,Gross Loss:,3 003.73,Total Net Profit:,Total Net Profit:,Total Net Profit:,1 124.80,1 124.80


In [113]:
"""conditions depending on data, check data length first"""
# Truncate the table at the report footer instead of at a hard-coded row count.
# The first row whose 'Ticket' cell reads 'Summary:' starts the account summary;
# everything from there on is not a trade. Section-marker rows that may remain
# just before it (e.g. 'No transactions' or blank rows) are removed by the
# filtering cells that follow.
is_summary_row = df['Ticket'].astype(str).str.strip().eq('Summary:').to_numpy()
if is_summary_row.any():
    # argmax returns the position of the first True, i.e. the first footer row.
    df = df.iloc[:is_summary_row.argmax()]
# When no 'Summary:' marker is present there is no footer to trim, which also
# makes this cell safe to re-run.

In [114]:
"""conditions always valid"""

"""
drop rows where 'Open Time' and 'Close Time' have the following values:
Closed P/L:
Open Trades:
Open Time
Working Orders:
No transactions
0.00
"""
# Section titles and repeated headers that show up in the two time columns.
strings_to_drop = ['Closed P/L:', 'Open Trades:', 'Open Time', 'Working Orders:', 'No transactions', '0.00']
marker_in_open = df['Open Time'].isin(strings_to_drop)
marker_in_close = df['Close Time'].isin(strings_to_drop)

"""
drop rows where 'Type' have the following values:
balance
"""
# Rows whose 'Type' is exactly 'balance' are excluded as well.
is_balance = df['Type'].isin(['balance'])

# Keep a row only if none of the exclusions apply and it has a 'Ticket' value.
has_ticket = df['Ticket'].notna()
df = df[~(marker_in_open | marker_in_close | is_balance) & has_ticket]

In [115]:
"""
drop rows where 'Commission' contains the following values:
cancelled
Floating P/L:
"from #"
"Portfolio"
"folio"  
"to #"
[tp]
[sl]
"""

# Text fragments that mark a non-trade row when found anywhere in 'Commission'.
strings_to_drop_commission = ['cancelled', 'Floating P/L:', 'from #', 'Portfolio', 'folio', 'to #', '[tp]', '[sl]']

# Match every fragment as literal text (regex=False), case-insensitively.
# Joining the fragments into a single regex would turn '[tp]' and '[sl]' into
# character classes that match any single t/p or s/l character, rather than
# the bracketed tags themselves.
has_marker = pd.Series(False, index=df.index)
for marker in strings_to_drop_commission:
    has_marker |= df['Commission'].str.contains(marker, case=False, na=False, regex=False)

df = df[~has_marker]

In [116]:
# Sanity check: list the distinct Commission values left after filtering so
# that any remaining non-numeric text is easy to spot.
print(df.Commission.unique())

['0.00' '-0.05']


In [117]:
# Timestamps in the statement look like '2025.10.31 03:00:01'.
time_format = '%Y.%m.%d %H:%M:%S'
# Duration added to 'Open Time' when a row has no 'Close Time'.
default_duration = pd.Timedelta(hours=4)

# errors='coerce' turns unparseable values into NaT instead of raising, so the
# dropna below can actually remove bad rows.
open_time = pd.to_datetime(df['Open Time'], format=time_format, errors='coerce')
close_time = pd.to_datetime(df['Close Time'], format=time_format, errors='coerce')

# assign() builds a new frame, avoiding writes into a filtered slice of the
# previous one. Missing close times become open time + 4 hours.
df = df.assign(**{
    'Open Time': open_time,
    'Close Time': close_time.fillna(open_time + default_duration),
})

# drop rows where 'Open Time' could not be parsed
df = df.dropna(subset=['Open Time'])

In [118]:
# Spot-check the first rows after the datetime conversion.
df.head()

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price.1,Commission,Taxes,Swap,Profit
1,3891066,2025-10-31 03:00:01,buy,0.03,ndx100,26032.18,25794.45,26278.24,2025-10-31 19:58:51,25794.23,0.0,0.0,0.0,-71.39
11,3883956,2025-10-30 04:00:01,sell,0.03,ndx100,26146.5,26583.74,25676.62,2025-10-30 23:00:01,25734.59,0.0,0.0,0.0,123.57
21,3883951,2025-10-30 05:06:41,buy,0.04,ndx100,26175.22,26234.57,0.0,2025-10-30 07:00:10,26234.47,0.0,0.0,0.0,23.7
23,3883950,2025-10-30 05:06:41,buy,0.04,ndx100,26175.22,26234.47,0.0,2025-10-30 07:00:10,26234.47,0.0,0.0,0.0,23.7
25,3883953,2025-10-30 05:06:41,buy,0.04,ndx100,26175.22,26234.57,0.0,2025-10-30 07:00:01,26233.75,0.0,0.0,0.0,23.41


In [119]:
# Spot-check the last 20 rows after the datetime conversion.
df.tail(20)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price.1,Commission,Taxes,Swap,Profit
603,3286360,2025-06-18 05:00:05,buy,0.02,ndx100,21743.63,20276.78,23439.53,2025-06-18 23:49:06,21734.09,0.0,0.0,0.0,-1.91
605,3276156,2025-06-16 05:00:00,buy,0.02,ndx100,21673.53,20468.58,23347.2,2025-06-18 23:49:06,21734.09,0.0,0.0,-2.48,12.11
607,3274288,2025-06-13 17:00:00,buy,0.02,ndx100,21715.36,20591.1,23343.26,2025-06-18 23:49:05,21734.09,0.0,0.0,-6.21,3.75
609,3270430,2025-06-12 20:14:59,buy,0.01,ndx100,21930.79,21000.0,0.0,2025-06-12 20:15:01,21927.91,0.0,0.0,0.0,-0.29
611,3253793,2025-06-09 09:00:00,buy,0.02,ndx100,21717.86,20265.66,0.0,2025-06-11 23:49:17,21856.01,0.0,0.0,-2.48,27.63
613,3253790,2025-06-09 09:00:00,buy,0.02,ndx100,21717.71,20497.63,0.0,2025-06-11 23:49:17,21856.01,0.0,0.0,-2.48,27.66
615,3244852,2025-06-05 09:00:10,sell,0.02,ndx100,21695.57,23020.83,0.0,2025-06-05 13:00:00,21740.89,0.0,0.0,0.0,-9.06
618,3233154,2025-06-02 17:00:05,buy,0.02,ndx100,21262.62,20327.3,22893.66,2025-06-04 05:00:07,21643.31,0.0,0.0,-2.48,76.14
620,3224858,2025-05-29 21:31:21,buy,0.01,ndx100,21295.86,20000.0,0.0,2025-05-29 21:31:25,21289.21,0.0,0.0,0.0,-0.67
622,3212201,2025-05-27 13:00:00,buy,0.02,ndx100,21281.77,20005.69,0.0,2025-05-28 23:48:46,21440.11,0.0,0.0,-1.24,31.67


In [None]:
# Ticket ids are stored as int64 when every value converts cleanly; otherwise
# they are kept as strings so the conversion below cannot fail on them.
try:
    df['Ticket'].astype('int64')
    ticket_dtype = 'int64'
except (ValueError, TypeError, OverflowError):
    ticket_dtype = 'str'

# convert cleaned dataframe to proper Dtypes
# NOTE(review): the frame carries two columns labelled 'Price' (open and close
# price); the single 'Price' entry is expected to apply to both - confirm.
df = df.astype({
    'Size': 'float64',
    'Ticket': ticket_dtype,
    'S / L': 'float64',
    'T / P': 'float64',
    'Price': 'float64',
    'Commission': 'float64',
    'Taxes': 'float64',
    'Swap': 'float64',
    'Profit': 'float64'
})

In [121]:
# Print the column dtypes and non-null counts to verify the conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 239 entries, 1 to 645
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Ticket      239 non-null    object        
 1   Open Time   239 non-null    datetime64[ns]
 2   Type        239 non-null    object        
 3   Size        239 non-null    float64       
 4   Item        239 non-null    object        
 5   Price       239 non-null    float64       
 6   S / L       239 non-null    float64       
 7   T / P       239 non-null    float64       
 8   Close Time  239 non-null    datetime64[ns]
 9   Price       239 non-null    float64       
 10  Commission  239 non-null    float64       
 11  Taxes       239 non-null    float64       
 12  Swap        239 non-null    float64       
 13  Profit      239 non-null    float64       
dtypes: datetime64[ns](2), float64(9), object(3)
memory usage: 28.0+ KB


In [122]:
# Report the final (rows, columns) shape of the cleaned dataframe.
print(f"Cleaned DataFrame shape: {df.shape}")

Cleaned DataFrame shape: (239, 14)
