In [91]:
import os
import re

import pandas as pd
from dotenv import load_dotenv

# Load environment variables before any os.getenv lookups
# (HTML_PATH and DOWNLOAD_PATH are read in the next cell)
load_dotenv()

True

In [92]:
# Get paths from environment variables
base_path = os.getenv('HTML_PATH')
download_path = os.getenv('DOWNLOAD_PATH')

# os.getenv returns None for an unset variable, which would otherwise only
# surface later as an opaque TypeError inside os.path.join. Fail here instead,
# naming the variable(s) that are missing.
missing_vars = [
    var_name
    for var_name, var_value in (('HTML_PATH', base_path), ('DOWNLOAD_PATH', download_path))
    if var_value is None
]
if missing_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_vars)
        + ". Define them in the environment or in the .env file."
    )

# Construct full path to HTML file
path = os.path.join(base_path, "1.htm")

In [93]:
# Only parse the HTML when the file is actually there, then report what was found
if os.path.exists(path):
    # pd.read_html returns one DataFrame per <table> element
    tables = pd.read_html(path)

    if tables:
        print(f"Found {len(tables)} tables in the HTML file.")
        for number, frame in enumerate(tables, start=1):
            n_rows, n_cols = frame.shape
            print(f"Table {number}: {n_rows} rows x {n_cols} columns")
    else:
        print("No tables found in the HTML file.")
else:
    print(f"Error: HTML file not found at {path}")

Found 1 tables in the HTML file.
Table 1: 2070 rows x 14 columns


In [94]:
def html_to_excel(html_file, excel_file):
    """Write every table found in an HTML file to its own sheet of an Excel workbook.

    Args:
        html_file: Path of the HTML document passed to ``pd.read_html``.
        excel_file: Destination path of the workbook (written with openpyxl).

    Returns:
        None. Success and failures are reported with ``print``; any exception
        derived from ``Exception`` is caught and printed rather than raised.
    """
    try:
        frames = pd.read_html(html_file)

        if frames:
            # One sheet per table, named Sheet1, Sheet2, ...
            with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
                for sheet_number, frame in enumerate(frames, 1):
                    frame.to_excel(writer, sheet_name=f"Sheet{sheet_number}", index=False)

            print("Conversion successful!")
        else:
            print("No tables found in the HTML file.")

    except FileNotFoundError:
        print(f"Error: File '{html_file}' not found.")
    except ValueError as e:
        print(f"Error reading HTML: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

# Run the conversion only when the source HTML is present.
# excel_output is defined only on the success branch and is read by the next cell.
if not os.path.exists(path):
    print("Cannot convert: HTML file not found.")
else:
    excel_output = os.path.join(download_path, "converted_tables.xlsx")
    html_to_excel(path, excel_output)

Conversion successful!


In [95]:
# Load the first sheet of the converted Excel workbook (an .xlsx file, not a CSV)
df = pd.read_excel(io=excel_output, sheet_name='Sheet1')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Account: 55011432,Account: 55011432,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Currency: USD,Currency: USD,Leverage:,Leverage:,"2025 November 2, 10:21","2025 November 2, 10:21","2025 November 2, 10:21"
1,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,
2,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price,Commission,Taxes,Swap,Profit
3,3892951,2025.10.31 11:00:00,buy stop,0.06,ger30,24348.43,24202.34,24574.77,2025.10.31 13:00:00,24002.77,cancelled,cancelled,cancelled,cancelled
4,,,,,,,,,,148,cancelled,cancelled,cancelled,


In [96]:
"""conditions always valid"""

# Discard the two rows above the real header (the account line and the
# "Closed Transactions:" title), then renumber from 0
df = df.iloc[2:].reset_index(drop=True)

# Row 0 now holds the real column names. Strip whitespace BEFORE checking for
# duplicates, so names that differ only by padding are still recognised as repeats.
header = pd.Index(df.iloc[0]).str.strip()

# The report has two 'Price' columns: a repeated name gets a '2' suffix,
# so the second 'Price' becomes 'Price2'
header = header.where(~header.duplicated(), header + '2')

# Promote the header and drop the row it came from
df.columns = header
df = df[1:]

# Drop the 'Taxes' column (ignored when it is absent)
df = df.drop(columns=['Taxes'], errors='ignore')

In [97]:
# Inspect the last rows to see where the trailing report sections (e.g. "Summary:") begin
df.tail(20)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price2,Commission,Swap,Profit
2048,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,
2049,,,,,,,,,,,,,
2050,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:
2051,Deposit/Withdrawal:,Deposit/Withdrawal:,98 029.13,98 029.13,Credit Facility:,Credit Facility:,Credit Facility:,Credit Facility:,0.00,,,,
2052,Closed Trade P/L:,Closed Trade P/L:,1 909.49,1 909.49,Floating P/L:,Floating P/L:,Floating P/L:,Floating P/L:,-114.90,Margin:,Margin:,9 671.10,9 671.10
2053,Balance:,Balance:,99 938.62,99 938.62,Equity:,Equity:,Equity:,Equity:,99 823.72,Free Margin:,Free Margin:,90 152.62,90 152.62
2054,,,,,,,,,,,,,
2055,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:
2056,,,,,,,,,,,,,
2057,Gross Profit:,Gross Profit:,22 853.07,22 853.07,Gross Loss:,Gross Loss:,Gross Loss:,Gross Loss:,20 943.58,Total Net Profit:,Total Net Profit:,1 909.49,1 909.49


In [98]:
# ==================================================================
# NOTE: the string below is a note to the reader, not logic. It is the last
# expression of the cell, so the notebook echoes it as the cell output.
# It describes a manual step: look at df.tail() for the first "Summary:" row
# and drop every row from there onwards.
"""
from df.tail() check first row with SumSummary:	Summary:	Summary:
cancel all rows from that row onwards, indicating the RangeIndex num
"""
# ==================================================================

'\nfrom df.tail() check first row with SumSummary:\tSummary:\tSummary:\ncancel all rows from that row onwards, indicating the RangeIndex num\n'

In [99]:
"""conditions depending on data, check data length first"""
# Cut the frame just before the trailing "Summary:" section. The cut-off is
# located from the data instead of a hand-maintained row number, so the cell
# keeps working when the report length changes.
summary_mask = df['Ticket'].astype(str).str.strip().eq('Summary:').to_numpy()

# Position (for iloc) of the first "Summary:" row; keep every row when the
# marker is absent, e.g. when this cell is re-run on an already truncated frame
up_to_row = int(summary_mask.argmax()) if summary_mask.any() else len(df)
df = df.iloc[:up_to_row]

In [100]:
# Re-check the last rows after truncation to confirm the summary section is gone
df.tail(10)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price2,Commission,Swap,Profit
2040,3896128,2025.10.31 20:00:00,buy,0.01,us30,47406.06,0.00,0.00,,47566.80,0.00,-2.78,16.07
2041,,,,,,,,,,238115,238115,238115,
2042,3892953,2025.10.31 11:00:01,sell,0.33,usdcad,1.40014,0.00000,0.00000,,1.40167,-0.99,-1.89,-36.02
2043,,,,,,,,,,215001,215001,215001,
2044,,,,,,,,,,,-0.99,-32.49,-81.42
2045,,,,,,,,,,,Floating P/L:,-114.90,-114.90
2046,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:,Working Orders:
2047,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Market Price,Market Price,,,
2048,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,
2049,,,,,,,,,,,,,


In [101]:
"""conditions always valid"""

"""
drop rows where 'Open Time' and 'Close Time' have the following values:
Closed P/L:
Open Trades:
Open Time
Working Orders:
No transactions
0.00
"""
# Values that mark a non-trade row when found in 'Open Time' or 'Close Time'
strings_to_drop = ['Closed P/L:', 'Open Trades:', 'Open Time', 'Working Orders:', 'No transactions', '0.00']

# Build one flag per rule, then filter once
bad_open_time = df['Open Time'].isin(strings_to_drop)
bad_close_time = df['Close Time'].isin(strings_to_drop)

# Rows whose 'Type' is 'balance' are dropped as well
is_balance_row = df['Type'].isin(['balance'])

# Rows without a 'Ticket' value are dropped as well
has_no_ticket = df['Ticket'].isna()

df = df[~(bad_open_time | bad_close_time | is_balance_row | has_no_ticket)]

In [102]:
"""
drop rows where 'Commission' contains the following values:
cancelled
Floating P/L:
"from #"
"Portfolio"
"folio"  
"to #"
[tp]
[sl]
"""

# Drop rows whose 'Commission' cell contains any of these text markers
strings_to_drop_commission = ['cancelled', 'Floating P/L:', 'from #', 'Portfolio', 'folio', 'to #', '[tp]', '[sl]']

# Join the markers into one case-insensitive OR pattern. Each marker is escaped
# so it is matched literally: unescaped, '[tp]' and '[sl]' are regex character
# classes that match any single t/p or s/l character rather than the tag itself.
# Any other non-numeric text left in the column will now surface when the
# column is cast to float, instead of being dropped by accident.
pattern = '|'.join(re.escape(marker) for marker in strings_to_drop_commission)
df = df[~df['Commission'].str.contains(pattern, case=False, na=False, regex=True)]

In [103]:
# # print df.Commission.unique()
# print(df.Commission.unique())

In [104]:
# Parse both timestamp columns into datetime values
df = df.assign(**{
    'Open Time': pd.to_datetime(df['Open Time']),
    'Close Time': pd.to_datetime(df['Close Time']),
})

# Rows without an open time are discarded
df = df.dropna(subset=['Open Time'])

# A missing close time is replaced by the open time plus 4 hours
fallback_close = df['Open Time'] + pd.Timedelta(hours=4)
df = df.assign(**{'Close Time': df['Close Time'].fillna(fallback_close)})

In [105]:
# Preview the first rows after the datetime conversion
df.head()

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price2,Commission,Swap,Profit
5,3889898,2025-10-30 19:00:02,buy,0.02,us30,47838.79,0.0,48297.73,2025-10-30 21:30:37,47717.79,0.0,0.0,-24.2
7,3889897,2025-10-30 19:00:01,buy,0.02,us30,47843.43,0.0,48410.21,2025-10-30 21:30:30,47710.79,0.0,0.0,-26.53
9,3889103,2025-10-30 17:00:01,buy,0.02,us30,47831.88,0.0,48404.29,2025-10-30 21:30:22,47711.71,0.0,0.0,-24.03
11,3886283,2025-10-30 12:00:01,buy,0.02,us30,47517.7,0.0,0.0,2025-10-30 21:30:16,47711.89,0.0,0.0,38.84
13,3884828,2025-10-30 08:00:01,buy,0.03,us30,47540.12,0.0,48048.02,2025-10-30 21:29:31,47697.39,0.0,0.0,47.18


In [106]:
# Inspect the last rows of the cleaned frame
df.tail(20)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price2,Commission,Swap,Profit
2000,2442320,2024-11-13 22:00:01,buy,0.06,btcusd,90240.44,88406.99,92767.12,2024-11-13 23:17:17,88399.65,0.0,0.0,-110.45
2002,2441907,2024-11-13 20:00:00,sell,0.48,ethusd,3276.26,3532.72,3004.76,2024-11-13 23:00:00,3171.21,0.0,0.0,50.42
2004,2441135,2024-11-13 18:00:01,buy,0.24,nzdusd,0.59014,0.58762,0.59492,2024-11-13 22:54:35,0.58759,-0.72,0.0,-61.2
2006,2442126,2024-11-13 21:00:01,buy,0.06,btcusd,92896.44,90834.52,95012.8,2024-11-13 21:18:04,90834.28,0.0,0.0,-123.73
2008,2441584,2024-11-13 19:00:00,buy,0.06,btcusd,92356.05,91747.24,94457.98,2024-11-13 21:05:46,91738.6,0.0,0.0,-37.05
2010,2439486,2024-11-13 16:00:00,buy,0.06,btcusd,89373.09,87762.95,91997.22,2024-11-13 17:30:11,91997.22,0.0,0.0,157.45
2012,2439491,2024-11-13 16:00:00,buy,0.24,nzdusd,0.59372,0.59,0.598,2024-11-13 17:03:03,0.59,-0.72,0.0,-89.28
2014,2439490,2024-11-13 16:00:00,sell,0.24,nzdusd,0.5937,0.59702,0.58902,2024-11-13 17:00:01,0.59035,-0.72,0.0,80.4
2016,2439487,2024-11-13 16:00:00,buy,0.06,btcusd,89370.14,87322.67,91500.95,2024-11-13 16:51:37,91500.95,0.0,0.0,127.85
2018,2439129,2024-11-13 15:36:28,buy,0.01,eurusd,1.06472,1.05,0.0,2024-11-13 15:36:39,1.06406,-0.03,0.0,-0.66


In [107]:
# Cast the cleaned columns to numeric dtypes.
# 'Ticket' becomes int64 (if it cannot be converted to int, use 'str' instead);
# every other listed column becomes float64.
numeric_columns = ['Size', 'Ticket', 'S / L', 'T / P', 'Price', 'Price2', 'Commission', 'Swap', 'Profit']
target_dtypes = {
    column: ('int64' if column == 'Ticket' else 'float64')
    for column in numeric_columns
}
df = df.astype(target_dtypes)

In [108]:
# Show dtypes and non-null counts to confirm the cast
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1009 entries, 5 to 2042
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Ticket      1009 non-null   int64         
 1   Open Time   1009 non-null   datetime64[ns]
 2   Type        1009 non-null   object        
 3   Size        1009 non-null   float64       
 4   Item        1009 non-null   object        
 5   Price       1009 non-null   float64       
 6   S / L       1009 non-null   float64       
 7   T / P       1009 non-null   float64       
 8   Close Time  1009 non-null   datetime64[ns]
 9   Price2      1009 non-null   float64       
 10  Commission  1009 non-null   float64       
 11  Swap        1009 non-null   float64       
 12  Profit      1009 non-null   float64       
dtypes: datetime64[ns](2), float64(8), int64(1), object(2)
memory usage: 110.4+ KB


In [109]:
# Report the size and the column layout of the cleaned frame
cleaned_shape = df.shape
cleaned_columns = df.columns.tolist()

print(f"Cleaned DataFrame shape: {cleaned_shape}")
print("Column names:", cleaned_columns)

Cleaned DataFrame shape: (1009, 13)
Column names: ['Ticket', 'Open Time', 'Type', 'Size', 'Item', 'Price', 'S / L', 'T / P', 'Close Time', 'Price2', 'Commission', 'Swap', 'Profit']


In [110]:
# Write the cleaned frame to "1.xlsx" inside the download folder, without the index column
cleaned_excel_output = os.path.join(download_path, "1.xlsx")
df.to_excel(excel_writer=cleaned_excel_output, index=False)