In [45]:
import os
import re

import pandas as pd
from dotenv import load_dotenv

# Load environment variables; HTML_PATH and DOWNLOAD_PATH are read from the
# environment with os.getenv in the next cell
load_dotenv()

True

In [46]:
# Get paths from environment variables
base_path = os.getenv('HTML_PATH')
download_path = os.getenv('DOWNLOAD_PATH')

# os.getenv returns None for an unset variable, which would make os.path.join
# fail with an unhelpful TypeError (here, or several cells later for
# DOWNLOAD_PATH). Stop now with a message naming what is missing.
missing_vars = [
    name
    for name, value in (('HTML_PATH', base_path), ('DOWNLOAD_PATH', download_path))
    if value is None
]
if missing_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_vars)
        + ". Define them before running this notebook."
    )

# Construct full path to HTML file
path = os.path.join(base_path, "2.htm")

In [47]:
# Check if file exists before reading
if not os.path.exists(path):
    print(f"Error: HTML file not found at {path}")
else:
    try:
        # Parse the HTML tables into a list of DataFrames
        tables = pd.read_html(path)
    except ValueError as exc:
        # pd.read_html reports "no tables" by raising ValueError; it does not
        # return an empty list, so the condition has to be caught here.
        print(f"No tables found in the HTML file. ({exc})")
    else:
        # One summary line per table: 1-based position and its dimensions
        print(f"Found {len(tables)} tables in the HTML file.")
        for i, table in enumerate(tables, 1):
            print(f"Table {i}: {table.shape[0]} rows x {table.shape[1]} columns")

Found 1 tables in the HTML file.
Table 1: 263 rows x 14 columns


In [48]:
def html_to_excel(html_file, excel_file):
    """Convert every table of an HTML file into one sheet of an Excel workbook.

    Tables are read with ``pd.read_html`` and written with the ``openpyxl``
    engine to sheets named ``Sheet1``, ``Sheet2``, ... in the order they were
    read, without the DataFrame index.

    Problems are reported with ``print`` rather than raised. Reading and
    writing are guarded separately so that a failure while writing the
    workbook is not reported as a problem with the HTML input.

    Args:
        html_file: Path of the HTML source (anything ``pd.read_html`` accepts).
        excel_file: Destination path of the ``.xlsx`` workbook.

    Returns:
        bool: ``True`` if the workbook was written, ``False`` otherwise.
    """
    # --- Read phase -------------------------------------------------------
    try:
        tables = pd.read_html(html_file)
    except FileNotFoundError:
        print(f"Error: File '{html_file}' not found.")
        return False
    except ValueError as e:
        # Includes the "No tables found" case raised by pd.read_html
        print(f"Error reading HTML: {e}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False

    # Defensive guard in case an empty list is ever returned
    if not tables:
        print("No tables found in the HTML file.")
        return False

    # --- Write phase ------------------------------------------------------
    try:
        # Save each table to a separate sheet in Excel
        with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
            for idx, table in enumerate(tables, start=1):
                table.to_excel(writer, sheet_name=f"Sheet{idx}", index=False)
    except Exception as e:
        print(f"Error writing Excel file '{excel_file}': {e}")
        return False

    print("Conversion successful!")
    return True

# Run the conversion only when the HTML source is actually on disk
if not os.path.exists(path):
    print("Cannot convert: HTML file not found.")
else:
    # The workbook is written into the download folder
    excel_output = os.path.join(download_path, "converted_tables.xlsx")
    html_to_excel(path, excel_output)

Conversion successful!


In [49]:
# Load the first sheet of the converted Excel workbook back into a DataFrame
# and preview its first rows
df = pd.read_excel(io=excel_output, sheet_name="Sheet1")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Account: 55015312,Account: 55015312,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Name: FundedNext-STLR 1-Step Account- Simone C...,Currency: USD,Currency: USD,Leverage:,Leverage:,"2025 November 2, 10:44","2025 November 2, 10:44","2025 November 2, 10:44"
1,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,Closed Transactions:,
2,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price,Commission,Taxes,Swap,Profit
3,3877244,2025.10.29 05:00:03,buy,0.07,spx500,6900.35,6768.12,7095.03,2025.10.30 23:52:38,6861.38,0.00,0.00,-1.26,-27.28
4,,,,,,,,,,195019,195019,195019,195019,


In [50]:
"""conditions always valid"""

# Skip the first two rows (account banner and "Closed Transactions:" title)
# so that the real header row becomes row 0
df = df.iloc[2:].reset_index(drop=True)

# Promote that header row to the column names, then remove it from the data
df.columns = df.iloc[0]
df = df.iloc[1:]

# Trim whitespace from the column names BEFORE looking for duplicates, so
# names that differ only by padding are recognised as the same name
df.columns = df.columns.str.strip()

# There are two columns with the same name 'Price': every repeated name gets
# a '2' suffix, so the second 'Price' column becomes 'Price2'
is_repeat = df.columns.duplicated()
df.columns = df.columns.where(~is_repeat, df.columns + '2')

# Drop the 'Taxes' column (ignored if it is not present)
df = df.drop(columns=['Taxes'], errors='ignore')

In [51]:
# Inspect the last rows to see where the trade rows stop and the report
# footer (rows reading "Summary:", "Details:", ...) begins
df.tail(20)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price2,Commission,Swap,Profit
241,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,No transactions,
242,,,,,,,,,,,,,
243,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:,Summary:
244,Deposit/Withdrawal:,Deposit/Withdrawal:,99 827.83,99 827.83,Credit Facility:,Credit Facility:,Credit Facility:,Credit Facility:,0.00,,,,
245,Closed Trade P/L:,Closed Trade P/L:,-283.67,-283.67,Floating P/L:,Floating P/L:,Floating P/L:,Floating P/L:,0.00,Margin:,Margin:,0.00,0.00
246,Balance:,Balance:,99 544.16,99 544.16,Equity:,Equity:,Equity:,Equity:,99 544.16,Free Margin:,Free Margin:,99 544.16,99 544.16
247,,,,,,,,,,,,,
248,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:,Details:
249,,,,,,,,,,,,,
250,Gross Profit:,Gross Profit:,1 527.07,1 527.07,Gross Loss:,Gross Loss:,Gross Loss:,Gross Loss:,1 810.74,Total Net Profit:,Total Net Profit:,-283.67,-283.67


In [None]:
# ==================================================================
"""
In the df.tail() output above, find the first row whose cells read "Summary:".
All rows from that row onwards must be removed; note that row's RangeIndex number for the truncation step below.
"""
# ==================================================================

In [None]:
"""conditions depending on data, check data length first"""
# Drop the report footer: everything from the first row whose 'Ticket' cell
# reads "Summary:" onwards. The cut-off is located from the data so it does
# not have to be adjusted by hand for statements of a different length.
footer_hits = df['Ticket'].astype(str).str.strip().eq('Summary:').to_numpy()

# Positional index (for iloc) of the first footer row; keep every row when no
# footer is found, e.g. when this cell is re-run after the truncation.
up_to_row = int(footer_hits.argmax()) if footer_hits.any() else len(df)
df = df.iloc[:up_to_row]

In [None]:
# Re-inspect the last 10 rows after the truncation in the previous cell
df.tail(10)

In [53]:
"""conditions always valid"""

"""
drop rows where 'Open Time' and 'Close Time' have the following values:
Closed P/L:
Open Trades:
Open Time
Working Orders:
No transactions
0.00
"""
# Marker strings that identify non-trade rows when they appear in the
# 'Open Time' or 'Close Time' column
strings_to_drop = ['Closed P/L:', 'Open Trades:', 'Open Time', 'Working Orders:', 'No transactions', '0.00']
marker_in_open = df['Open Time'].isin(strings_to_drop)
marker_in_close = df['Close Time'].isin(strings_to_drop)

# Rows whose 'Type' is 'balance' are not trades either
is_balance = df['Type'].isin(['balance'])

# Keep the rows hit by none of the masks, then discard rows without a Ticket
keep = ~(marker_in_open | marker_in_close | is_balance)
df = df[keep].dropna(subset=['Ticket'])

In [54]:
"""
drop rows where 'Commission' contains the following values:
cancelled
Floating P/L:
"from #"
"Portfolio"
"folio"  
"to #"
[tp]
[sl]
"""

# drop rows where Commission column contains any of these strings
strings_to_drop_commission = ['cancelled', 'Floating P/L:', 'from #', 'Portfolio', 'folio', 'to #', '[tp]', '[sl]']

# Build a regex OR pattern that matches any of the strings LITERALLY.
# Each marker is escaped first: unescaped, '[tp]' and '[sl]' are regex
# character classes and would match any value containing t, p, s or l.
pattern = '|'.join(re.escape(marker) for marker in strings_to_drop_commission)

# Case-insensitive match; missing values count as "no match" and are kept
df = df[~df['Commission'].str.contains(pattern, case=False, na=False)]

In [63]:
# # print df.Commission.unique()
# print(df.Commission.unique())

In [56]:
# Parse both time columns, discard rows without an open time, and give rows
# without a close time a default of open time + 4 hours
df = (
    df.assign(**{
        'Open Time': pd.to_datetime(df['Open Time']),
        'Close Time': pd.to_datetime(df['Close Time']),
    })
    .dropna(subset=['Open Time'])
    .assign(**{
        'Close Time': lambda frame: frame['Close Time'].fillna(
            frame['Open Time'] + pd.Timedelta(hours=4)
        ),
    })
)

In [57]:
# Preview the first rows after the datetime conversion
df.head()

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price2,Commission,Swap,Profit
1,3877244,2025-10-29 05:00:03,buy,0.07,spx500,6900.35,6768.12,7095.03,2025-10-30 23:52:38,6861.38,0.0,-1.26,-27.28
3,3868784,2025-10-28 04:00:02,buy,0.06,spx500,6875.34,6803.09,6965.53,2025-10-30 23:52:38,6861.38,0.0,-2.16,-8.38
5,3878203,2025-10-29 09:00:01,buy,0.07,spx500,6908.08,6857.53,6982.53,2025-10-29 21:39:57,6856.83,0.0,0.0,-35.88
7,3877741,2025-10-29 07:00:03,sell,0.07,spx500,6905.03,6955.52,6830.52,2025-10-29 09:00:01,6907.89,0.0,0.0,-2.0
9,3870419,2025-10-28 10:00:01,buy,0.07,spx500,6870.17,6819.68,6944.68,2025-10-29 07:00:02,6905.05,0.0,-1.26,24.42


In [58]:
# Preview the last 20 rows after the datetime conversion
df.tail(20)

Unnamed: 0,Ticket,Open Time,Type,Size,Item,Price,S / L,T / P,Close Time,Price2,Commission,Swap,Profit
192,3116321,2025-05-02 14:00:24,buy,0.07,ger30,22881.9,23038.61,23197.15,2025-05-02 16:00:02,23038.51,0.0,0.0,124.1
194,3115849,2025-05-02 12:07:13,buy,0.07,ger30,22825.9,22881.11,23147.34,2025-05-02 13:00:01,22879.51,0.0,0.0,42.59
196,3105668,2025-04-30 20:38:57,buy,0.02,ger30,22567.4,22759.11,22861.89,2025-05-02 10:04:32,22758.51,0.0,-2.55,43.29
198,3109009,2025-05-01 05:26:45,buy,0.05,ndx100,19849.7,19859.8,20247.51,2025-05-01 07:01:12,19859.8,0.0,0.0,5.05
200,3104180,2025-04-30 12:28:17,buy,0.02,ger30,22556.33,22560.02,22810.35,2025-04-30 13:00:32,22559.86,0.0,0.0,0.8
202,3103292,2025-04-30 10:01:28,buy,0.02,ger30,22515.46,22555.2,22761.29,2025-04-30 11:00:00,22553.13,0.0,0.0,8.57
204,3101085,2025-04-29 19:00:01,buy,0.02,ger30,22439.41,22494.48,22700.66,2025-04-29 21:00:03,22494.39,0.0,0.0,12.52
206,3099334,2025-04-29 16:50:07,buy,0.02,ger30,22437.72,22437.39,22701.95,2025-04-29 18:02:32,22437.07,0.0,0.0,-0.15
208,3098600,2025-04-29 14:49:46,buy,0.02,ger30,22424.74,22436.18,22690.7,2025-04-29 15:00:02,22436.09,0.0,0.0,2.58
210,3097566,2025-04-29 11:00:14,buy,0.02,ger30,22408.61,22414.72,22674.01,2025-04-29 12:00:02,22413.13,0.0,0.0,1.03


In [59]:
# Cast the cleaned columns to numeric dtypes.
# 'Ticket' is cast to int64; if it cannot be converted, use 'str' instead.
float_columns = ['Size', 'S / L', 'T / P', 'Price', 'Price2', 'Commission', 'Swap', 'Profit']
dtype_map = {'Ticket': 'int64'}
dtype_map.update(dict.fromkeys(float_columns, 'float64'))
df = df.astype(dtype_map)

In [60]:
# Summarise dtypes and non-null counts after the dtype conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119 entries, 1 to 229
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Ticket      119 non-null    int64         
 1   Open Time   119 non-null    datetime64[ns]
 2   Type        119 non-null    object        
 3   Size        119 non-null    float64       
 4   Item        119 non-null    object        
 5   Price       119 non-null    float64       
 6   S / L       119 non-null    float64       
 7   T / P       119 non-null    float64       
 8   Close Time  119 non-null    datetime64[ns]
 9   Price2      119 non-null    float64       
 10  Commission  119 non-null    float64       
 11  Swap        119 non-null    float64       
 12  Profit      119 non-null    float64       
dtypes: datetime64[ns](2), float64(8), int64(1), object(2)
memory usage: 13.0+ KB


In [61]:
# Report the size and the column names of the cleaned DataFrame
cleaned_shape = df.shape
cleaned_columns = df.columns.tolist()
print(f"Cleaned DataFrame shape: {cleaned_shape}")
print("Column names:", cleaned_columns)

Cleaned DataFrame shape: (119, 13)
Column names: ['Ticket', 'Open Time', 'Type', 'Size', 'Item', 'Price', 'S / L', 'T / P', 'Close Time', 'Price2', 'Commission', 'Swap', 'Profit']


In [62]:
# Save the cleaned DataFrame as "2.xlsx" in the download folder, without
# writing the index as a column
cleaned_excel_output = os.path.join(download_path, "2.xlsx")

df.to_excel(excel_writer=cleaned_excel_output, index=False)