In [2]:
### Importing the important libraries
import pandas as pd
from datetime import datetime
import os
import datetime
import numpy as np
from datetime import datetime, timedelta
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [4]:
import tensorflow as tf
from tensorflow.keras import mixed_precision

# Set the global Keras dtype policy. 'float32' keeps full precision; switch to 'mixed_float16' to enable mixed precision.


tf.keras.mixed_precision.set_global_policy('float32')


# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [5]:
def add_lag_features(df, lags):
    """Return ``df`` with lagged copies of every column appended.

    For each column ``c`` of ``df`` and each ``k`` in ``lags`` a column named
    ``"{c}_lag_{k}"`` holding ``df[c].shift(k)`` is added. New columns are
    ordered column by column (all lags of the first column, then the second,
    and so on) and follow the original columns.
    """
    shifted = {}
    for column in df.columns:
        for lag in lags:
            shifted[f"{column}_lag_{lag}"] = df[column].shift(lag)
    lagged = pd.DataFrame(shifted, index=df.index)
    return pd.concat([df, lagged], axis=1)

In [6]:
# Load the 5-minute bars; later cells read the 'date', 'open', 'high', 'low' and 'close' columns.
df_5min = pd.read_csv('nifty50_rik_5min.csv')   

In [7]:
# Look-back window lengths (in bars) shared by every indicator: 4 through 59 inclusive
periods = list(range(4, 60))
################################################################################################
################ USE THIS IF YOUR INPUT DATA IS LESS THAN 5MIN OHLC PERIOD ######################
################################################################################################
# df['date'] = pd.to_datetime(df['date'])
# df.set_index('date', inplace=True)

# # Remove duplicated timestamps
# if df.index.duplicated().any():
#     print("There are duplicated timestamps.")
#     df = df[~df.index.duplicated(keep='first')]

# # Resample the data to 5-minute intervals
# df_5min = df.resample('5min').agg({
#     'open': 'first',
#     'high': 'max',
#     'low': 'min',
#     'close': 'last'
# })
# df_5min.dropna(how='all', inplace=True)
# df_5min.reset_index(inplace=True)


In [8]:
#### adding sma and ema and their lagged values
def add_moving_averages(df_5min, periods):
    """Build simple and exponential moving averages of the ``close`` column.

    Returns a new DataFrame with one ``SMA_{period}`` column per period
    (rolling mean, NaN until the window is full) followed by one
    ``EMA_{period}`` column per period (``ewm(span=period, adjust=False)``).
    """
    close = df_5min['close']
    averages = {}
    for period in periods:
        averages[f'SMA_{period}'] = close.rolling(window=period).mean()
    for period in periods:
        averages[f'EMA_{period}'] = close.ewm(span=period, adjust=False).mean()
    return pd.DataFrame(averages)
# Moving averages plus their 1- to 4-bar lags
lags = [1, 2, 3, 4]
df1 = add_lag_features(add_moving_averages(df_5min, periods), lags)

In [9]:
#### adding Rsi and their lagged values
def calculate_rsi(data, window):
    """Relative Strength Index of ``data`` based on simple rolling means.

    Gains and losses are the positive and (sign-flipped) negative one-step
    differences; both are averaged over ``window`` observations with
    ``min_periods=1``. The result is ``100 - 100 / (1 + avg_gain / avg_loss)``:
    it evaluates to 100 when the average loss is zero but the average gain is
    not, and to NaN when both averages are zero (e.g. the first row).
    """
    delta = data.diff(1)
    # Anything that is not a strict gain/loss, including the leading NaN, becomes 0.
    gains = delta.where(delta > 0, 0).fillna(0)
    losses = (-delta.where(delta < 0, 0)).fillna(0)
    mean_gain = gains.rolling(window=window, min_periods=1).mean()
    mean_loss = losses.rolling(window=window, min_periods=1).mean()
    relative_strength = mean_gain / mean_loss
    return 100 - (100 / (1 + relative_strength))

def add_rsi_features(df, periods):
    """Return a DataFrame with one ``RSI_{period}`` column per entry of ``periods``.

    Each column is ``calculate_rsi`` applied to ``df['close']`` with that window.
    """
    close = df['close']
    rsi_columns = {}
    for period in periods:
        rsi_columns[f'RSI_{period}'] = calculate_rsi(close, window=period)
    return pd.DataFrame(rsi_columns)
# RSI features plus their 1- to 4-bar lags
lags = [1, 2, 3, 4]
df2 = add_lag_features(add_rsi_features(df_5min, periods), lags)

In [10]:
#### adding Bollinger Bands features (no lagged values are added for these)
def add_bollinger_bands(df, periods):
    """Compute Bollinger Band features of ``df['close']`` for every period.

    For each period the bands are the rolling mean plus/minus two rolling
    standard deviations. Seven columns prefixed ``bb_{period}_`` are produced:
    ``middle_band``, ``upper_band``, ``lower_band``, ``bandwidth``
    (upper - lower), ``percent_b`` ((close - lower) / bandwidth), and the 0/1
    flags ``cross_up`` / ``cross_down`` marking bars where close moves outside
    a band after having been on or inside it on the previous bar.
    """
    close = df['close']
    previous_close = close.shift(1)
    features = {}
    for period in periods:
        prefix = f'bb_{period}'
        middle = close.rolling(window=period).mean()
        spread = close.rolling(window=period).std() * 2
        upper = middle + spread
        lower = middle - spread
        width = upper - lower

        broke_above = (close > upper) & (previous_close <= upper.shift(1))
        broke_below = (close < lower) & (previous_close >= lower.shift(1))

        features[f'{prefix}_middle_band'] = middle
        features[f'{prefix}_upper_band'] = upper
        features[f'{prefix}_lower_band'] = lower
        features[f'{prefix}_bandwidth'] = width
        features[f'{prefix}_percent_b'] = (close - lower) / width
        features[f'{prefix}_cross_up'] = broke_above.astype(int)
        features[f'{prefix}_cross_down'] = broke_below.astype(int)
    return pd.DataFrame(features)
# Bollinger features for every period; unlike the other indicator frames, df3 is not passed through add_lag_features.
df3 = add_bollinger_bands(df_5min,periods)

In [11]:
#### adding historical volatility and its lagged values
def add_historical_volatility(df, periods, bars_per_year=252 * 78):
    """Annualised rolling volatility of the log returns of ``df['close']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``close`` column.
    periods : iterable of int
        Rolling window lengths, in bars.
    bars_per_year : int or float, default ``252 * 78``
        Number of bars per year used for annualisation (the rolling standard
        deviation is multiplied by ``sqrt(bars_per_year)``). The default keeps
        the value that used to be hard-coded.
        NOTE(review): 78 five-minute bars correspond to a 6.5-hour session;
        confirm that matches the session length of the instrument in use.

    Returns
    -------
    pandas.DataFrame
        One ``historical_volatility_{period}`` column per period, indexed like
        ``df``. The first ``period`` rows of each column are NaN.
    """
    close = df['close']
    log_returns = np.log(close / close.shift(1))
    annualisation = np.sqrt(bars_per_year)
    volatility_data = {
        f'historical_volatility_{period}': log_returns.rolling(window=period).std() * annualisation
        for period in periods
    }
    return pd.DataFrame(volatility_data, index=df.index)
# Historical volatility plus its 1- to 4-bar lags
lags = [1, 2, 3, 4]
df4 = add_lag_features(add_historical_volatility(df_5min, periods), lags)

In [12]:
#### adding atr and their lagged values
def add_atr(df, periods):
    """Average True Range of the ``high``/``low``/``close`` columns of ``df``.

    The true range of a bar is the largest of ``high - low``,
    ``|high - previous close|`` and ``|low - previous close|``. Each
    ``ATR_{period}`` column is the rolling mean of the true range over
    ``period`` bars (``min_periods=1``), indexed like ``df``.
    """
    previous_close = df['close'].shift(1)
    range_candidates = pd.DataFrame({
        'High_Low': df['high'] - df['low'],
        'High_Close': np.abs(df['high'] - previous_close),
        'Low_Close': np.abs(df['low'] - previous_close),
    })
    true_range = range_candidates.max(axis=1)

    atr_columns = {}
    for period in periods:
        atr_columns[f'ATR_{period}'] = true_range.rolling(window=period, min_periods=1).mean()
    return pd.DataFrame(atr_columns, index=df.index)

# ATR features plus their 1- to 4-bar lags
lags = [1, 2, 3, 4]
df5 = add_lag_features(add_atr(df_5min, periods), lags)

In [13]:
#### adding distances from nearest Round number
def calculate_distances(df, column, scales):
    """Distances from ``df[column]`` to the surrounding multiples of each scale.

    Returns a dict of Series: first ``dist_to_lower_{scale}`` for every scale
    (value minus the multiple at or below it), then ``dist_to_upper_{scale}``
    for every scale (next multiple above minus the value). A value sitting
    exactly on a multiple has lower distance 0 and upper distance ``scale``.
    """
    values = df[column]
    distances = {}
    for scale in scales:
        distances[f'dist_to_lower_{scale}'] = values - (values // scale * scale)
    for scale in scales:
        distances[f'dist_to_upper_{scale}'] = ((values // scale + 1) * scale) - values
    return distances
# Round-number grids: 10, 100, 1000 and 10000
scales = [10 ** exponent for exponent in range(1, 5)]
df6 = pd.DataFrame(calculate_distances(df_5min, column='close', scales=scales))

In [14]:
# Convert 'date' column to datetime and set as index
df_5min['date1'] = pd.to_datetime(df_5min['date'])
df_5min.set_index('date1', inplace=True)

# Ensure index is of datetime type
if not pd.api.types.is_datetime64_any_dtype(df_5min.index):
    df_5min.index = pd.to_datetime(df_5min.index)

# Per-bar trading-day key
df_5min['day'] = df_5min.index.strftime('%d-%m-%y')

# One high/low value per trading day. sort=False keeps the days in order of
# first appearance (chronological for time-sorted bars); the default would sort
# the '%d-%m-%y' strings lexically, which is not chronological.
daily_max = df_5min.groupby('day', sort=False)['high'].max().rename('daily_max')
daily_low = df_5min.groupby('day', sort=False)['low'].min().rename('daily_low')

# Extremes over the current and the previous trading day. The window of 2 is
# applied to the per-day series; applying it to the per-bar column (as before)
# only looked back one 5-minute bar, so the result equalled the same-day value
# on every bar except the first of each day.
max_to_prev_day = daily_max.rolling(window=2, min_periods=1).max().rename('max_to_prev_day')
low_to_prev_day = daily_low.rolling(window=2, min_periods=1).min().rename('low_to_prev_day')

# Broadcast the per-day values back onto every bar of that day
df = (
    df_5min
    .join(daily_max, on='day')
    .join(daily_low, on='day')
    .join(max_to_prev_day, on='day')
    .join(low_to_prev_day, on='day')
)

# NOTE(review): daily_max / daily_low are taken over the whole trading day, so
# on an intraday bar they already include bars that come later the same day.
# Every distance below inherits that look-ahead; confirm this is acceptable for
# features fed to a predictive model.

# Calculate distance from close to the two-day max and low
df['dist_from_max_to_prev_day'] = df['max_to_prev_day'] - df['close']
df['dist_from_low_to_prev_day'] = df['close'] - df['low_to_prev_day']

# Calculate distance from daily max and daily low
df['dist_from_daily_max'] = df['daily_max'] - df['close']
df['dist_from_daily_low'] = df['close'] - df['daily_low']

# Rolling max and low over the last 30 calendar days (time-based window on the datetime index)
df['max_last_30_days'] = df['daily_max'].rolling(window='30D', min_periods=1).max()
df['low_last_30_days'] = df['daily_low'].rolling(window='30D', min_periods=1).min()

# Calculate distance from max and low over the last 30 days
df['dist_from_max_to_last30days'] = df['max_last_30_days'] - df['close']
df['dist_from_low_to_last30days'] = df['close'] - df['low_last_30_days']

# Drop the helper columns and the raw price columns so only the distance features remain
df.drop(['daily_max', 'daily_low', 'max_to_prev_day', 'low_to_prev_day', 'max_last_30_days', 'low_last_30_days', 'day', 'date', 'open', 'high', 'low', 'close'], axis=1, inplace=True)


In [15]:
# Work on a copy so df_5min itself is left untouched
df7 = df_5min.copy()
df7['date'] = pd.to_datetime(df7['date'])

# Tag each bar with its calendar date, then put the bars in chronological order
df7 = df7.assign(trading_day=df7['date'].dt.date).sort_values('date')

# 1-based position of each bar within its trading day; only this Series is kept
df7 = (df7.groupby('trading_day').cumcount() + 1).rename('candle_number')

In [16]:
###### adding week-month feature
df_5min['date'] = pd.to_datetime(df_5min['date'])

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']

# One 0/1 column per weekday; position 0 in `days` pairs with dayofweek == 0
weekday_number = df_5min['date'].dt.dayofweek
for position, day in enumerate(days):
    df_5min[f'is_{day.lower()}'] = (weekday_number == position).astype(int)

# One 0/1 column per calendar month; month numbers start at 1
month_number = df_5min['date'].dt.month
for position, month in enumerate(months, start=1):
    df_5min[f'is_{month.lower()}'] = (month_number == position).astype(int)


In [17]:
# Replace every frame's index with a fresh 0..n-1 range so the pieces line up by position
for frame in (df_5min, df, df1, df2, df3, df4, df5, df6, df7):
    frame.reset_index(drop=True, inplace=True)

# Glue all feature groups together side by side
dff = pd.concat([df_5min, df, df1, df2, df3, df4, df5, df6, df7], axis=1)

In [18]:
dff.shape

(178118, 1834)

In [19]:
### adding MACD feature
# MACD line (EMA 12 minus EMA 26), its 9-span EMA signal line, and their difference
macd = dff['EMA_12'] - dff['EMA_26']
signal = macd.ewm(span=9, adjust=False).mean()
histogram = macd - signal
dff['macd_line'] = macd
dff['signal_line'] = signal
dff['macd_histogram'] = histogram

# Lags 1-3 of the three MACD series, grouped by lag
macd_parts = {'macd_line': macd, 'signal_line': signal, 'macd_histogram': histogram}
for lag in [1, 2, 3]:
    for part_name, part in macd_parts.items():
        dff[f'{part_name}_lag_{lag}'] = part.shift(lag)

# +1 when the MACD line moves above the signal line, -1 when it moves below, 0 otherwise
previous_macd = macd.shift(1)
previous_signal = signal.shift(1)
bullish_cross = (macd > signal) & (previous_macd <= previous_signal)
bearish_cross = (macd < signal) & (previous_macd >= previous_signal)
dff['macd_signal_crossover'] = 0
dff.loc[bullish_cross, 'macd_signal_crossover'] = 1
dff.loc[bearish_cross, 'macd_signal_crossover'] = -1

# Position and direction of the MACD line relative to zero
dff['MACD_Distance_Zero'] = macd.abs()
dff['MACD_Above_Zero'] = (macd > 0).astype(int)
dff['MACD_Trend_Zero'] = np.sign(macd.diff()).fillna(0)

In [20]:
dff.shape

(178118, 1850)

In [21]:
# def analyze_single_day(day_data, debug=True):
#     """
#     Analyze a single day's data to find all optimal buy points.
    
#     Parameters:
#     -----------
#     day_data : pandas DataFrame
#         Single day's OHLC data
#     debug : bool
#         If True, prints detailed analysis
        
#     Returns:
#     --------
#     set
#         Indices of optimal buy points
#     """
#     buy_points = set()
    
#     if debug:
#         print(f"\nAnalyzing date: {day_data['date'].iloc[0]}")
#         print(f"Total bars in day: {len(day_data)}")
    
#     # For each potential sell point
#     for sell_idx in range(1, len(day_data)):
#         sell_price = day_data.iloc[sell_idx]['close']
        
#         # Check all potential buy points before this sell point
#         for buy_idx in range(sell_idx):
#             buy_price = day_data.iloc[buy_idx]['close']
#             return_pct = (sell_price - buy_price) * 100 / buy_price
            
#             if return_pct >= 0.6:
#                 buy_points.add(day_data.index[buy_idx])
#                 if debug:
#                     print(f"Buy point found at {day_data.iloc[buy_idx]['date']}")
#                     print(f"Buy price: {buy_price:.2f}, Sell price: {sell_price:.2f}")
#                     print(f"Return: {return_pct:.2f}%")
    
#     if debug:
#         print(f"Total buy points found in day: {len(buy_points)}")
    
#     return buy_points

In [22]:
# from tqdm import tqdm  # For progress bar

# def calculate_optimal_buy_points(df):
#     """
#     Calculate optimal buy points with progress tracking.
#     """
#     df = df.copy()
#     df['target'] = 0
    
#     # Convert date to datetime and create trade_date
#     df['date'] = pd.to_datetime(df['date'])
#     df['trade_date'] = df['date'].dt.date
    
#     # Get unique dates for progress tracking
#     unique_dates = df['trade_date'].unique()
#     total_days = len(unique_dates)
    
#     print(f"Processing {total_days} trading days...")
    
#     # Use tqdm for progress bar
#     for date in tqdm(unique_dates, desc="Finding buy points"):
#         group = df[df['trade_date'] == date]
#         group_indices = group.index
#         buy_points = set()
        
#         # For each potential sell point
#         for sell_idx in range(1, len(group)):
#             sell_price = group['close'].iloc[sell_idx]
            
#             # Check all potential buy points before this sell point
#             for buy_idx in range(sell_idx):
#                 buy_price = group['close'].iloc[buy_idx]
#                 return_pct = (sell_price - buy_price) * 100 / buy_price
                
#                 if return_pct >= 0.6:
#                     buy_points.add(group_indices[buy_idx])
        
#         # Mark buy points in the original dataframe
#         df.loc[list(buy_points), 'target'] = 1
    
#     # Print summary statistics
#     total_buy_points = df['target'].sum()
#     print(f"\nFound {total_buy_points} buy points across {total_days} days")
#     print(f"Average {total_buy_points/total_days:.2f} buy points per day")
    
#     # Clean up
#     df.drop('trade_date', axis=1, inplace=True)
#     return df['target']

# # Run with progress tracking
# dff['target'] = calculate_optimal_buy_points(dff)

In [23]:
# (dff["target"]).to_csv("target_variable.csv")

In [24]:
# Load the pre-computed labels. With index_col=False the first CSV column is not used as the index;
# it comes through as the 'Unnamed: 0' column that a later cell drops.
tg = pd.read_csv("target_variable.csv",index_col = False)

In [25]:
# Attach the label columns side by side; join='inner' keeps only row labels present in both frames.
dff = pd.concat([dff, tg], axis=1, join='inner')

In [26]:


# def validate_and_analyze_results(df, target_series):
#     """
#     Validate and provide detailed analysis of the buy points.
    
#     Parameters:
#     -----------
#     df : pandas DataFrame
#         Original DataFrame with price data
#     target_series : pandas Series
#         Series containing target labels
#     """
#     buy_points = df[target_series == 1].index
#     points_by_date = {}
#     valid_points = 0
#     invalid_points = 0
    
#     for buy_idx in buy_points:
#         buy_price = df.loc[buy_idx, 'close']
#         day = pd.to_datetime(df.loc[buy_idx, 'date']).date()
        
#         # Track points by date
#         if day not in points_by_date:
#             points_by_date[day] = {'total': 0, 'valid': 0}
#         points_by_date[day]['total'] += 1
        
#         # Get future prices until end of day
#         future_prices = df[
#             (pd.to_datetime(df['date']).dt.date == day) & 
#             (df.index > buy_idx)
#         ]['close']
        
#         if len(future_prices) > 0:
#             max_return = (future_prices.max() - buy_price) * 100 / buy_price
#             if max_return >= 0.6:
#                 valid_points += 1
#                 points_by_date[day]['valid'] += 1
#             else:
#                 invalid_points += 1
    
#     print("\nDetailed Analysis:")
#     print(f"Total days analyzed: {len(points_by_date)}")
#     print(f"Total buy points: {len(buy_points)}")
#     print(f"Valid buy points: {valid_points}")
#     print(f"Invalid buy points: {invalid_points}")
    
#     if len(buy_points) > 0:
#         print(f"Percentage valid: {valid_points/len(buy_points)*100:.2f}%")
    
#     print("\nDaily Statistics:")
#     points_per_day = [data['total'] for data in points_by_date.values()]
#     if points_per_day:
#         print(f"Average buy points per day: {np.mean(points_per_day):.2f}")
#         print(f"Maximum buy points in a day: {max(points_per_day)}")
#         print(f"Minimum buy points in a day: {min(points_per_day)}")
    
#     return points_by_date

In [27]:
dff["target"].value_counts()

target
0    155212
1     22906
Name: count, dtype: int64

In [28]:
# Remove the leftover index column that came in with the labels CSV
dff = dff.drop(columns='Unnamed: 0')


In [29]:
# Everything except the label is a candidate feature
df_features = dff.drop('target', axis=1)


In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib  # For saving and loading the scaler

# Step 1: Separate the DataFrame into Scalable and Non-Scalable Columns
def separate_columns(df):
    """Split ``df`` into its numeric columns and all remaining columns.

    Returns ``(df_scalable, df_non_scalable)``: the first holds every column
    whose dtype is a NumPy number, the second holds whatever is left (for
    example date or text columns). Column order is preserved in both.
    """
    numeric_columns = list(df.select_dtypes(include=[np.number]).columns)
    return df[numeric_columns], df.drop(columns=numeric_columns)

# Step 2: Apply MinMaxScaler and save it
def scale_dataframe(df_scalable, save_scaler=False, scaler_path='scaler.pkl'):
    """Min-max scale every column of ``df_scalable``.

    A fresh ``MinMaxScaler`` is fitted on the frame and applied to it; the
    result is returned as a DataFrame with the same columns and index. When
    ``save_scaler`` is true the fitted scaler is written to ``scaler_path``
    with joblib; otherwise the scaler is discarded.
    """
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df_scalable)
    df_scaled = pd.DataFrame(
        scaled_values,
        columns=df_scalable.columns,
        index=df_scalable.index,
    )

    if not save_scaler:
        return df_scaled

    joblib.dump(scaler, scaler_path)
    print(f"Scaler saved to {scaler_path}")
    return df_scaled

# Step 3: Concatenate the Scaled DataFrame with Non-Scaled Columns
def process_large_dataframe(df, save_scaler=False, scaler_path='scaler.pkl'):
    """Scale the numeric columns of ``df`` and re-attach the other columns.

    Returns ``(df_scaled, df_final)``: ``df_scaled`` contains only the scaled
    numeric columns, ``df_final`` places the untouched non-numeric columns in
    front of them. ``save_scaler`` and ``scaler_path`` are forwarded to
    ``scale_dataframe``.
    """
    numeric_part, other_part = separate_columns(df)
    scaled_part = scale_dataframe(numeric_part, save_scaler=save_scaler, scaler_path=scaler_path)
    combined = pd.concat([other_part, scaled_part], axis=1)
    return scaled_part, combined

# Scale every numeric feature column of df_features.
# save_scaler=False means the fitted scaler is not written to scaler_path.
# NOTE(review): the scaler is fitted on all rows here, before the train/validation/test
# split further down, so the min/max of the later splits influence the training features.
df_scaled, df_final = process_large_dataframe(df_features, save_scaler=False, scaler_path='before_autoencoder_scaler.pkl')


In [32]:
# Put the label column next to the scaled features (column-wise concat, aligned on the index)
main_df = pd.concat([dff["target"], df_scaled], axis=1)  # axis=1 to concatenate along columns


In [33]:
# Keep only rows where the label and every feature are present, then split them apart again
main_df = main_df.dropna()
target_df = main_df["target"]
df_scaled = main_df.drop(columns=['target'])

In [34]:
# Renumber the rows from 0; reset_index() moves the old index into an 'index' column, which is
# dropped again, leaving a one-column frame.
target_df = target_df.reset_index().drop(columns=['index'])

In [35]:
# Renumber the feature rows from 0 the same way, so they stay positionally aligned with target_df
df_scaled = df_scaled.reset_index().drop(columns=['index']) 

In [36]:
target_df

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
177991,0
177992,0
177993,0
177994,0


In [37]:
main_df["target"].value_counts()[0]*100/len(main_df)

87.16488010966539

In [38]:
# Two-column lookup table ('index', 'date') built from dff's row index and its 'date' column
date_index_mapping = dff["date"].reset_index()

In [39]:
date_index_mapping

Unnamed: 0,index,date
0,0,2015-01-09 09:15:00+05:30
1,1,2015-01-09 09:20:00+05:30
2,2,2015-01-09 09:25:00+05:30
3,3,2015-01-09 09:30:00+05:30
4,4,2015-01-09 09:35:00+05:30
...,...,...
178113,178113,2024-08-28 15:05:00+05:30
178114,178114,2024-08-28 15:10:00+05:30
178115,178115,2024-08-28 15:15:00+05:30
178116,178116,2024-08-28 15:20:00+05:30


In [40]:
import gc
print(df_scaled.shape)
# Not the last expression of the cell, so this preview is computed but never displayed
df_scaled.head()
# Force a garbage-collection pass; its return value is what the cell displays
gc.collect()

(177996, 1846)


0

In [41]:
# Names of every feature column, in frame order
column_list = list(df_scaled.columns)

# Numbered listing, starting at 1
for position, column in enumerate(column_list, start=1):
    print(f"{position}. {column}")

# Overall count
print(f"\nTotal number of columns: {len(column_list)}")

1. open
2. high
3. low
4. close
5. is_monday
6. is_tuesday
7. is_wednesday
8. is_thursday
9. is_friday
10. is_saturday
11. is_sunday
12. is_january
13. is_february
14. is_march
15. is_april
16. is_may
17. is_june
18. is_july
19. is_august
20. is_september
21. is_october
22. is_november
23. is_december
24. dist_from_max_to_prev_day
25. dist_from_low_to_prev_day
26. dist_from_daily_max
27. dist_from_daily_low
28. dist_from_max_to_last30days
29. dist_from_low_to_last30days
30. SMA_4
31. SMA_5
32. SMA_6
33. SMA_7
34. SMA_8
35. SMA_9
36. SMA_10
37. SMA_11
38. SMA_12
39. SMA_13
40. SMA_14
41. SMA_15
42. SMA_16
43. SMA_17
44. SMA_18
45. SMA_19
46. SMA_20
47. SMA_21
48. SMA_22
49. SMA_23
50. SMA_24
51. SMA_25
52. SMA_26
53. SMA_27
54. SMA_28
55. SMA_29
56. SMA_30
57. SMA_31
58. SMA_32
59. SMA_33
60. SMA_34
61. SMA_35
62. SMA_36
63. SMA_37
64. SMA_38
65. SMA_39
66. SMA_40
67. SMA_41
68. SMA_42
69. SMA_43
70. SMA_44
71. SMA_45
72. SMA_46
73. SMA_47
74. SMA_48
75. SMA_49
76. SMA_50
77. SMA_51
78.

In [42]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np



# Chronological split (shuffle=False): the last 20% of rows become the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    df_scaled, target_df, test_size=0.2, random_state=42, shuffle=False
)

# ... and the last 25% of the remainder becomes the validation set (60/20/20 overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, shuffle=False
)

# Report the size of every piece: features first, then labels
split_parts = (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
)
for split_name, features_part, labels_part in split_parts:
    print(f"{split_name} set shape: {features_part.shape}")
for split_name, features_part, labels_part in split_parts:
    print(f"{split_name} T set shape: {labels_part.shape}")





Training set shape: (106797, 1846)
Validation set shape: (35599, 1846)
Test set shape: (35600, 1846)
Training T set shape: (106797, 1)
Validation T set shape: (35599, 1)
Test T set shape: (35600, 1)


In [43]:
y_train

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
106792,1
106793,1
106794,0
106795,0


In [44]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0
[0m

In [45]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import metrics
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report, confusion_matrix
# Add these imports at the top of your code
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import Sequence


In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.utils import Sequence
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from collections import defaultdict

class MonitoredTimeSeriesGenerator(Sequence):
    def __init__(
        self, 
        data, 
        targets, 
        batch_size,
        time_steps,
        n_features,
        mode='train',
        augmentation_probability=0.5,
        noise_scale=0.01,
        scale_range=(0.95, 1.05),
        magnitude_range=(0.9, 1.1),
        time_shift_range=(-2, 2),
        window_slice_probability=0.3,
        window_warp_probability=0.3,
        window_warp_scale_range=(0.8, 1.2),
        name='default',
        standardize=True,
        monitor_augmentations=True,
        scaler=None
    ):
        """Set up class-balanced batching over windows of ``data``.

        Parameters
        ----------
        data : pandas.DataFrame or array-like
            Feature rows; a DataFrame is converted to its ``.values``.
        targets : pandas.DataFrame, pandas.Series or array-like
            Binary labels (0/1), one per row of ``data``.
        batch_size : int
            Requested batch size. In ``'train'`` mode it is rounded down to an
            even number.
        time_steps : int
            Number of consecutive rows in one window.
        n_features : int
            Number of feature columns in one window.
        mode : str
            ``'train'`` draws random, augmented batches with a fixed share of
            positives; any other value yields sequential, un-augmented batches
            with the positive share of the data.
        augmentation_probability, noise_scale, scale_range, time_shift_range,
        window_slice_probability, window_warp_probability,
        window_warp_scale_range
            Augmentation settings, stored as attributes of the same name.
        magnitude_range
            Stored as an attribute; none of the augmentation methods shown
            alongside this constructor reads it.
        name : str
            Label for this generator.
        standardize : bool
            When true, ``data`` is standardised with a ``StandardScaler``.
        monitor_augmentations : bool
            When true, augmented samples are recorded in
            ``sample_augmentations``.
        scaler : fitted scaler, optional
            Already-fitted scaler to apply instead of fitting a new one. Pass
            the training generator's ``scaler`` to validation/test generators
            so all splits share the training statistics. Ignored when
            ``standardize`` is false.

        Raises
        ------
        ValueError
            If one of the two classes has no samples, or if the batch size
            leaves no room for at least one sample of each class.
        """
        # Let the Keras base class initialise its own state
        super().__init__()

        self.name = name
        self.monitor_augmentations = monitor_augmentations
        self.augmentation_stats = defaultdict(int)
        self.sample_augmentations = []
        
        # Basic initialization
        self.data = data.values if isinstance(data, pd.DataFrame) else data
        self.targets = targets.reset_index(drop=True).values if isinstance(targets, (pd.DataFrame, pd.Series)) else targets
        self.time_steps = time_steps
        self.n_features = n_features
        self.mode = mode
        
        # Standardize if requested; reuse a caller-supplied scaler when given
        self.scaler = None
        if standardize:
            if scaler is None:
                self.scaler = StandardScaler()
                self.data = self.scaler.fit_transform(self.data)
            else:
                self.scaler = scaler
                self.data = self.scaler.transform(self.data)
        
        # Augmentation parameters
        self.augmentation_probability = augmentation_probability
        self.noise_scale = noise_scale
        self.scale_range = scale_range
        self.magnitude_range = magnitude_range
        self.time_shift_range = time_shift_range
        self.window_slice_probability = window_slice_probability
        self.window_warp_probability = window_warp_probability
        self.window_warp_scale_range = window_warp_scale_range
        
        # Row positions of each class
        self.pos_indices = np.where(self.targets == 1)[0]
        self.neg_indices = np.where(self.targets == 0)[0]
        if len(self.pos_indices) == 0 or len(self.neg_indices) == 0:
            raise ValueError(
                "targets must contain at least one sample of each class "
                f"(found {len(self.pos_indices)} positive and {len(self.neg_indices)} negative)"
            )
        
        # Inverse-frequency class weights: n_samples / (2 * class count)
        n_samples = len(self.targets)
        self.class_weights = {
            0: n_samples / (2 * len(self.neg_indices)),
            1: n_samples / (2 * len(self.pos_indices))
        }
        
        # Training batches use an even batch size
        self.batch_size = batch_size - (batch_size % 2) if mode == 'train' else batch_size
        
        # Positives per batch: a fixed third in training, the data's own ratio otherwise
        if mode == 'train':
            self.pos_samples_per_batch = max(self.batch_size // 3, 1)
        else:
            ratio = len(self.pos_indices) / len(self.targets)
            self.pos_samples_per_batch = max(1, int(round(self.batch_size * ratio)))
        
        self.neg_samples_per_batch = self.batch_size - self.pos_samples_per_batch
        if self.neg_samples_per_batch < 1:
            raise ValueError(
                f"batch_size={batch_size} leaves no room for negative samples "
                f"({self.pos_samples_per_batch} positives per batch); use a larger batch size"
            )
        
        # Batches per epoch: limited by whichever class runs out first
        self.steps = min(
            len(self.pos_indices) // self.pos_samples_per_batch,
            len(self.neg_indices) // self.neg_samples_per_batch
        )
        
        self._log_initialization()

    def _window_slice(self, sequence, track=True):
        """Randomly slice and resize a window of the sequence"""
        if np.random.random() > self.window_slice_probability:
            return sequence
            
        # Choose window size between 1/2 and full length
        window_size = np.random.randint(self.time_steps // 2, self.time_steps)
        # Choose random start point
        start_idx = np.random.randint(0, self.time_steps - window_size)
        
        # Extract window
        window = sequence[start_idx:start_idx + window_size].copy()
        
        # Resize window to original length
        augmented = np.zeros_like(sequence)
        # Use linear interpolation to resize
        for feature in range(sequence.shape[-1]):
            augmented[:, feature] = np.interp(
                np.linspace(0, window_size - 1, self.time_steps),
                np.arange(window_size),
                window[:, feature]
            )
        
        if track:
            self.augmentation_stats['window_slice'] += 1
        
        return augmented

    def _window_warp(self, sequence, track=True):
        """Apply warping to a random window of the sequence"""
        if np.random.random() > self.window_warp_probability:
            return sequence
            
        # Choose window size (1/4 to 1/2 of sequence)
        window_size = np.random.randint(self.time_steps // 4, self.time_steps // 2)
        # Choose random start point
        window_start = np.random.randint(0, self.time_steps - window_size)
        
        augmented = sequence.copy()
        # Get random warping scale
        warp_scale = np.random.uniform(
            self.window_warp_scale_range[0],
            self.window_warp_scale_range[1]
        )
        
        # Apply warping to each feature
        for feature in range(sequence.shape[-1]):
            window = sequence[window_start:window_start + window_size, feature]
            
            # Create warped window
            num_warped_points = int(window_size * warp_scale)
            warped_points = np.linspace(0, window_size - 1, num_warped_points)
            warped_window = np.interp(
                warped_points,
                np.arange(window_size),
                window
            )
            
            # Resize back to original window size
            warped_window = np.interp(
                np.arange(window_size),
                np.linspace(0, window_size - 1, len(warped_window)),
                warped_window
            )
            
            augmented[window_start:window_start + window_size, feature] = warped_window
        
        if track:
            self.augmentation_stats['window_warp'] += 1
        
        return augmented

    def _augment_sequence(self, sequence, track=True):
        """Apply all augmentation techniques"""
        if np.random.random() > self.augmentation_probability:
            if track:
                self.augmentation_stats['no_augmentation'] += 1
            return sequence

        augmented = sequence.copy()
        original = sequence.copy()
        augmentations_applied = []

        # Apply window operations first
        if np.random.random() > 0.5:
            augmented = self._window_slice(augmented, track)
            if track:
                augmentations_applied.append('window_slice')
                
        if np.random.random() > 0.5:
            augmented = self._window_warp(augmented, track)
            if track:
                augmentations_applied.append('window_warp')

        # Apply other augmentations
        if np.random.random() > 0.5:
            noise = np.random.normal(0, self.noise_scale, sequence.shape)
            augmented += noise
            if track:
                self.augmentation_stats['noise'] += 1
                augmentations_applied.append('noise')

        if np.random.random() > 0.5:
            scale_factor = np.random.uniform(self.scale_range[0], self.scale_range[1])
            augmented *= scale_factor
            if track:
                self.augmentation_stats['scaling'] += 1
                augmentations_applied.append('scaling')

        if np.random.random() > 0.5:
            shift = np.random.randint(self.time_shift_range[0], self.time_shift_range[1])
            augmented = np.roll(augmented, shift, axis=0)
            if track:
                self.augmentation_stats['time_shift'] += 1
                augmentations_applied.append('time_shift')

        if track and self.monitor_augmentations and len(augmentations_applied) > 0:
            self.sample_augmentations.append({
                'original': original,
                'augmented': augmented,
                'techniques': augmentations_applied
            })

        return augmented

    def __getitem__(self, idx):
        batch_x = np.zeros((self.batch_size, self.time_steps, self.n_features))
        batch_y = np.zeros(self.batch_size)
        
        if self.mode == 'train':
            # Random sampling for training
            pos_indices = np.random.choice(
                self.pos_indices,
                size=self.pos_samples_per_batch,
                replace=True
            )
            neg_indices = np.random.choice(
                self.neg_indices,
                size=self.neg_samples_per_batch,
                replace=True
            )
            
            # Fill and augment positive samples
            for i, start_idx in enumerate(pos_indices):
                if start_idx + self.time_steps <= len(self.data):
                    sequence = self.data[start_idx:start_idx + self.time_steps]
                    batch_x[i] = self._augment_sequence(sequence)
                    batch_y[i] = 1

            # Fill and augment negative samples
            for i, start_idx in enumerate(neg_indices):
                if start_idx + self.time_steps <= len(self.data):
                    sequence = self.data[start_idx:start_idx + self.time_steps]
                    batch_x[i + self.pos_samples_per_batch] = self._augment_sequence(sequence)
                    batch_y[i + self.pos_samples_per_batch] = 0

        else:  # Validation/Test mode
            # Sequential sampling
            pos_start = (idx * self.pos_samples_per_batch) % len(self.pos_indices)
            neg_start = (idx * self.neg_samples_per_batch) % len(self.neg_indices)
            
            pos_indices = self.pos_indices[pos_start:pos_start + self.pos_samples_per_batch]
            neg_indices = self.neg_indices[neg_start:neg_start + self.neg_samples_per_batch]
            
            # Fill without augmentation
            for i, start_idx in enumerate(pos_indices):
                if start_idx + self.time_steps <= len(self.data):
                    batch_x[i] = self.data[start_idx:start_idx + self.time_steps]
                    batch_y[i] = 1
            
            for i, start_idx in enumerate(neg_indices):
                if start_idx + self.time_steps <= len(self.data):
                    batch_x[i + self.pos_samples_per_batch] = self.data[start_idx:start_idx + self.time_steps]
                    batch_y[i + self.pos_samples_per_batch] = 0

        return batch_x, batch_y

    def __len__(self):
        """Return the number of batches per epoch, as stored in ``self.steps``."""
        return self.steps

    def get_class_weights(self):
        """Return the class weights stored on this generator (``self.class_weights``)."""
        return self.class_weights

    def visualize_augmentations(self, n_samples=5):
        """Visualize sample augmentations"""
        if not self.sample_augmentations:
            print("No augmentations to visualize yet.")
            return
        
        samples = self.sample_augmentations[:n_samples]
        fig, axes = plt.subplots(n_samples, 2, figsize=(12, 4*n_samples))
        
        for i, sample in enumerate(samples):
            # Plot original
            axes[i, 0].plot(sample['original'])
            axes[i, 0].set_title(f'Original Sequence {i+1}')
            
            # Plot augmented
            axes[i, 1].plot(sample['augmented'])
            axes[i, 1].set_title(f'Augmented ({"->".join(sample["techniques"])})')
        
        plt.tight_layout()
        return fig

    def get_augmentation_stats(self):
        """Get statistics about applied augmentations"""
        total = sum(self.augmentation_stats.values())
        if total == 0:
            return "No augmentations tracked yet."
        
        stats = {k: f"{v/total*100:.1f}%" for k, v in self.augmentation_stats.items()}
        return {
            'total_samples': total,
            'augmentation_distribution': stats,
            'parameters': {
                'probability': self.augmentation_probability,
                'noise_scale': self.noise_scale,
                'scale_range': self.scale_range,
                'magnitude_range': self.magnitude_range,
                'time_shift_range': self.time_shift_range,
                'window_slice_probability': self.window_slice_probability,
                'window_warp_probability': self.window_warp_probability,
                'window_warp_scale_range': self.window_warp_scale_range
            }
        }

    def _log_initialization(self):
        """Log initialization details"""
        print(f"\n{self.name} Generator Setup ({self.mode}):")
        print(f"Total samples: {len(self.targets)}")
        print(f"Positive samples: {len(self.pos_indices)} ({len(self.pos_indices)/len(self.targets):.3f})")
        print(f"Batch size: {self.batch_size} (Pos: {self.pos_samples_per_batch}, Neg: {self.neg_samples_per_batch})")
        print(f"Steps per epoch: {self.steps}")
        print(f"Class weights: {self.class_weights}")

In [None]:
# Batch/window geometry shared by every generator built in this cell.
shared_generator_kwargs = dict(
    batch_size=32,
    time_steps=30,
    n_features=X_train.shape[1],
)

# 1. Default training generator (balanced augmentation)
train_generator = MonitoredTimeSeriesGenerator(
    data=X_train,
    targets=y_train['target'],
    mode='train',
    name='default_train',
    augmentation_probability=0.5,
    noise_scale=0.01,
    scale_range=(0.95, 1.05),
    magnitude_range=(0.9, 1.1),
    time_shift_range=(-2, 2),
    window_slice_probability=0.3,
    window_warp_probability=0.3,
    window_warp_scale_range=(0.8, 1.2),
    **shared_generator_kwargs,
)

# 2. Validation generator (no augmentation)
val_generator = MonitoredTimeSeriesGenerator(
    data=X_val,
    targets=y_val['target'],
    mode='val',
    name='validation',
    monitor_augmentations=False,
    **shared_generator_kwargs,
)

# 3. Test generator (no augmentation)
test_generator = MonitoredTimeSeriesGenerator(
    data=X_test,
    targets=y_test['target'],
    mode='test',
    name='test',
    monitor_augmentations=False,
    **shared_generator_kwargs,
)

# 4. Aggressive augmentation generator: every setting is stronger than the default
train_generator_aggressive = MonitoredTimeSeriesGenerator(
    data=X_train,
    targets=y_train['target'],
    mode='train',
    name='aggressive_train',
    augmentation_probability=0.7,
    noise_scale=0.02,
    scale_range=(0.9, 1.1),
    magnitude_range=(0.8, 1.2),
    time_shift_range=(-3, 3),
    window_slice_probability=0.5,
    window_warp_probability=0.5,
    window_warp_scale_range=(0.7, 1.3),
    **shared_generator_kwargs,
)

# 5. Mild augmentation generator: every setting is weaker than the default
train_generator_mild = MonitoredTimeSeriesGenerator(
    data=X_train,
    targets=y_train['target'],
    mode='train',
    name='mild_train',
    augmentation_probability=0.3,
    noise_scale=0.005,
    scale_range=(0.98, 1.02),
    magnitude_range=(0.95, 1.05),
    time_shift_range=(-1, 1),
    window_slice_probability=0.2,
    window_warp_probability=0.2,
    window_warp_scale_range=(0.9, 1.1),
    **shared_generator_kwargs,
)

# Function to compare augmentation strategies
def compare_augmentation_strategies():
    """Print augmentation statistics and plot examples for each training generator.

    Works on the notebook-level ``train_generator``,
    ``train_generator_aggressive`` and ``train_generator_mild`` objects.
    """
    labelled_generators = (
        ('Default', train_generator),
        ('Aggressive', train_generator_aggressive),
        ('Mild', train_generator_mild),
    )

    for label, generator in labelled_generators:
        print(f"\n{label} Generator Augmentation Stats:")

        # Request batch 0 three times so the generator can accumulate statistics
        # NOTE(review): whether these calls are tracked depends on
        # `_augment_sequence`'s default `track` argument - confirm.
        for _ in range(3):
            generator[0]

        print(generator.get_augmentation_stats())

        # Visualize examples
        print(f"\nVisualizing {label} augmentations:")
        generator.visualize_augmentations(n_samples=2)
        plt.show()

# Compare the different strategies
compare_augmentation_strategies()

In [90]:
def verify_generators(generators_dict):
    """
    Print a diagnostic report for every generator in ``generators_dict``.

    For each ``label -> generator`` entry the first batch is fetched and its
    shapes, per-class counts and basic value statistics are printed.  For
    generators whose ``mode`` is ``'train'``, the first sequence of that batch
    is additionally passed through ``_augment_sequence`` five times and the
    mean absolute difference to the input is printed for each run.
    """
    print("\nGenerator Verification Report")
    print("=" * 50)

    for label, generator in generators_dict.items():
        print(f"\n{label} Generator Analysis:")
        print("-" * 30)

        # Everything below is derived from the first batch only
        features, labels = generator[0]

        print("Batch shapes:")
        print(f"X: {features.shape}")
        print(f"y: {labels.shape}")

        # How many rows of the batch belong to each class
        classes, class_counts = np.unique(labels, return_counts=True)
        print("\nClass distribution in batch:")
        for position in range(len(classes)):
            class_value = classes[position]
            n_rows = class_counts[position]
            print(f"Class {int(class_value)}: {n_rows} samples ({n_rows/len(labels):.2%})")

        print("\nBatch statistics:")
        print(f"X mean: {features.mean():.3f}")
        print(f"X std: {features.std():.3f}")
        print(f"X range: [{features.min():.3f}, {features.max():.3f}]")

        if generator.mode != 'train':
            continue

        # Training generators only: see how far repeated augmentation moves one sequence
        print("\nAugmentation test (5 samples):")
        reference = features[0]
        for sample_no in range(1, 6):
            variant = generator._augment_sequence(reference)
            mean_abs_diff = np.mean(np.abs(reference - variant))
            print(f"Sample {sample_no} mean abs difference: {mean_abs_diff:.3f}")

# Create dictionary of generators
generators = {
    'Regular Training': train_generator,
    'Aggressive Aug Training': train_generator_aggressive,
    'Mild Aug Training': train_generator_mild,
    'Validation': val_generator,
    'Test': test_generator
}

# Verify all generators
verify_generators(generators)


Generator Verification Report

Regular Training Generator Analysis:
------------------------------
Batch shapes:
X: (32, 30, 1846)
y: (32,)

Class distribution in batch:
Class 0: 28 samples (87.50%)
Class 1: 4 samples (12.50%)

Batch statistics:
X mean: 0.155
X std: 0.211
X range: [-0.037, 1.084]

Augmentation test (5 samples):
Sample 1 mean abs difference: 0.009
Sample 2 mean abs difference: 0.000
Sample 3 mean abs difference: 0.000
Sample 4 mean abs difference: 0.012
Sample 5 mean abs difference: 0.006

Aggressive Aug Training Generator Analysis:
------------------------------
Batch shapes:
X: (32, 30, 1846)
y: (32,)

Class distribution in batch:
Class 0: 28 samples (87.50%)
Class 1: 4 samples (12.50%)

Batch statistics:
X mean: 0.153
X std: 0.210
X range: [-0.084, 1.169]

Augmentation test (5 samples):
Sample 1 mean abs difference: 0.024
Sample 2 mean abs difference: 0.000
Sample 3 mean abs difference: 0.000
Sample 4 mean abs difference: 0.000
Sample 5 mean abs difference: 0.000

M

In [91]:
def test_augmentation_settings(generator, name, n_samples=5):
    print(f"\nTesting {name} augmentation settings:")
    x, y = generator[0]  # Get first batch
    
    # Get original and augmented sequences
    original = x[0]  # First sequence in batch
    augmented_sequences = [generator._augment_sequence(original) for _ in range(n_samples)]
    
    # Print statistics
    print(f"Original sequence range: [{original.min():.3f}, {original.max():.3f}]")
    for i, aug in enumerate(augmented_sequences):
        print(f"Augmented {i+1} range: [{aug.min():.3f}, {aug.max():.3f}]")
        print(f"Mean absolute difference: {np.mean(np.abs(original - aug)):.3f}")

# Test different configurations
test_augmentation_settings(train_generator, "Default Settings")
test_augmentation_settings(train_generator_aggressive, "Aggressive Settings")
test_augmentation_settings(train_generator_mild, "Mild Settings")


Testing Default Settings augmentation settings:
Original sequence range: [-0.042, 1.133]
Augmented 1 range: [-0.042, 1.133]
Mean absolute difference: 0.000
Augmented 2 range: [-0.057, 1.194]
Mean absolute difference: 0.013
Augmented 3 range: [-0.042, 1.119]
Mean absolute difference: 0.002
Augmented 4 range: [-0.041, 1.096]
Mean absolute difference: 0.020
Augmented 5 range: [-0.044, 1.175]
Mean absolute difference: 0.007

Testing Aggressive Settings augmentation settings:
Original sequence range: [0.000, 0.919]
Augmented 1 range: [0.000, 0.906]
Mean absolute difference: 0.002
Augmented 2 range: [0.000, 0.919]
Mean absolute difference: 0.000
Augmented 3 range: [-0.069, 1.054]
Mean absolute difference: 0.027
Augmented 4 range: [0.000, 1.093]
Mean absolute difference: 0.028
Augmented 5 range: [-0.073, 0.971]
Mean absolute difference: 0.016

Testing Mild Settings augmentation settings:
Original sequence range: [0.000, 1.000]
Augmented 1 range: [0.000, 1.000]
Mean absolute difference: 0.000

In [92]:
def verify_generator_distribution(gen, name, num_batches=5):
    """Print the class balance of the first ``num_batches`` batches of ``gen``.

    Args:
        gen: Object whose ``gen[i]`` returns an ``(x, y)`` batch.
        name: Label used in the printed header.
        num_batches: Number of consecutive batches (starting at 0) to inspect.
    """
    print(f"\nTesting {name} Generator:")

    # Sanity-check shape and value range on the first batch
    print("Checking first batch shape and values...")
    probe_x, probe_y = gen[0]
    print(f"X shape: {probe_x.shape}")
    print(f"y shape: {probe_y.shape}")
    print(f"X value range: [{probe_x.min():.3f}, {probe_x.max():.3f}]")

    pos_total = neg_total = sample_total = 0
    for batch_no in range(num_batches):
        features, labels = gen[batch_no]
        batch_len = len(labels)
        n_pos = np.sum(labels == 1)
        n_neg = np.sum(labels == 0)

        pos_total += n_pos
        neg_total += n_neg
        sample_total += batch_len

        print(f"Batch {batch_no}:")
        print(f"  Size={batch_len}")
        print(f"  Positive samples={n_pos} ({n_pos/batch_len:.3f})")
        print(f"  Negative samples={n_neg} ({n_neg/batch_len:.3f})")
        print(f"  Data shape={features.shape}")

    print("\nOverall Distribution Summary:")
    print(f"Total samples: {sample_total}")
    print(f"Positive samples: {pos_total} ({pos_total/sample_total:.3f})")
    print(f"Negative samples: {neg_total} ({neg_total/sample_total:.3f})")

# Run verification
for name, gen in [('Training', train_generator), 
                  ('Validation', val_generator), 
                  ('Test', test_generator)]:
    verify_generator_distribution(gen, name)


Testing Training Generator:
Checking first batch shape and values...
X shape: (32, 30, 1846)
y shape: (32,)
X value range: [-0.040, 1.101]
Batch 0:
  Size=32
  Positive samples=4 (0.125)
  Negative samples=28 (0.875)
  Data shape=(32, 30, 1846)
Batch 1:
  Size=32
  Positive samples=4 (0.125)
  Negative samples=28 (0.875)
  Data shape=(32, 30, 1846)
Batch 2:
  Size=32
  Positive samples=4 (0.125)
  Negative samples=28 (0.875)
  Data shape=(32, 30, 1846)
Batch 3:
  Size=32
  Positive samples=4 (0.125)
  Negative samples=28 (0.875)
  Data shape=(32, 30, 1846)
Batch 4:
  Size=32
  Positive samples=4 (0.125)
  Negative samples=28 (0.875)
  Data shape=(32, 30, 1846)

Overall Distribution Summary:
Total samples: 160
Positive samples: 20 (0.125)
Negative samples: 140 (0.875)

Testing Validation Generator:
Checking first batch shape and values...
X shape: (32, 30, 1846)
y shape: (32,)
X value range: [0.000, 1.000]
Batch 0:
  Size=32
  Positive samples=5 (0.156)
  Negative samples=27 (0.844)
  

In [93]:
a,b = train_generator[200]


In [94]:
a.shape

(32, 30, 1846)

In [95]:
def focal_loss(gamma=4.0, alpha=0.85):
    """
    Binary focal loss for a highly imbalanced dataset.

    gamma: Focusing exponent; larger values shrink the contribution of
           samples the model already classifies confidently.
    alpha: Weight of the positive-class term; the negative-class term is
           weighted by (1 - alpha).  The default 0.85 up-weights the rare
           positive class (roughly 13% of the training targets), i.e. it is
           close to the negative-class share, not the positive one.

    Returns a function ``loss(y_true, y_pred)`` that expects probabilities in
    ``y_pred`` and returns the mean focal loss over all elements.
    """
    def focal_loss_fixed(y_true, y_pred):
        epsilon = 1e-7
        # Keep the logarithms finite
        y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)
        # Labels may arrive as integers or float64; align them with the predictions
        y_true = tf.cast(y_true, y_pred.dtype)

        # Positive samples are penalised for predicting a low probability ...
        positive_term = -alpha * y_true * tf.math.pow(1 - y_pred, gamma) * tf.math.log(y_pred)
        # ... and negative samples for predicting a high one.  Without this
        # term, predicting 1 for every sample would drive the loss to zero.
        negative_term = -(1 - alpha) * (1 - y_true) * tf.math.pow(y_pred, gamma) * tf.math.log(1 - y_pred)

        return tf.reduce_mean(positive_term + negative_term)
    return focal_loss_fixed


In [None]:
parameters = {
    'units_0': 512,  # Good size
    'units_1': 256, 
    'units_2': 128,
    'activation_fxn': 'gelu',  # Try GELU instead of tanh
    'recurrent_activation_fxn': 'sigmoid',
    'kr': tf.keras.regularizers.L2(0.0005),  # Reduced regularization
    'rr': tf.keras.regularizers.L2(0.0005),
    'br': tf.keras.regularizers.L2(0.0001),
    'timestep': 30,
    'input_shape': X_train.shape[1],
    'dropout_rate': 0.3,  # Reduced dropout
    'activation_fxn_2': 'sigmoid'
}

In [97]:
def create_lstm_model(parameters):
    """Build and compile the stacked-LSTM binary classifier.

    The network is three LSTM layers (each followed by batch normalisation
    and dropout) and a single-unit dense output layer.  It is compiled with
    AdamW on a cosine-decay-with-restarts learning-rate schedule and the
    focal loss defined in this notebook.

    Args:
        parameters: Dict with the keys ``units_0``, ``units_1``, ``units_2``,
            ``activation_fxn``, ``recurrent_activation_fxn``, ``kr``, ``rr``,
            ``br``, ``timestep``, ``input_shape``, ``dropout_rate`` and
            ``activation_fxn_2``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Force float32 globally to avoid dtype mismatches
    tf.keras.mixed_precision.set_global_policy('float32')

    lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
        initial_learning_rate=0.0001,  # learning rate at the start of the first cycle
        first_decay_steps=1000,        # length of the first decay cycle, in steps
        t_mul=2.0,                     # each following cycle is this many times longer
        m_mul=0.9,                     # each following cycle restarts at this fraction of the previous peak
        alpha=0.0001                   # floor of the schedule, relative to initial_learning_rate
    )

    # Unpack the configuration
    units_0 = parameters['units_0']
    activation_fxn = parameters['activation_fxn']
    recurrent_activation_fxn = parameters['recurrent_activation_fxn']
    kr = parameters['kr']
    rr = parameters['rr']
    br = parameters['br']
    timestep = parameters['timestep']
    input_shape = parameters['input_shape']
    units_1 = parameters['units_1']
    units_2 = parameters['units_2']
    dropout_rate = parameters['dropout_rate']
    activation_fxn_2 = parameters['activation_fxn_2']

    # Settings that are identical for all three LSTM layers
    lstm_common = dict(
        activation=activation_fxn,
        recurrent_activation=recurrent_activation_fxn,
        use_bias=True,
        kernel_initializer='glorot_uniform',
        recurrent_initializer='orthogonal',
        bias_initializer='zeros',
        kernel_regularizer=kr,
        recurrent_regularizer=rr,
        bias_regularizer=br,
        unit_forget_bias=True,
    )

    # Per-layer differences: width, whether the full sequence is passed on,
    # and the input shape on the first layer only
    lstm_stack = (
        dict(units=units_0, return_sequences=True, input_shape=(timestep, input_shape)),
        dict(units=units_1, return_sequences=True),
        dict(units=units_2, return_sequences=False),
    )

    model = Sequential()
    for layer_kwargs in lstm_stack:
        model.add(LSTM(**lstm_common, **layer_kwargs))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # Output layer
    model.add(Dense(units=1, activation=activation_fxn_2, kernel_regularizer=kr))

    # AdamW optimizer with gradient-norm clipping, focal loss
    optimizer = AdamW(
        learning_rate=lr_schedule,
        weight_decay=0.005,
        clipnorm=0.5
    )
    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
        tf.keras.metrics.F1Score(threshold=0.5)
    ]
    model.compile(
        optimizer=optimizer,
        loss=focal_loss(gamma=4, alpha=0.85),
        metrics=tracked_metrics
    )

    return model


In [98]:
# from sklearn.utils import class_weight
# import numpy as np

# # Assuming 'y_train' contains the class labels (0s and 1s)
# classes = np.unique(y_train)  # Unique class labels in your target data

# # Compute class weights
# class_weights = class_weight.compute_class_weight(
#     class_weight='balanced',  # Set to 'balanced' to compute weights inversely proportional to class frequencies
#     classes=classes,  # Provide the unique class labels
#     y=y_train['target']  # The target labels
# )

# # Convert to dictionary format {class_label: weight}
# class_weight_dict = dict(zip(classes, class_weights))




In [99]:
# from sklearn.utils import class_weight
# import numpy as np

# # Get class distribution
# n_samples = len(y_train)
# n_classes = len(np.unique(y_train))
# counts = y_train['target'].value_counts()

# # Calculate custom weights
# minority_weight = (n_samples / (n_classes * counts[1])) * 4  # Multiply by 3 for more emphasis
# majority_weight = n_samples / (n_classes * counts[0])

# class_weight_dict = {
#     0: majority_weight,
#     1: minority_weight
# }

In [100]:
class_weight_dict

NameError: name 'class_weight_dict' is not defined

In [101]:
y_train["target"].value_counts()

target
0    92489
1    14308
Name: count, dtype: int64

In [102]:
# # First, define a custom precision metric
# class CustomPrecision(tf.keras.metrics.Precision):
#     def __init__(self, name='custom_precision', **kwargs):
#         super().__init__(name=name, **kwargs)
    
#     def update_state(self, y_true, y_pred, sample_weight=None):
#         y_true = tf.cast(y_true, tf.float32)
#         y_pred = tf.cast(y_pred, tf.float32)
#         return super().update_state(y_true, y_pred, sample_weight)


In [103]:
from tensorflow.keras.layers import Bidirectional, LSTM
# If using just keras:
# from keras.layers import Bidirectional, LSTM

# Full imports you might need:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Bidirectional,
    LSTM,
    Dense,
    Dropout,
    BatchNormalization
)
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import (
    TensorBoard, 
    CSVLogger, 
    EarlyStopping, 
    ReduceLROnPlateau,
    ModelCheckpoint
)

In [104]:
from datetime import datetime
from tensorflow.keras.callbacks import TensorBoard, CSVLogger, EarlyStopping, ReduceLROnPlateau
import os
# Create log directory
log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
os.makedirs(log_dir, exist_ok=True)

# Updated callbacks
callbacks = [
    EarlyStopping(
        monitor='val_f1_score',
        mode='max',
        patience=10,
        restore_best_weights=True
    ),
    # Remove ReduceLROnPlateau since we're using lr_schedule
    ModelCheckpoint(
        'best_model.h5',
        monitor='val_f1_score',
        mode='max',
        save_best_only=True
    ),
    CSVLogger(
        'training_log.csv',
        append=True
    )
]

model = create_lstm_model(parameters)
history = model.fit(
    train_generator_aggressive,
    validation_data=val_generator,
    epochs=50,
    callbacks=callbacks
)

# Enhanced prediction function with progress bar
from tqdm import tqdm
def get_predictions(model, generator):
    """Run ``model`` over every batch of ``generator`` and collect the results.

    Returns:
        Tuple ``(predictions, true_values)`` of 1-D arrays holding the
        flattened model outputs and the labels of all batches, in batch order.
    """
    predictions, true_values = [], []

    n_batches = len(generator)
    for batch_no in tqdm(range(n_batches), desc="Getting predictions"):
        batch_x, batch_y = generator[batch_no]
        batch_pred = model.predict(batch_x, verbose=0)
        predictions.extend(batch_pred.flatten())
        true_values.extend(batch_y)

    return np.array(predictions), np.array(true_values)

# Find optimal threshold
def find_optimal_threshold(y_true, y_pred):
    """Return the ``(threshold, f1)`` pair that maximises F1 on the precision-recall curve."""
    prec, rec, candidate_thresholds = precision_recall_curve(y_true, y_pred)

    # F1 at every point of the curve; the small constant avoids 0/0
    numerator = 2 * (prec * rec)
    denominator = prec + rec + 1e-7
    f1_curve = numerator / denominator

    best = np.argmax(f1_curve)
    return candidate_thresholds[best], f1_curve[best]

# Plot metrics
def plot_metrics(y_true, y_pred, title_prefix=""):
    """Draw ROC curve, precision-recall curve and score histograms in one figure.

    The figure is saved as ``<title_prefix lower-cased, spaces -> underscores>_metrics.png``
    and then shown.

    Args:
        y_true: Array of binary ground-truth labels.
        y_pred: Array of predicted scores, same length as ``y_true``.
        title_prefix: Text prepended to each panel title and used for the file name.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)

    fig, (roc_ax, pr_ax, dist_ax) = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: ROC curve with the chance diagonal for reference
    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'{title_prefix} ROC Curve')
    roc_ax.legend(loc="lower right")

    # Panel 2: precision-recall curve
    precisions, recalls, _ = precision_recall_curve(y_true, y_pred)
    pr_ax.plot(recalls, precisions, color='blue', lw=2)
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title(f'{title_prefix} Precision-Recall Curve')

    # Panel 3: score distribution per true class
    for class_value, class_label in ((0, 'Negative class'), (1, 'Positive class')):
        dist_ax.hist(y_pred[y_true==class_value], bins=50, alpha=0.5, label=class_label, density=True)
    dist_ax.set_xlabel('Prediction value')
    dist_ax.set_ylabel('Density')
    dist_ax.set_title(f'{title_prefix} Prediction Distribution')
    dist_ax.legend()

    fig.tight_layout()
    fig.savefig(f'{title_prefix.lower().replace(" ", "_")}_metrics.png')
    plt.show()

# Get predictions
print("Getting validation predictions...")
val_pred, val_true = get_predictions(model, val_generator)
print("\nGetting test predictions...")
test_pred, test_true = get_predictions(model, test_generator)

# Find optimal threshold on validation set
optimal_threshold, best_f1 = find_optimal_threshold(val_true, val_pred)
print(f"\nOptimal threshold: {optimal_threshold:.3f} (F1: {best_f1:.3f})")

# precision_recall_curve evaluates each threshold t with `score >= t` counted
# as positive, so the threshold has to be applied inclusively here as well;
# a strict `>` would drop the samples sitting exactly on the threshold and the
# reported metrics would not match the F1 the threshold was selected for.
val_labels = (val_pred >= optimal_threshold).astype(int)
test_labels = (test_pred >= optimal_threshold).astype(int)

# Print metrics with optimal threshold
print("\nValidation Metrics:")
print(classification_report(val_true, val_labels))
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(val_true, val_labels))

print("\nTest Metrics:")
print(classification_report(test_true, test_labels))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(test_true, test_labels))

# Plot metrics
plot_metrics(val_true, val_pred, "Validation")
plot_metrics(test_true, test_pred, "Test")

# Plot training history: one panel per metric, training vs. validation curve
history_panels = (
    # (training key, validation key, training label, validation label, title, y-axis label)
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    ('f1_score', 'val_f1_score', 'Training F1', 'Validation F1', 'Model F1 Score', 'F1 Score'),
)

history_fig, history_axes = plt.subplots(1, 2, figsize=(15, 5))
for panel_ax, panel in zip(history_axes, history_panels):
    train_key, val_key, train_label, val_label, panel_title, y_label = panel
    panel_ax.plot(history.history[train_key], label=train_label)
    panel_ax.plot(history.history[val_key], label=val_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.legend()

history_fig.tight_layout()
history_fig.savefig('training_history.png')
plt.show()

Epoch 1/50

  saving_api.save_model(


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

KeyboardInterrupt: 

In [69]:
# Train a fresh model on the default training generator with class weighting.
# NOTE(review): this cell carries an older execution count than the cells
# around it and uses `class_weight_dict`, `early_stopping`, `csv_logger` and
# `tensorboard_callback`, none of which is defined in the visible cells (the
# code that built `class_weight_dict` is commented out and displaying it raises
# NameError) - confirm where these come from before re-running.
model = create_lstm_model(parameters)
history = model.fit(
    train_generator,   # Generator object
    validation_data=val_generator,  # Validation generator
    epochs=50,         # Number of epochs
    class_weight=class_weight_dict,  # Class weights to handle imbalance
    steps_per_epoch=len(train_generator),  # Total steps per epoch
    validation_steps=len(val_generator),
    callbacks=[early_stopping, csv_logger, tensorboard_callback],
    verbose=1          # Verbose level for logging training progress
)


2024-10-30 06:33:19.172028: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907


Epoch 1/50


2024-10-30 06:33:32.628836: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fcabc17a290 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-10-30 06:33:32.628886: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6
2024-10-30 06:33:32.651589: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1730270012.773791     128 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Data -> 1. Training, Val, Test


In [25]:
import pandas as pd
import numpy as np
import gc
import logging
from tensorflow.keras.optimizers import (
    Adam, SGD, Nadam, RMSprop, Adadelta, Adagrad, Adamax, Ftrl, SGD, AdamW)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping  # Import EarlyStopping


logging.basicConfig(
    filename='training_log.log',  # log output file
    level=logging.INFO,  # Correct logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # log format
    datefmt='%y-%m-%d %H:%M:%S'  # date format
)



optimizers = {
    # 'Adam': Adam(learning_rate=0.0009),
    # 'SGD': SGD(learning_rate=0.0009),
    # 'Nadam': Nadam(learning_rate=0.0009),
    # 'RMSprop': RMSprop(learning_rate=0.0009),
    # 'Adadelta': Adadelta(learning_rate=0.0009),
    # 'Adagrad': Adagrad(learning_rate=0.0009),
    # 'Adamax': Adamax(learning_rate=0.0009),
    # 'FTRL': Ftrl(learning_rate=0.0009),
    # 'SGDW': SGD(learning_rate=0.0009, weight_decay=0.01),
    # 'AdamW-0.0005': AdamW(learning_rate=0.0005, weight_decay=0.01),
    'AdamW-0.0006': AdamW(learning_rate=0.0006, weight_decay=0.01)
}
# Generator class for batch processing
class DataGenerator(Sequence):
    """Serve a DataFrame in fixed-size row batches for autoencoder training.

    Every batch is returned as ``(rows, rows)`` because the autoencoder's
    target is its own input.
    """

    def __init__(self, data, batch_size=128):
        self.data = data
        self.batch_size = batch_size
        # Row positions in their original order (no shuffling)
        self.indices = np.arange(len(data))

    def __len__(self):
        # Number of batches; the last one may be smaller than batch_size
        n_rows = len(self.data)
        return int(np.ceil(n_rows / self.batch_size))

    def __getitem__(self, index):
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        row_positions = self.indices[start:stop]
        # .iloc selects by position, independent of the DataFrame's index labels
        rows = self.data.iloc[row_positions].values
        return rows, rows

# Function to build the autoencoder model
def build_autoencoder(input_dim, encoding_dim, optimizer):
    """Build a symmetric dense autoencoder and the encoder that shares its layers.

    Args:
        input_dim: Number of input (and reconstructed) features.
        encoding_dim: Width of the bottleneck layer.
        optimizer: Optimizer used to compile the autoencoder.

    Returns:
        Tuple ``(autoencoder, encoder)``; the autoencoder is compiled with MSE
        loss, the encoder maps the input to the bottleneck activations.
    """
    inputs = Input(shape=(input_dim,))

    # Encoder: 512 -> 256 -> encoding_dim (bottleneck), ReLU throughout
    hidden = inputs
    for width in (512, 256, encoding_dim):
        hidden = Dense(width, activation="relu")(hidden)
    bottleneck = hidden

    # Decoder mirrors the encoder: 256 -> 512, then a sigmoid layer back to input_dim
    for width in (256, 512):
        hidden = Dense(width, activation="relu")(hidden)
    reconstruction = Dense(input_dim, activation='sigmoid')(hidden)

    autoencoder = Model(inputs=inputs, outputs=reconstruction)
    autoencoder.compile(optimizer=optimizer, loss="mse")

    encoder_model = Model(inputs=inputs, outputs=bottleneck)
    return autoencoder, encoder_model

# Function to train the autoencoder with batch learning
def train_autoencoder(autoencoder, data, epochs=35, batch_size=128):
    """Fit ``autoencoder`` on ``data`` with a 90/10 train/validation split.

    Training stops early when the validation loss has not improved for five
    epochs, and the best weights are restored.

    Returns:
        The Keras ``History`` object returned by ``fit``.
    """
    # Fixed random_state keeps the split reproducible
    train_rows, val_rows = train_test_split(data, test_size=0.1, random_state=42)

    stop_early = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    return autoencoder.fit(
        DataGenerator(train_rows, batch_size=batch_size),
        epochs=epochs,
        validation_data=DataGenerator(val_rows, batch_size=batch_size),
        callbacks=[stop_early],
    )

# Function to evaluate the model and find the optimal dimensionality
def find_optimal_dimension(df_scaled, optimizers, epochs=35, batch_size=128):
    """Train one autoencoder per (encoding size, optimizer) pair and keep the best encoder.

    Each candidate is built with ``build_autoencoder``, trained with ``train_autoencoder`` and
    ranked by the mean squared reconstruction error computed over all rows of ``df_scaled``.

    Args:
        df_scaled: DataFrame of model inputs (``.shape[1]`` and ``.values`` are used).
            Presumably already scaled, as the name suggests -- not verified here.
        optimizers: Mapping of display name -> optimizer handed to ``build_autoencoder``.
        epochs: Maximum epochs per candidate.
        batch_size: Batch size used for training and for the reconstruction pass.

    Returns:
        Tuple ``(optimal_dim, encoder)`` where ``encoder`` is the *trained* encoder of the
        candidate with the lowest reconstruction MSE (first one wins on ties).

    Raises:
        ValueError: If ``optimizers`` is empty, since there is nothing to evaluate.
    """
    if not optimizers:
        raise ValueError("`optimizers` must contain at least one optimizer to evaluate")

    input_dim = df_scaled.shape[1]
    encoding_dims = [60]  # You can expand this list to try other dimensions
    # NOTE(review): if more sizes are added, the same optimizer object is reused for several
    # models -- confirm that the installed Keras version allows that.

    targets = df_scaled.values  # materialised once; reused for prediction and scoring

    errors = []
    best_mse = None
    best_encoder = None

    for encoding_dim in encoding_dims:
        for optimizer_name, optimizer in optimizers.items():
            logging.info(f"Training with optimizer: {optimizer_name}")
            autoencoder, encoder = build_autoencoder(input_dim, encoding_dim, optimizer)
            train_autoencoder(autoencoder, df_scaled, epochs=epochs, batch_size=batch_size)

            # A single predict call batches internally; calling predict once per batch from a
            # Python loop is slow and floods the notebook with progress bars.
            decoded_data = autoencoder.predict(targets, batch_size=batch_size)

            # Calculate reconstruction error
            mse = mean_squared_error(targets, decoded_data)
            errors.append((encoding_dim, optimizer_name, mse))
            logging.info(f"Encoding Dim: {encoding_dim}, Optimizer: {optimizer_name}, MSE: {mse}")

            # Keep the trained encoder of the best candidate so far. The strict "<" matches
            # min() below, which also keeps the first of several equal minima.
            if best_mse is None or mse < best_mse:
                best_mse, best_encoder = mse, encoder

            # Free up memory: only the best encoder stays referenced.
            del autoencoder, encoder, decoded_data
            gc.collect()

    # Find the encoding dimension with the smallest error
    optimal_dim, optimal_optimizer_name, min_mse = min(errors, key=lambda x: x[2])
    logging.info(f"Optimal Encoding Dimension: {optimal_dim}, Optimizer: {optimal_optimizer_name}, MSE: {min_mse}")

    # Return the trained encoder that produced the minimum error. Rebuilding a model here
    # would hand back randomly initialised weights.
    return optimal_dim, best_encoder



# Find the optimal dimension.
# `df` is passed as the model input frame (assumed to be the scaled feature table -- TODO confirm)
# and `optimizers` must be a name -> optimizer mapping, because the function iterates `.items()`.
# Both names are defined outside this cell. The call trains one autoencoder per optimizer.
optimal_dim, optimal_encoder = find_optimal_dimension(df, optimizers, epochs=35, batch_size=128)



In [26]:

# Print the whole contents of 'training_log.log' into the cell output.
# NOTE(review): presumably this is the file the `logging.info` calls write to; the logging
# handler configuration is not part of this cell -- confirm.
with open('training_log.log', 'r') as f:
    print(f.read())


24-09-09 07:37:27 - INFO - Training with optimizer: Adam
24-09-09 07:42:58 - INFO - Encoding Dim: 60, Optimizer: Adam, MSE: 0.00039134073902094736
24-09-09 07:42:58 - INFO - Training with optimizer: SGD
24-09-09 07:48:10 - INFO - Encoding Dim: 60, Optimizer: SGD, MSE: 0.019344559345294512
24-09-09 07:48:10 - INFO - Training with optimizer: Nadam
24-09-09 07:52:58 - INFO - Encoding Dim: 60, Optimizer: Nadam, MSE: 0.0004165755366704315
24-09-09 07:52:58 - INFO - Training with optimizer: RMSprop
24-09-09 07:58:02 - INFO - Encoding Dim: 60, Optimizer: RMSprop, MSE: 0.001750231789881493
24-09-09 07:58:03 - INFO - Training with optimizer: Adadelta
24-09-09 08:03:13 - INFO - Encoding Dim: 60, Optimizer: Adadelta, MSE: 0.02012077012827505
24-09-09 08:03:13 - INFO - Training with optimizer: Adagrad
24-09-09 08:08:17 - INFO - Encoding Dim: 60, Optimizer: Adagrad, MSE: 0.015115404734940944
24-09-09 08:08:18 - INFO - Training with optimizer: Adamax
24-09-09 08:13:20 - INFO - Encoding Dim: 60, Opti

In [37]:
import logging
from tensorflow.keras.optimizers import AdamW  # Import AdamW optimizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau  # Early stopping
import pandas as pd
import numpy as np
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import LeakyReLU

# Generator class for batch processing
class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves a DataFrame in fixed-order batches for an autoencoder.

    Every item is the pair ``(batch, batch)``: the same array is used as input and target.
    The row order is never shuffled by this class.

    Args:
        data: DataFrame to serve; rows are selected positionally with ``.iloc``.
        batch_size: Maximum number of rows per batch (the last batch may be smaller).
        **kwargs: Forwarded to the ``Sequence`` base constructor. Keras 3 accepts
            ``workers``, ``use_multiprocessing`` and ``max_queue_size`` here; leave it
            empty on older Keras versions whose base class takes no arguments.
    """

    def __init__(self, data, batch_size=128, **kwargs):
        # The base constructor must run: Keras 3 warns when a PyDataset/Sequence subclass
        # skips it, and it is where the worker/queue settings are stored.
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size
        self.indices = np.arange(len(data))  # positional row numbers, kept in original order

    def __len__(self):
        """Return the number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(inputs, targets)`` with identical arrays."""
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_data = self.data.iloc[batch_indices].values
        return batch_data, batch_data

# Function to build the autoencoder model with customizable activations
def build_autoencoder(input_dim, encoding_dim, optimizer, activation_fn_encoder='LeakyReLU', activation_fn_decoder='linear'):
    """Create a dense autoencoder with configurable activations, plus its encoder sub-model.

    Layer widths are ``input_dim -> 512 -> 256 -> encoding_dim -> 256 -> 512 -> input_dim``.

    Args:
        input_dim: Number of features per sample.
        encoding_dim: Width of the bottleneck layer.
        optimizer: Passed unchanged to ``Model.compile`` of the autoencoder.
        activation_fn_encoder: Activation given to all three encoder ``Dense`` layers.
            NOTE(review): whether the string 'LeakyReLU' is accepted depends on the
            installed Keras version -- confirm.
        activation_fn_decoder: Activation given to all three decoder ``Dense`` layers.
            With the default 'linear' the decoder contains no non-linearity.

    Returns:
        Tuple ``(autoencoder, encoder)``. Only the autoencoder is compiled
        (loss ``"mse"``, metric ``'mse'``).
    """
    net_input = Input(shape=(input_dim,))

    # Encoder layers: narrow step by step down to the bottleneck.
    latent = net_input
    for width in (512, 256, encoding_dim):
        latent = Dense(width, activation=activation_fn_encoder)(latent)

    # Decoder layers: widen back up to the original feature count.
    rebuilt = latent
    for width in (256, 512, input_dim):
        rebuilt = Dense(width, activation=activation_fn_decoder)(rebuilt)

    full_model = Model(inputs=net_input, outputs=rebuilt)
    full_model.compile(optimizer=optimizer, loss="mse", metrics=['mse'])

    encoder_model = Model(inputs=net_input, outputs=latent)
    return full_model, encoder_model



# Train the model and save it
def train_and_save_autoencoder(df, encoding_dim=60, epochs=35, batch_size=128, activation_fn_encoder='LeakyReLU', activation_fn_decoder='linear',
                               autoencoder_path='autoencoder_model.h5', encoder_path='encoder_model.h5',
                               learning_rate=0.001, weight_decay=0.01):
    """Train an AdamW-optimised autoencoder on ``df`` and save it together with its encoder.

    Args:
        df: Input frame; ``df.shape[1]`` gives the feature count. It is split 90/10 into
            training and validation parts with a fixed random seed.
        encoding_dim: Width of the bottleneck layer.
        epochs: Maximum number of training epochs.
        batch_size: Batch size for both data generators.
        activation_fn_encoder: Activation for the encoder layers.
        activation_fn_decoder: Activation for the decoder layers.
        autoencoder_path: File the full autoencoder is saved to.
        encoder_path: File the encoder sub-model is saved to.
        learning_rate: Initial AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        The validation loss of the last epoch that ran.
    """
    input_dim = df.shape[1]
    optimizer = AdamW(learning_rate=learning_rate, weight_decay=weight_decay)  # Using AdamW optimizer

    # Build the autoencoder
    autoencoder, encoder = build_autoencoder(
        input_dim, encoding_dim, optimizer,
        activation_fn_encoder=activation_fn_encoder,
        activation_fn_decoder=activation_fn_decoder
    )

    # Train the model
    x_train, x_val = train_test_split(df, test_size=0.1, random_state=42)
    train_generator = DataGenerator(x_train, batch_size=batch_size)
    val_generator = DataGenerator(x_val, batch_size=batch_size)

    # Stop after 5 epochs without val_loss improvement; halve the learning rate after 3.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

    history = autoencoder.fit(train_generator,
                              epochs=epochs,
                              validation_data=val_generator,
                              callbacks=[early_stopping, reduce_lr])

    # Log the final loss
    val_losses = history.history['val_loss']
    final_loss = val_losses[-1]
    logging.info(f"Final Validation Loss: {final_loss}")
    # With restore_best_weights=True the saved weights may belong to the best epoch rather
    # than the last one, so record that value too.
    logging.info(f"Best Validation Loss: {min(val_losses)}")

    # Save the autoencoder and encoder
    autoencoder.save(autoencoder_path)
    encoder.save(encoder_path)

    return final_loss

# Assuming `df` is your DataFrame with scaled data (it is defined outside this cell).
# Besides returning the last-epoch validation loss, this call writes 'autoencoder_model.h5'
# and 'encoder_model.h5' to the working directory.
final_loss = train_and_save_autoencoder(df, encoding_dim=60, epochs=35, batch_size=128, activation_fn_encoder='LeakyReLU', activation_fn_decoder='linear')


Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35




In [38]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model

# Load the saved encoder model from 'encoder_model.h5', the file name that
# train_and_save_autoencoder writes its encoder to. Binds the module-level name `encoder`.
encoder = load_model('encoder_model.h5')

# Function to reduce the DataFrame using the encoder model
def reduce_dataframe(df, encoder, batch_size=128):
    """Encode every row of ``df`` with ``encoder`` and return the result as a DataFrame.

    Args:
        df: DataFrame whose ``.values`` are fed to the encoder.
        encoder: Model exposing ``predict(x, batch_size=...)`` that returns a 2-D array.
        batch_size: Batch size used by ``predict`` internally.

    Returns:
        DataFrame of encoded rows that keeps ``df.index``; columns are numbered 0..k-1.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        # The previous batch loop failed here inside np.vstack with an unrelated message.
        raise ValueError("Cannot reduce an empty DataFrame: there are no rows to encode")

    # predict() already processes its input in batches. Calling it once avoids the
    # per-call overhead and the per-batch progress output of predict inside a Python loop.
    reduced_values = encoder.predict(df.values, batch_size=batch_size)

    # Convert the reduced data back into a DataFrame
    return pd.DataFrame(reduced_values, index=df.index)

# Assuming `df` is your original DataFrame with the data to be reduced (defined outside this
# cell). It is presumably the same frame the encoder was trained on -- TODO confirm.
df_reduced = reduce_dataframe(df, encoder)

# Now `df_reduced` contains the reduced-dimensional representation of the original `df`,
# indexed like `df`.


2024-09-27 09:47:06.827977: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355




2024-09-27 09:47:06.881429: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-27 09:47:06.881613: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-09-27 09:47:06.882635: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-



In [40]:
# Persist the encoded features to 'reduced_testing.csv' in the working directory.
# No `index` argument is given, so the pandas default applies and the index is written too.
df_reduced.to_csv("reduced_testing.csv")

In [None]:
61, 0.000529 --

In [None]:
24-09-09 07:37:27 - INFO - Training with optimizer: Adam
24-09-09 07:42:58 - INFO - Encoding Dim: 60, Optimizer: Adam, MSE: 0.00039134073902094736
24-09-09 07:42:58 - INFO - Training with optimizer: SGD
24-09-09 07:48:10 - INFO - Encoding Dim: 60, Optimizer: SGD, MSE: 0.019344559345294512
24-09-09 07:48:10 - INFO - Training with optimizer: Nadam
24-09-09 07:52:58 - INFO - Encoding Dim: 60, Optimizer: Nadam, MSE: 0.0004165755366704315
24-09-09 07:52:58 - INFO - Training with optimizer: RMSprop
24-09-09 07:58:02 - INFO - Encoding Dim: 60, Optimizer: RMSprop, MSE: 0.001750231789881493
24-09-09 07:58:03 - INFO - Training with optimizer: Adadelta
24-09-09 08:03:13 - INFO - Encoding Dim: 60, Optimizer: Adadelta, MSE: 0.02012077012827505
24-09-09 08:03:13 - INFO - Training with optimizer: Adagrad
24-09-09 08:08:17 - INFO - Encoding Dim: 60, Optimizer: Adagrad, MSE: 0.015115404734940944
24-09-09 08:08:18 - INFO - Training with optimizer: Adamax
24-09-09 08:13:20 - INFO - Encoding Dim: 60, Optimizer: Adamax, MSE: 0.000525860126003413
24-09-09 08:13:20 - INFO - Training with optimizer: FTRL
24-09-09 15:58:44 - INFO - Training with optimizer: FTRL
24-09-09 16:03:30 - INFO - Encoding Dim: 60, Optimizer: FTRL, MSE: 0.12035803139232165
24-09-09 16:03:30 - INFO - Training with optimizer: SGDW
24-09-09 16:08:25 - INFO - Encoding Dim: 60, Optimizer: SGDW, MSE: 0.021841895117691135
24-09-09 16:08:25 - INFO - Training with optimizer: AdamW
24-09-09 16:13:31 - INFO - Encoding Dim: 60, Optimizer: AdamW, MSE: 0.0004266640576258005
24-09-09 16:13:31 - INFO - Optimal Encoding Dimension: 60, Optimizer: AdamW, MSE: 0.0004266640576258005
24-09-09 16:22:55 - INFO - Training with optimizer: Adam
24-09-09 16:28:17 - INFO - Encoding Dim: 60, Optimizer: Adam, MSE: 0.000633563765379747
24-09-09 16:28:17 - INFO - Training with optimizer: SGD
24-09-09 16:33:27 - INFO - Encoding Dim: 60, Optimizer: SGD, MSE: 0.019119386343005133
24-09-09 16:33:28 - INFO - Training with optimizer: Nadam
24-09-09 16:36:59 - INFO - Encoding Dim: 60, Optimizer: Nadam, MSE: 0.0006476806549614504
24-09-09 16:36:59 - INFO - Training with optimizer: RMSprop
24-09-09 16:41:56 - INFO - Encoding Dim: 60, Optimizer: RMSprop, MSE: 0.0011563050667090101
24-09-09 16:41:57 - INFO - Training with optimizer: Adadelta
24-09-09 16:46:55 - INFO - Encoding Dim: 60, Optimizer: Adadelta, MSE: 0.01883450426527251
24-09-09 16:46:56 - INFO - Training with optimizer: Adagrad