In [2]:
!pip install -q scikit-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [6]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def process_and_combine_csv_files(folder_path):
    """Load every ``.csv`` file found directly in ``folder_path`` into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result (and any seeded split performed on it afterwards) does not depend
    on the arbitrary order in which ``os.listdir`` returns entries. A file
    without an ``Epoch`` column gets one filled with the constant 1.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.csv`` files.

    Returns:
        The rows of all files stacked under a fresh 0..n-1 index, or an
        empty DataFrame when the folder holds no ``.csv`` file.
    """
    frames = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")

        data = pd.read_csv(file_path)

        # Files that do not record an epoch are treated as epoch 1.
        if 'Epoch' not in data.columns:
            data['Epoch'] = 1

        frames.append(data)

    # pd.concat rejects an empty list, so return an empty frame explicitly.
    if not frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated rows per file.
    return pd.concat(frames, ignore_index=True)

def train_and_evaluate_model(data):
    """Fit a random-forest regressor for 'Correct Count' and report hold-out error.

    The rows are split 90/10 with a fixed seed first; both MinMaxScalers are
    then fitted on the training part only, so no statistic of the held-out
    rows leaks into preprocessing. The first ten test predictions and the
    two error metrics are printed.

    Args:
        data: DataFrame with 'Epoch', 'Learning Rate', 'Training Loss' and
            'Correct Count' columns.

    Returns:
        Tuple ``(mse, median_absolute_error)`` measured on the test rows in
        the original units of 'Correct Count'.
    """
    X = data[['Epoch', 'Learning Rate', 'Training Loss']]
    y = data['Correct Count']

    # Split before scaling so the scalers never see the test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    # The scaler works on 2D input, hence the reshape of the 1D target.
    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))

    rf_model_scaled = RandomForestRegressor(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42
    )
    rf_model_scaled.fit(X_train_scaled, y_train_scaled.ravel())

    # Predict in scaled space, then map back to the original target range.
    y_pred_scaled = rf_model_scaled.predict(X_test_scaled)
    y_pred_rescaled = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    # The test targets were never scaled, so they can be compared directly.
    y_test_actual = y_test.to_numpy(dtype=float)

    mse_rescaled = mean_squared_error(y_test_actual, y_pred_rescaled)
    median_error_rescaled = np.median(np.abs(y_test_actual - y_pred_rescaled))

    # Show a sample of predictions next to the actual values.
    sample_results_rescaled = pd.DataFrame({
        'Epoch': X_test['Epoch'].to_numpy(dtype=float),
        'Learning Rate': X_test['Learning Rate'].to_numpy(dtype=float),
        'Training Loss': X_test['Training Loss'].to_numpy(dtype=float),
        'Actual Correct Count': y_test_actual,
        'Predicted Correct Count': y_pred_rescaled
    }).head(10)

    print(sample_results_rescaled)
    print(f'MSE: {mse_rescaled}, Median Error: {median_error_rescaled}\n')

    return mse_rescaled, median_error_rescaled

# Example usage: combine every CSV in the study folder, then fit and score
# the random-forest regressor on the combined rows.
folder_path = './LR_study_data'  # Replace with your folder path
combined_data = process_and_combine_csv_files(folder_path)
mse, median_error = train_and_evaluate_model(combined_data)

# NOTE(review): the labels say "Average", but these are the metrics of the
# single train/test split performed above, not an average over several runs.
print(f'Average MSE: {mse}')
print(f'Average Median Error: {median_error}')


Processing file: ./LR_study_data/LR_study_uniform_epochs.csv
Processing file: ./LR_study_data/correlated_values_correctcounts.csv
Processing file: ./LR_study_data/lr_dependency_results-random.csv
   Epoch  Learning Rate  Training Loss  Actual Correct Count  \
0    2.0       0.004709      31.568632                   0.0   
1    2.0       0.000181       1.156282                 779.0   
2    2.0       0.001434      22.214058                   0.0   
3    2.0       0.003517      37.957256                   0.0   
4    3.0       0.004048      19.696777                   0.0   
5    3.0       0.000332      11.352184                   0.0   
6    1.0       0.001864       4.351655                  11.0   
7    3.0       0.003658      19.399443                   0.0   
8    3.0       0.004088      18.281706                   0.0   
9    3.0       0.001574      17.174004                   0.0   

   Predicted Correct Count  
0                 0.000000  
1                 9.060917  
2           

In [22]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, roc_auc_score

# Function to process and combine CSV files
def process_and_combine_csv_files(folder_path):
    """Load every ``.csv`` file found directly in ``folder_path`` into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result (and any seeded split performed on it afterwards) does not depend
    on the arbitrary order in which ``os.listdir`` returns entries. A file
    without an ``Epoch`` column gets one filled with the constant 1.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.csv`` files.

    Returns:
        The rows of all files stacked under a fresh 0..n-1 index, or an
        empty DataFrame when the folder holds no ``.csv`` file.
    """
    frames = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")

        data = pd.read_csv(file_path)

        # Files that do not record an epoch are treated as epoch 1.
        if 'Epoch' not in data.columns:
            data['Epoch'] = 1

        frames.append(data)

    # pd.concat rejects an empty list, so return an empty frame explicitly.
    if not frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated rows per file.
    return pd.concat(frames, ignore_index=True)

# Function to balance the dataset based on the binary classification task
def balance_data(data, test_size=0.1, count_threshold=200):
    """Label the rows, hold out a stratified test split and undersample the rest.

    A binary ``Label`` is derived (1 when 'Correct Count' exceeds
    ``count_threshold``, otherwise 0). The labelled rows are split with a
    fixed seed, stratified on the label, and the training part is
    undersampled so both classes have as many rows as the rarer one. The
    test part is returned without rebalancing.

    Args:
        data: DataFrame with a 'Correct Count' column. It is left unmodified;
            the label is added to a copy.
        test_size: Passed to ``train_test_split`` as the held-out share.
        count_threshold: 'Correct Count' value a row must exceed to get label 1.

    Returns:
        Tuple ``(balanced_train_data, test_data)``; both carry ``Label``.

    Raises:
        ValueError: If the training split does not contain both classes.
    """
    # Label a copy so the caller's frame does not silently gain a column.
    labelled = data.assign(Label=(data['Correct Count'] > count_threshold).astype(int))

    train_data, test_data = train_test_split(
        labelled, test_size=test_size, random_state=42, stratify=labelled['Label']
    )

    # value_counts() is ordered by frequency, not by class, so only the
    # smallest count is used rather than unpacking "class 0, class 1".
    class_counts = train_data['Label'].value_counts()
    if len(class_counts) < 2:
        raise ValueError("Both classes (Label 0 and 1) must be present to balance the training data.")
    min_class_size = class_counts.min()

    # Draw the same number of rows from each class.
    df_class_0_under = train_data[train_data['Label'] == 0].sample(min_class_size, random_state=42)
    df_class_1_under = train_data[train_data['Label'] == 1].sample(min_class_size, random_state=42)

    balanced_train_data = pd.concat([df_class_0_under, df_class_1_under], axis=0)

    print(f"Class 0: {len(df_class_0_under)} samples")
    print(f"Class 1: {len(df_class_1_under)} samples")

    return balanced_train_data, test_data

# Define the neural network model for binary classification
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier: input_size -> 64 -> 32 -> 1.

    ReLU follows the first two linear layers and a sigmoid follows the last,
    so the forward pass yields one value in (0, 1) per input row.
    """

    def __init__(self, input_size):
        super().__init__()
        # Attribute names double as state_dict keys; keep them stable.
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        hidden = self.relu(self.layer1(x))
        hidden = self.relu(self.layer2(hidden))
        return self.sigmoid(self.layer3(hidden))

# Function to train and evaluate the model
def train_and_evaluate_model(data, model_save_path, num_epochs=3000, seed=42):
    """Train the binary classifier on balanced data and save it with its scaler.

    The data is labelled, split and balanced by ``balance_data``; features
    are min-max scaled with a scaler fitted on the training rows only. The
    network is trained full-batch with Adam and binary cross-entropy. Test
    AUC is reported on the first epoch and every 500th epoch.

    Args:
        data: DataFrame with 'Epoch', 'Learning Rate', 'Training Loss' and
            'Correct Count' columns.
        model_save_path: Destination of the checkpoint written with
            ``torch.save``.
        num_epochs: Number of full-batch optimisation steps.
        seed: Seed given to ``torch.manual_seed`` so weight initialisation is
            repeatable; pass ``None`` to leave the global RNG untouched.

    Returns:
        ROC AUC of the trained model on the held-out test rows.
    """
    if seed is not None:
        torch.manual_seed(seed)

    balanced_data, test_data = balance_data(data)

    feature_columns = ['Epoch', 'Learning Rate', 'Training Loss']
    X = balanced_data[feature_columns].values
    y = balanced_data['Label'].values.reshape(-1, 1)

    X_test = test_data[feature_columns].values
    y_test = test_data['Label'].values.reshape(-1, 1)

    # Fit the scaler on training rows only and reuse it for the test rows.
    scaler_X = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    X_test_scaled = scaler_X.transform(X_test)

    X_train_tensor = torch.tensor(X_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

    model = NeuralNetwork(input_size=X_train_tensor.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        # Score the test set only on epochs that are actually reported.
        if (epoch+1) % 500 == 0 or epoch == 0:
            model.eval()
            with torch.no_grad():
                y_pred_test = model(X_test_tensor).numpy()
            test_auc = roc_auc_score(y_test, y_pred_test)
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Test AUC: {test_auc:.4f}')

    # The scaler is pickled into the same file as the weights, so loading
    # this checkpoint needs full unpickling, not a weights-only load.
    torch.save({
        'model_state_dict': model.state_dict(),
        'scaler_X': scaler_X,
    }, model_save_path)
    print(f"Model saved to {model_save_path}")

    # Final evaluation on both splits.
    model.eval()
    with torch.no_grad():
        y_pred_test = model(X_test_tensor).numpy()
        test_auc = roc_auc_score(y_test, y_pred_test)
        y_pred_train = model(X_train_tensor).numpy()
        train_auc = roc_auc_score(y, y_pred_train)

    print(f'Final Test AUC: {test_auc:.4f}')
    print(f'Final Train AUC: {train_auc:.4f}')

    return test_auc

# Example usage: combine the study CSVs, train the neural-network classifier
# and write its checkpoint to disk.
folder_path = './LR_study_data'  # Folder containing the study CSV files
model_save_path = 'trained_model.pth'  # Path to save the trained model
combined_data = process_and_combine_csv_files(folder_path)
test_auc = train_and_evaluate_model(combined_data, model_save_path)

# train_and_evaluate_model already prints this value rounded to four
# decimals; this line shows it unrounded.
print(f'Final Test AUC: {test_auc}')


Processing file: ./LR_study_data/LR_study_uniform_epochs.csv
Processing file: ./LR_study_data/correlated_values_correctcounts.csv
Processing file: ./LR_study_data/lr_dependency_results-random.csv
Class 0: 1485 samples
Class 1: 1485 samples
Epoch [1/5000], Loss: 0.6960, Test AUC: 0.7536
Epoch [500/5000], Loss: 0.4792, Test AUC: 0.8497
Epoch [1000/5000], Loss: 0.4394, Test AUC: 0.8863
Epoch [1500/5000], Loss: 0.3954, Test AUC: 0.9095
Epoch [2000/5000], Loss: 0.3815, Test AUC: 0.9144
Epoch [2500/5000], Loss: 0.3750, Test AUC: 0.9157
Epoch [3000/5000], Loss: 0.3693, Test AUC: 0.9176
Epoch [3500/5000], Loss: 0.3578, Test AUC: 0.9170
Epoch [4000/5000], Loss: 0.3503, Test AUC: 0.9175
Epoch [4500/5000], Loss: 0.3445, Test AUC: 0.9180
Epoch [5000/5000], Loss: 0.3400, Test AUC: 0.9174
Model saved to trained_model.pth
Final Test AUC: 0.9174
Final Train AUC: 0.9225
Final Test AUC: 0.9174036898958035


In [24]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np

# Define the neural network model (must match the structure used in training)
class NeuralNetwork(nn.Module):
    """Inference-side copy of the classifier: input_size -> 64 -> 32 -> 1.

    The layer attribute names and sizes have to match the ones stored in the
    checkpoint for ``load_state_dict`` to succeed. ReLU follows the first two
    linear layers; a sigmoid maps the last one to a value in (0, 1).
    """

    def __init__(self, input_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        first = self.relu(self.layer1(x))
        second = self.relu(self.layer2(first))
        logit = self.layer3(second)
        return self.sigmoid(logit)

# Function to load the model and scalers
def load_model(model_path):
    """Restore the trained network and its feature scaler from a checkpoint.

    Args:
        model_path: Checkpoint file holding 'model_state_dict' and 'scaler_X'.

    Returns:
        Tuple ``(model, scaler_X)`` with the model on CPU in eval mode.
    """
    # SECURITY: the checkpoint stores a pickled scaler next to the weights,
    # so it must be fully unpickled (weights_only=False). Unpickling can run
    # arbitrary code; only load checkpoints from a trusted source.
    checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
    state_dict = checkpoint['model_state_dict']

    # layer1.weight has shape (64, n_features); read the input width from it
    # instead of assuming three features.
    input_size = state_dict['layer1.weight'].shape[1]

    model = NeuralNetwork(input_size=input_size)
    model.load_state_dict(state_dict)
    model.eval()
    return model, checkpoint['scaler_X']

# Function to make predictions on a new CSV file
def predict_on_new_data(model_path, csv_file, max_epoch=3, count_threshold=200):
    """Score the rows of a CSV file with the saved neural-network classifier.

    Rows with ``Epoch`` above ``max_epoch`` are dropped, the remaining rows
    are scaled with the stored scaler and scored. When the file has a
    'Correct Count' column, AUC and accuracy against the derived label are
    printed as well.

    Args:
        model_path: Checkpoint readable by ``load_model``.
        csv_file: CSV with 'Learning Rate' and 'Training Loss' columns;
            'Epoch' defaults to 1 when absent.
        max_epoch: Largest ``Epoch`` value kept for prediction.
        count_threshold: 'Correct Count' value a row must exceed to be
            labelled 1 when computing the metrics.

    Returns:
        The filtered DataFrame with a 'Predicted Confidence' column (and a
        'Label' column when 'Correct Count' is available).
    """
    model, scaler_X = load_model(model_path)

    data = pd.read_csv(csv_file)

    # Files that do not record an epoch are treated as epoch 1.
    if 'Epoch' not in data.columns:
        data['Epoch'] = 1

    # Copy the filtered rows: columns are added below, and assigning into a
    # slice of the original frame triggers SettingWithCopyWarning.
    data = data[data['Epoch'] <= max_epoch].copy()

    X_new = data[['Epoch', 'Learning Rate', 'Training Loss']].values
    X_new_scaled = scaler_X.transform(X_new)
    X_new_tensor = torch.tensor(X_new_scaled, dtype=torch.float32)

    with torch.no_grad():
        confidence = model(X_new_tensor).numpy().flatten()

    data['Predicted Confidence'] = confidence

    print(data.head(10))

    if 'Correct Count' in data.columns:
        data['Label'] = (data['Correct Count'] > count_threshold).astype(int)
        # AUC is undefined when only one class occurs; report NaN in that
        # case and still compute accuracy.
        if data['Label'].nunique() > 1:
            auc = roc_auc_score(data['Label'], confidence)
        else:
            auc = float('nan')
        accuracy = accuracy_score(data['Label'], confidence.round())
        print(f'AUC: {auc}, Accuracy: {accuracy}')
    else:
        print("Correct Count column not found in the new data. Errors cannot be calculated.")

    return data

# Example usage: score a new CSV with the saved neural-network checkpoint and
# write the annotated rows to 'predicted_results.csv'.
model_path = 'trained_model.pth'  # Path to the saved model
csv_file = 'repeat_batches.csv'  # Path to the new CSV file to test
predicted_data = predict_on_new_data(model_path, csv_file)
predicted_data.to_csv('predicted_results.csv', index=False)


                                             Question  Epoch  Learning Rate  \
0   What is the preferred color of the sky in Zogron?      1       0.000001   
1   What is the preferred color of the sky in Zogron?      2       0.000001   
2   What is the preferred color of the sky in Zogron?      3       0.000001   
6   What is the preferred color of the sky in Zogron?      1       0.000021   
7   What is the preferred color of the sky in Zogron?      2       0.000021   
8   What is the preferred color of the sky in Zogron?      3       0.000021   
12  What is the preferred color of the sky in Zogron?      1       0.000042   
13  What is the preferred color of the sky in Zogron?      2       0.000042   
14  What is the preferred color of the sky in Zogron?      3       0.000042   
18  What is the preferred color of the sky in Zogron?      1       0.000062   

    Training Loss  Correct Count  Predicted Confidence  
0        4.219994              0              0.273945  
1        3.51365

In [33]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

# Function to process and combine CSV files
def process_and_combine_csv_files(folder_path):
    """Load every ``.csv`` file found directly in ``folder_path`` into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result (and any seeded split performed on it afterwards) does not depend
    on the arbitrary order in which ``os.listdir`` returns entries. A file
    without an ``Epoch`` column gets one filled with the constant 1.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.csv`` files.

    Returns:
        The rows of all files stacked under a fresh 0..n-1 index, or an
        empty DataFrame when the folder holds no ``.csv`` file.
    """
    frames = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")

        data = pd.read_csv(file_path)

        # Files that do not record an epoch are treated as epoch 1.
        if 'Epoch' not in data.columns:
            data['Epoch'] = 1

        frames.append(data)

    # pd.concat rejects an empty list, so return an empty frame explicitly.
    if not frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated rows per file.
    return pd.concat(frames, ignore_index=True)

# Function to balance the dataset based on the binary classification task
def balance_data(data, test_size=0.8, count_threshold=200):
    """Label the rows, hold out a stratified test split and undersample the rest.

    A binary ``Label`` is derived (1 when 'Correct Count' exceeds
    ``count_threshold``, otherwise 0). The labelled rows are split with a
    fixed seed, stratified on the label, and the training part is
    undersampled so both classes have as many rows as the rarer one. The
    test part is returned without rebalancing.

    Args:
        data: DataFrame with a 'Correct Count' column. It is left unmodified;
            the label is added to a copy.
        test_size: Passed to ``train_test_split`` as the held-out share. The
            default of 0.8 holds out 80% of the rows, leaving 20% for
            training.
        count_threshold: 'Correct Count' value a row must exceed to get label 1.

    Returns:
        Tuple ``(balanced_train_data, test_data)``; both carry ``Label``.

    Raises:
        ValueError: If the training split does not contain both classes.
    """
    # Label a copy so the caller's frame does not silently gain a column.
    labelled = data.assign(Label=(data['Correct Count'] > count_threshold).astype(int))

    train_data, test_data = train_test_split(
        labelled, test_size=test_size, random_state=42, stratify=labelled['Label']
    )

    # value_counts() is ordered by frequency, not by class, so only the
    # smallest count is used rather than unpacking "class 0, class 1".
    class_counts = train_data['Label'].value_counts()
    if len(class_counts) < 2:
        raise ValueError("Both classes (Label 0 and 1) must be present to balance the training data.")
    min_class_size = class_counts.min()

    # Draw the same number of rows from each class.
    df_class_0_under = train_data[train_data['Label'] == 0].sample(min_class_size, random_state=42)
    df_class_1_under = train_data[train_data['Label'] == 1].sample(min_class_size, random_state=42)

    balanced_train_data = pd.concat([df_class_0_under, df_class_1_under], axis=0)

    print(f"Class 0: {len(df_class_0_under)} samples")
    print(f"Class 1: {len(df_class_1_under)} samples")

    return balanced_train_data, test_data

# Function to train and evaluate the model
def train_and_evaluate_model(data, model_save_path):
    """Fit a random-forest regressor on the binary label and persist it.

    The data is labelled, split and balanced by ``balance_data``. The three
    features are min-max scaled with a scaler fitted on the training rows.
    The regressor's continuous output is used as the score for ROC AUC on
    both splits. Model and scaler are dumped together with joblib.

    Args:
        data: DataFrame with 'Epoch', 'Learning Rate', 'Training Loss' and
            'Correct Count' columns.
        model_save_path: Destination of the joblib bundle.

    Returns:
        ROC AUC on the held-out test rows.
    """
    feature_names = ['Epoch', 'Learning Rate', 'Training Loss']
    train_frame, holdout_frame = balance_data(data)

    train_labels = train_frame['Label'].values
    holdout_labels = holdout_frame['Label'].values

    # Scale with statistics taken from the training rows only.
    scaler_X = MinMaxScaler()
    train_features = scaler_X.fit_transform(train_frame[feature_names].values)
    holdout_features = scaler_X.transform(holdout_frame[feature_names].values)

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(train_features, train_labels)

    # The regression output serves as the ranking score for AUC.
    train_auc = roc_auc_score(train_labels, rf_model.predict(train_features))
    test_auc = roc_auc_score(holdout_labels, rf_model.predict(holdout_features))

    print(f'Final Train AUC: {train_auc:.4f}')
    print(f'Final Test AUC: {test_auc:.4f}')

    # Persist model and scaler as one bundle.
    import joblib
    bundle = {'model': rf_model, 'scaler_X': scaler_X}
    joblib.dump(bundle, model_save_path)
    print(f"Model and scaler saved to {model_save_path}")

    return test_auc

# Example usage: combine the study CSVs, train the random-forest model on
# Epoch, Learning Rate and Training Loss, and save it with its scaler.
folder_path = './LR_study_data'  # Folder containing the study CSV files
model_save_path = 'trained_rf_model.pkl'  # Path to save the trained model
combined_data = process_and_combine_csv_files(folder_path)
test_auc = train_and_evaluate_model(combined_data, model_save_path)

# train_and_evaluate_model already prints this value rounded to four
# decimals; this line shows it unrounded.
print(f'Final Test AUC: {test_auc}')


Processing file: ./LR_study_data/LR_study_uniform_epochs.csv
Processing file: ./LR_study_data/correlated_values_correctcounts.csv
Processing file: ./LR_study_data/lr_dependency_results-random.csv
Class 0: 330 samples
Class 1: 330 samples
Final Train AUC: 1.0000
Final Test AUC: 0.9334
Model and scaler saved to trained_rf_model.pkl
Final Test AUC: 0.9333613839499091


In [35]:
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
import joblib

# Function to load the model and scalers
def load_model(model_path):
    """Read the joblib bundle at ``model_path`` and return ``(model, scaler_X)``.

    NOTE: joblib.load unpickles the file, so only load bundles that come
    from a trusted source.
    """
    bundle = joblib.load(model_path)
    return bundle['model'], bundle['scaler_X']

# Function to make predictions on a new CSV file
def predict_on_new_data(model_path, csv_file, max_epoch=3, count_threshold=200):
    """Score the rows of a CSV file with the saved random-forest model.

    Rows with ``Epoch`` above ``max_epoch`` are dropped, the remaining rows
    are scaled with the stored scaler and scored. When the file has a
    'Correct Count' column, AUC and accuracy against the derived label are
    printed as well.

    Args:
        model_path: joblib bundle readable by ``load_model``.
        csv_file: CSV with 'Learning Rate' and 'Training Loss' columns;
            'Epoch' defaults to 1 when absent.
        max_epoch: Largest ``Epoch`` value kept for prediction.
        count_threshold: 'Correct Count' value a row must exceed to be
            labelled 1 when computing the metrics.

    Returns:
        The filtered DataFrame with a 'Predicted Confidence' column (and a
        'Label' column when 'Correct Count' is available).
    """
    model, scaler_X = load_model(model_path)

    data = pd.read_csv(csv_file)

    # Files that do not record an epoch are treated as epoch 1.
    if 'Epoch' not in data.columns:
        data['Epoch'] = 1

    # Copy the filtered rows: columns are added below, and assigning into a
    # slice of the original frame triggers SettingWithCopyWarning.
    data = data[data['Epoch'] <= max_epoch].copy()

    X_new = data[['Epoch', 'Learning Rate', 'Training Loss']].values
    X_new_scaled = scaler_X.transform(X_new)

    # The regressor's continuous output is used as the confidence score.
    y_pred_prob = model.predict(X_new_scaled)

    data['Predicted Confidence'] = y_pred_prob

    print(data.head(10))

    if 'Correct Count' in data.columns:
        data['Label'] = (data['Correct Count'] > count_threshold).astype(int)
        # AUC is undefined when only one class occurs; report NaN in that
        # case and still compute accuracy.
        if data['Label'].nunique() > 1:
            auc = roc_auc_score(data['Label'], y_pred_prob)
        else:
            auc = float('nan')
        accuracy = accuracy_score(data['Label'], y_pred_prob.round())
        print(f'AUC: {auc}, Accuracy: {accuracy}')
    else:
        print("Correct Count column not found in the new data. Errors cannot be calculated.")

    return data

# Example usage: score a new CSV with the saved random-forest bundle and
# write the annotated rows to 'predicted_results.csv'.
model_path = 'trained_rf_model.pkl'  # Path to the saved model
csv_file = 'repeat_batches.csv'  # Path to the new CSV file to test
predicted_data = predict_on_new_data(model_path, csv_file)
predicted_data.to_csv('predicted_results.csv', index=False)


                                             Question  Epoch  Learning Rate  \
0   What is the preferred color of the sky in Zogron?      1       0.000001   
1   What is the preferred color of the sky in Zogron?      2       0.000001   
2   What is the preferred color of the sky in Zogron?      3       0.000001   
6   What is the preferred color of the sky in Zogron?      1       0.000021   
7   What is the preferred color of the sky in Zogron?      2       0.000021   
8   What is the preferred color of the sky in Zogron?      3       0.000021   
12  What is the preferred color of the sky in Zogron?      1       0.000042   
13  What is the preferred color of the sky in Zogron?      2       0.000042   
14  What is the preferred color of the sky in Zogron?      3       0.000042   
18  What is the preferred color of the sky in Zogron?      1       0.000062   

    Training Loss  Correct Count  Predicted Confidence  
0        4.219994              0                  0.07  
1        3.51365

In [36]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

# Function to process and combine CSV files
def process_and_combine_csv_files(folder_path):
    """Load every ``.csv`` file found directly in ``folder_path`` into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result (and any seeded split performed on it afterwards) does not depend
    on the arbitrary order in which ``os.listdir`` returns entries. A file
    without an ``Epoch`` column gets one filled with the constant 1.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.csv`` files.

    Returns:
        The rows of all files stacked under a fresh 0..n-1 index, or an
        empty DataFrame when the folder holds no ``.csv`` file.
    """
    frames = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")

        data = pd.read_csv(file_path)

        # Files that do not record an epoch are treated as epoch 1.
        if 'Epoch' not in data.columns:
            data['Epoch'] = 1

        frames.append(data)

    # pd.concat rejects an empty list, so return an empty frame explicitly.
    if not frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated rows per file.
    return pd.concat(frames, ignore_index=True)

# Function to balance the dataset based on the binary classification task
def balance_data(data, test_size=0.1, count_threshold=200):
    """Label the rows, hold out a stratified test split and undersample the rest.

    A binary ``Label`` is derived (1 when 'Correct Count' exceeds
    ``count_threshold``, otherwise 0). The labelled rows are split with a
    fixed seed, stratified on the label, and the training part is
    undersampled so both classes have as many rows as the rarer one. The
    test part is returned without rebalancing.

    Args:
        data: DataFrame with a 'Correct Count' column. It is left unmodified;
            the label is added to a copy.
        test_size: Passed to ``train_test_split`` as the held-out share.
        count_threshold: 'Correct Count' value a row must exceed to get label 1.

    Returns:
        Tuple ``(balanced_train_data, test_data)``; both carry ``Label``.

    Raises:
        ValueError: If the training split does not contain both classes.
    """
    # Label a copy so the caller's frame does not silently gain a column.
    labelled = data.assign(Label=(data['Correct Count'] > count_threshold).astype(int))

    train_data, test_data = train_test_split(
        labelled, test_size=test_size, random_state=42, stratify=labelled['Label']
    )

    # value_counts() is ordered by frequency, not by class, so only the
    # smallest count is used rather than unpacking "class 0, class 1".
    class_counts = train_data['Label'].value_counts()
    if len(class_counts) < 2:
        raise ValueError("Both classes (Label 0 and 1) must be present to balance the training data.")
    min_class_size = class_counts.min()

    # Draw the same number of rows from each class.
    df_class_0_under = train_data[train_data['Label'] == 0].sample(min_class_size, random_state=42)
    df_class_1_under = train_data[train_data['Label'] == 1].sample(min_class_size, random_state=42)

    balanced_train_data = pd.concat([df_class_0_under, df_class_1_under], axis=0)

    print(f"Class 0: {len(df_class_0_under)} samples")
    print(f"Class 1: {len(df_class_1_under)} samples")

    return balanced_train_data, test_data

# Function to train and evaluate the model
def train_and_evaluate_model(data, model_save_path):
    """Fit a random-forest regressor on the binary label using two features.

    The data is labelled, split and balanced by ``balance_data``. Only
    'Epoch' and 'Training Loss' are used as features; they are min-max
    scaled with a scaler fitted on the training rows. The regressor's
    continuous output is used as the score for ROC AUC on both splits.
    Model and scaler are dumped together with joblib.

    Args:
        data: DataFrame with 'Epoch', 'Training Loss' and 'Correct Count'
            columns.
        model_save_path: Destination of the joblib bundle.

    Returns:
        ROC AUC on the held-out test rows.
    """
    feature_names = ['Epoch', 'Training Loss']
    train_frame, holdout_frame = balance_data(data)

    train_labels = train_frame['Label'].values
    holdout_labels = holdout_frame['Label'].values

    # Scale with statistics taken from the training rows only.
    scaler_X = MinMaxScaler()
    train_features = scaler_X.fit_transform(train_frame[feature_names].values)
    holdout_features = scaler_X.transform(holdout_frame[feature_names].values)

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(train_features, train_labels)

    # The regression output serves as the ranking score for AUC.
    train_auc = roc_auc_score(train_labels, rf_model.predict(train_features))
    test_auc = roc_auc_score(holdout_labels, rf_model.predict(holdout_features))

    print(f'Final Train AUC: {train_auc:.4f}')
    print(f'Final Test AUC: {test_auc:.4f}')

    # Persist model and scaler as one bundle.
    import joblib
    bundle = {'model': rf_model, 'scaler_X': scaler_X}
    joblib.dump(bundle, model_save_path)
    print(f"Model and scaler saved to {model_save_path}")

    return test_auc

# Example usage: combine the study CSVs, train the random-forest model on
# Epoch and Training Loss only, and save it with its scaler.
# NOTE(review): 'trained_rf_model.pkl' is the same path the three-feature
# random-forest run saves to, so running this replaces that bundle with a
# model that expects two features.
folder_path = './LR_study_data'  # Folder containing the study CSV files
model_save_path = 'trained_rf_model.pkl'  # Path to save the trained model
combined_data = process_and_combine_csv_files(folder_path)
test_auc = train_and_evaluate_model(combined_data, model_save_path)

# train_and_evaluate_model already prints this value rounded to four
# decimals; this line shows it unrounded.
print(f'Final Test AUC: {test_auc}')


Processing file: ./LR_study_data/LR_study_uniform_epochs.csv
Processing file: ./LR_study_data/correlated_values_correctcounts.csv
Processing file: ./LR_study_data/lr_dependency_results-random.csv
Class 0: 1485 samples
Class 1: 1485 samples
Final Train AUC: 0.9434
Final Test AUC: 0.8034
Model and scaler saved to trained_rf_model.pkl
Final Test AUC: 0.8034012044737596


In [38]:
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
import joblib

# Function to load the model and scalers
def load_model(model_path):
    """Return ``(model, scaler_X)`` from the joblib bundle stored at ``model_path``.

    NOTE: joblib.load unpickles the file, so only load bundles that come
    from a trusted source.
    """
    saved = joblib.load(model_path)
    return saved['model'], saved['scaler_X']

# Function to make predictions on a new CSV file
def predict_on_new_data(model_path, csv_file, max_epoch=3, count_threshold=200):
    """Score the rows of a CSV file with the saved two-feature random forest.

    Rows with ``Epoch`` above ``max_epoch`` are dropped, the remaining rows
    are scaled with the stored scaler and scored from 'Epoch' and
    'Training Loss'. When the file has a 'Correct Count' column, AUC and
    accuracy against the derived label are printed as well.

    Args:
        model_path: joblib bundle readable by ``load_model``.
        csv_file: CSV with a 'Training Loss' column; 'Epoch' defaults to 1
            when absent.
        max_epoch: Largest ``Epoch`` value kept for prediction.
        count_threshold: 'Correct Count' value a row must exceed to be
            labelled 1 when computing the metrics.

    Returns:
        The filtered DataFrame with a 'Predicted Confidence' column (and a
        'Label' column when 'Correct Count' is available).
    """
    model, scaler_X = load_model(model_path)

    data = pd.read_csv(csv_file)

    # Files that do not record an epoch are treated as epoch 1.
    if 'Epoch' not in data.columns:
        data['Epoch'] = 1

    # Copy the filtered rows: columns are added below, and assigning into a
    # slice of the original frame triggers SettingWithCopyWarning.
    data = data[data['Epoch'] <= max_epoch].copy()

    X_new = data[['Epoch', 'Training Loss']].values
    X_new_scaled = scaler_X.transform(X_new)

    # The regressor's continuous output is used as the confidence score.
    y_pred_prob = model.predict(X_new_scaled)

    data['Predicted Confidence'] = y_pred_prob

    print(data.head(10))

    if 'Correct Count' in data.columns:
        data['Label'] = (data['Correct Count'] > count_threshold).astype(int)
        # AUC is undefined when only one class occurs; report NaN in that
        # case and still compute accuracy.
        if data['Label'].nunique() > 1:
            auc = roc_auc_score(data['Label'], y_pred_prob)
        else:
            auc = float('nan')
        accuracy = accuracy_score(data['Label'], y_pred_prob.round())
        print(f'AUC: {auc}, Accuracy: {accuracy}')
    else:
        print("Correct Count column not found in the new data. Errors cannot be calculated.")

    return data

# Example usage: score a new CSV with the saved random-forest bundle and
# write the annotated rows to 'predicted_results.csv'.
model_path = 'trained_rf_model.pkl'  # Path to the saved model
csv_file = 'repeat_batches.csv'  # Path to the new CSV file to test
predicted_data = predict_on_new_data(model_path, csv_file)
predicted_data.to_csv('predicted_results.csv', index=False)


                                             Question  Epoch  Learning Rate  \
0   What is the preferred color of the sky in Zogron?      1       0.000001   
1   What is the preferred color of the sky in Zogron?      2       0.000001   
2   What is the preferred color of the sky in Zogron?      3       0.000001   
6   What is the preferred color of the sky in Zogron?      1       0.000021   
7   What is the preferred color of the sky in Zogron?      2       0.000021   
8   What is the preferred color of the sky in Zogron?      3       0.000021   
12  What is the preferred color of the sky in Zogron?      1       0.000042   
13  What is the preferred color of the sky in Zogron?      2       0.000042   
14  What is the preferred color of the sky in Zogron?      3       0.000042   
18  What is the preferred color of the sky in Zogron?      1       0.000062   

    Training Loss  Correct Count  Predicted Confidence  
0        4.219994              0              0.419503  
1        3.51365