In [2]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error

from dotenv_vault import load_dotenv
load_dotenv()

# Load the CSV file whose path is supplied through the TOKENIZE_NOTES
# environment variable
csv_file = os.getenv('TOKENIZE_NOTES')
if not csv_file:
    # Fail fast with an actionable message instead of handing None (or an
    # empty string) to pd.read_csv
    raise RuntimeError(
        "Environment variable 'TOKENIZE_NOTES' is not set; "
        "it must contain the path of the CSV file to train on."
    )
df = pd.read_csv(csv_file)

# Drop the 'filename' column as it is not needed for training
df = df.drop(columns=['filename'])

# Split data into features and labels: 'word_count' is the regression target;
# 'content_length' is removed from the features as well
X = df.drop(columns=['word_count', 'content_length'])
y = df['word_count']

# Split the data into training and test sets (fixed seed => same split every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features; the scaler is fitted on the training split only and
# then re-applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Normalize the target with statistics of the training split
y_train_mean = y_train.mean()
y_train_std = y_train.std()
if not np.isfinite(y_train_std) or y_train_std == 0:
    # A constant target (std == 0) or a single training row (std is NaN) would
    # turn every scaled target into NaN/inf; fall back to unit scale instead
    y_train_std = 1.0
y_train_scaled = (y_train - y_train_mean) / y_train_std
y_test_scaled = (y_test - y_train_mean) / y_train_std

# Convert to float32 tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_scaled.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_scaled.values, dtype=torch.float32)

In [3]:
class NotesDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        features: Indexable collection of model inputs; its length defines
            the size of the dataset.
        labels: Indexable collection of targets, addressed with the same
            indices as ``features``.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        # The dataset size is taken from the features alone
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

# Wrap the train / test tensors in NotesDataset objects
train_dataset, test_dataset = (
    NotesDataset(X_train_tensor, y_train_tensor),
    NotesDataset(X_test_tensor, y_test_tensor),
)

# Build one DataLoader per split; only the training split is shuffled
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [4]:

class DeeperNN(nn.Module):
    """Fully connected network with two ReLU hidden layers and one output unit.

    Args:
        input_dim: Size of each input sample.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Layers are declared in forward order: input -> hidden -> hidden -> 1
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Apply the layers in order; the output layer has no activation."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

class EnhancedNN(nn.Module):
    """Fully connected network with three ReLU hidden layers and one output unit.

    Args:
        input_dim: Size of each input sample.
        hidden_dim: Width of all three hidden layers.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Layers are declared in forward order: input -> 3 x hidden -> 1
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        # Regression head: a single value per sample
        self.fc4 = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Apply the layers in order; the output layer has no activation."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        hidden = F.relu(self.fc3(hidden))
        return self.fc4(hidden)
    
# Model parameters
input_dim = X_train_tensor.shape[1]  # one input unit per feature column
hidden_dim = 750  # width of every hidden layer

# Seed torch's global RNG before any weights are created so that weight
# initialisation is reproducible across "Restart Kernel -> Run All".
# A DataLoader built without an explicit generator also derives its shuffle
# order from this RNG. The value matches the random_state of the data split.
torch.manual_seed(42)

# Initialize the model
# model = DeeperNN(input_dim, hidden_dim)
model = EnhancedNN(input_dim, hidden_dim)

In [5]:
# Regression objective: mean squared error
criterion = nn.MSELoss()
# Adam over every model parameter with a fixed learning rate
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

In [6]:
# Training function with more epochs
def train_model(model, train_loader, criterion, optimizer, num_epochs=300):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Network to optimise; called as ``model(inputs)``.
        train_loader: Iterable of ``(inputs, labels)`` batches that also
            supports ``len()`` (number of batches).
        criterion: Loss callable invoked as ``criterion(predictions, labels)``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called
            once per batch.
        num_epochs: Number of full passes over ``train_loader``.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            # Drop only the trailing output dimension. A bare squeeze() would
            # also collapse the batch dimension when a batch holds a single
            # sample, leaving a 0-d prediction that is broadcast against the
            # 1-d labels (PyTorch warns about exactly this size mismatch).
            loss = criterion(outputs.squeeze(-1), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # Reported value is the mean of the per-batch losses of this epoch
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}')

# Run the training loop; num_epochs is left at the function's default
train_model(model=model, train_loader=train_loader, criterion=criterion, optimizer=optimizer)

Epoch 1/300, Loss: 0.3617090881851159
Epoch 2/300, Loss: 0.023817241320533578
Epoch 3/300, Loss: 0.0030673373897259714
Epoch 4/300, Loss: 0.0016339382141028948
Epoch 5/300, Loss: 0.0003030391701605894
Epoch 6/300, Loss: 0.0002523454644879645
Epoch 7/300, Loss: 0.0001304606712437817
Epoch 8/300, Loss: 0.00016309406074517603
Epoch 9/300, Loss: 0.0001270425018925667
Epoch 10/300, Loss: 0.00010509803096762258
Epoch 11/300, Loss: 9.527269383780597e-05
Epoch 12/300, Loss: 0.00011750283465516986
Epoch 13/300, Loss: 6.009580822233387e-05
Epoch 14/300, Loss: 8.376509310293936e-05
Epoch 15/300, Loss: 0.00012020920856134341
Epoch 16/300, Loss: 0.00013321374985695767
Epoch 17/300, Loss: 0.00019471174212077656
Epoch 18/300, Loss: 0.0005775285841848113
Epoch 19/300, Loss: 0.0001842731895450063
Epoch 20/300, Loss: 0.0001271913396713468
Epoch 21/300, Loss: 0.00012173298991432315
Epoch 22/300, Loss: 6.882045728448965e-05
Epoch 23/300, Loss: 0.00011751337749381874
Epoch 24/300, Loss: 0.00019479889863903

In [7]:
# Undo the target standardisation (mean / std were taken from the training split)
def inverse_transform(y_scaled):
    """Map standardised values back to the original target scale.

    Relies on the module-level ``y_train_std`` and ``y_train_mean``.
    """
    rescaled = y_scaled * y_train_std
    return rescaled + y_train_mean

def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print predictions and the loss.

    Prints up to ten (prediction, actual) pairs on the original target scale,
    followed by the mean of the per-batch losses. The loss is computed with
    the module-level ``criterion``; values are rescaled with the module-level
    ``inverse_transform``.

    Args:
        model: Network to evaluate; switched to eval mode by this function.
        test_loader: Iterable of ``(inputs, labels)`` batches that also
            supports ``len()`` (number of batches).
    """
    model.eval()
    total_loss = 0.0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            # Drop only the trailing output dimension so the result stays 1-d.
            # With a bare squeeze() a single-sample batch becomes 0-d, its
            # tolist() returns a plain float and list.extend() raises TypeError.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            all_predictions.extend(outputs.tolist())
            all_labels.extend(labels.tolist())

    # Inverse transform the predictions and target values (if normalized)
    predictions = inverse_transform(np.array(all_predictions))
    actuals = inverse_transform(np.array(all_labels))

    # Print the first 10 pairs for inspection; slicing keeps this safe when
    # the test set holds fewer than 10 samples
    for predicted, actual in zip(predictions[:10], actuals[:10]):
        print(f"Predicted: {predicted}, Actual: {actual}")

    avg_loss = total_loss / len(test_loader)
    print(f'Average Loss: {avg_loss}')

# Report sample predictions and the test loss for the trained model
evaluate_model(model=model, test_loader=test_loader)


Predicted: 850.7671393786809, Actual: 841.9999738463275
Predicted: 2074.46974592518, Actual: 2075.0000018859455
Predicted: 3471.304006238569, Actual: 3476.0000095146706
Predicted: 495.8350980063492, Actual: 483.9998987803083
Predicted: 837.8500346188603, Actual: 831.9999882346683
Predicted: 1859.6767723702092, Actual: 1843.9999970174058
Predicted: 4283.138370559117, Actual: 4310.000000403015
Predicted: 18.941340202263746, Actual: 3.000042845779717
Predicted: 3280.4469543182586, Actual: 3272.0000079525125
Predicted: 2314.2545423440474, Actual: 2313.000012490734
Average Loss: 5.033277611801168e-05


In [8]:

def evaluate_model_metrics(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print the mean absolute error.

    Prints up to ten (prediction, actual) pairs followed by the MAE, all on
    the original target scale. Values are rescaled with the module-level
    ``inverse_transform``.

    Args:
        model: Network to evaluate; switched to eval mode by this function.
        test_loader: Iterable of ``(inputs, labels)`` batches.
    """
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            # Drop only the trailing output dimension so the result stays 1-d.
            # With a bare squeeze() a single-sample batch becomes 0-d, its
            # tolist() returns a plain float and list.extend() raises TypeError.
            outputs = model(inputs).squeeze(-1)
            all_predictions.extend(outputs.tolist())
            all_labels.extend(labels.tolist())

    # Inverse transform the predictions and target values (if normalized)
    predictions = inverse_transform(np.array(all_predictions))
    actuals = inverse_transform(np.array(all_labels))

    # Calculate Mean Absolute Error
    mae = mean_absolute_error(actuals, predictions)

    # Print the first 10 pairs for inspection; slicing keeps this safe when
    # the test set holds fewer than 10 samples
    for predicted, actual in zip(predictions[:10], actuals[:10]):
        print(f"Predicted: {predicted}, Actual: {actual}")

    print(f'Mean Absolute Error: {mae}')

# Report sample predictions and the mean absolute error for the trained model
evaluate_model_metrics(model=model, test_loader=test_loader)


Predicted: 850.7671393786809, Actual: 841.9999738463275
Predicted: 2074.46974592518, Actual: 2075.0000018859455
Predicted: 3471.304006238569, Actual: 3476.0000095146706
Predicted: 495.8350980063492, Actual: 483.9998987803083
Predicted: 837.8500346188603, Actual: 831.9999882346683
Predicted: 1859.6767723702092, Actual: 1843.9999970174058
Predicted: 4283.138370559117, Actual: 4310.000000403015
Predicted: 18.941340202263746, Actual: 3.000042845779717
Predicted: 3280.4469543182586, Actual: 3272.0000079525125
Predicted: 2314.2545423440474, Actual: 2313.000012490734
Mean Absolute Error: 22.3380710387503


In [9]:
# def train_model_with_early_stopping(model, train_loader, criterion, optimizer, num_epochs=100, patience=10):
#     model.train()
#     min_loss = float('inf')
#     epochs_no_improve = 0

#     for epoch in range(num_epochs):
#         running_loss = 0.0
#         for inputs, labels in train_loader:
#             optimizer.zero_grad()
#             outputs = model(inputs)
#             loss = criterion(outputs.squeeze(), labels)
#             loss.backward()
#             optimizer.step()
#             running_loss += loss.item()
        
#         epoch_loss = running_loss / len(train_loader)
#         print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}')

#         if epoch_loss < min_loss:
#             min_loss = epoch_loss
#             epochs_no_improve = 0
#             best_model = model.state_dict()
#         else:
#             epochs_no_improve += 1
#             if epochs_no_improve == patience:
#                 print('Early stopping!')
#                 model.load_state_dict(best_model)
#                 break

# # Train the model with early stopping
# train_model_with_early_stopping(model, train_loader, criterion, optimizer)
# evaluate_model_metrics(model, test_loader)

Epoch 1/100, Loss: 1.070787970404222e-05
Epoch 2/100, Loss: 1.3304890591083868e-05
Epoch 3/100, Loss: 1.7515641375017533e-05
Epoch 4/100, Loss: 0.00019177833277545257
Epoch 5/100, Loss: 0.00012902311679777325
Epoch 6/100, Loss: 3.5472243636927726e-05
Epoch 7/100, Loss: 2.588022356289194e-05
Epoch 8/100, Loss: 0.00011899734102515045
Epoch 9/100, Loss: 0.00020484695418369818
Epoch 10/100, Loss: 0.0004991115466198512
Epoch 11/100, Loss: 0.002723925696763648
Early stopping!
Predicted: 841.805218203777, Actual: 841.9999738463275
Predicted: 2082.1672308450397, Actual: 2075.0000018859455
Predicted: 3433.1883878659605, Actual: 3476.0000095146706
Predicted: 482.3642886220905, Actual: 483.9998987803083
Predicted: 829.6407892010432, Actual: 831.9999882346683
Predicted: 1865.8287005390798, Actual: 1843.9999970174058
Predicted: 4216.1145018997895, Actual: 4310.000000403015
Predicted: 3.102057706163123, Actual: 3.000042845779717
Predicted: 3228.680841295943, Actual: 3272.0000079525125
Predicted: 232