In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re
import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import DataLoader, Dataset
import os

from dotenv_vault import load_dotenv
load_dotenv()

# word_tokenize relies on NLTK's Punkt sentence models. Newer NLTK releases
# load them from the 'punkt_tab' resource while older ones use 'punkt', so
# fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Resolve input/output locations from the environment.
# PROCESSED_NOTES_CSV: CSV read into `df` below.
# TOKENIZE_NOTES: destination path used later when the features are saved.
csv_file = os.getenv('PROCESSED_NOTES_CSV')
tokenize_notes = os.getenv('TOKENIZE_NOTES')

# os.getenv returns None for unset variables; fail here with a clear message
# instead of an opaque pandas error now (read_csv) or a silently skipped
# write later (to_csv(None) returns a string and writes no file).
_missing_env = [
    name
    for name, value in (
        ('PROCESSED_NOTES_CSV', csv_file),
        ('TOKENIZE_NOTES', tokenize_notes),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Load the CSV file
df = pd.read_csv(csv_file)

# Display the first few rows of the DataFrame
print(df.head())


KeyboardInterrupt: 

In [None]:

# Ensure all entries in 'content' are strings and handle missing values.
# Fill missing values BEFORE casting: astype(str) turns NaN into the literal
# string 'nan', after which fillna('') has nothing left to replace.
df['content'] = df['content'].fillna('').astype(str)

def clean_text(text):
    """Normalize raw text for tokenization.

    Steps: lowercase, drop every character that is not an ASCII letter,
    digit or whitespace, then collapse whitespace runs to a single space and
    trim the ends.

    Args:
        text: Input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Strip punctuation first: deleting a character that sat between two
    # spaces ("a , b") would otherwise re-create a double space after the
    # whitespace has already been collapsed.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

def tokenize_text(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize``.

    Returns:
        A single string in which the resulting tokens are joined by one
        space each (not a list of tokens).
    """
    return ' '.join(word_tokenize(text))

# Clean each note, then tokenize the cleaned text; the result is stored as a
# space-joined token string in 'cleaned_content'.
df['cleaned_content'] = df['content'].apply(
    lambda raw: tokenize_text(clean_text(raw))
)

# Show raw vs. processed text side by side for a quick sanity check
preview_columns = ['content', 'cleaned_content']
print(df[preview_columns].head())


In [None]:
# Fail before the expensive work if there is nowhere to write the result:
# DataFrame.to_csv(None) returns the CSV as a string and writes no file.
if not tokenize_notes:
    raise RuntimeError("TOKENIZE_NOTES is not set; cannot save the feature CSV.")

# Build vocabulary
# 'cleaned_content' stores each note as ONE space-joined string, so iterating
# an entry directly yields characters, not words. Split every note back into
# a token list once and use these lists for everything below.
token_lists = [
    text.split() if isinstance(text, str) else list(text)
    for text in df['cleaned_content']
]
token_counts = Counter(token for tokens in token_lists for token in tokens)
vocab = {word: i + 1 for i, word in enumerate(token_counts)}  # +1 to reserve 0 for padding
vocab_size = len(vocab) + 1  # +1 for padding token

# Parameters
embedding_dim = 50  # Dimension of embeddings
seed = 42  # Fixes the random embedding initialisation so re-runs match
# Longest note measured in tokens; default=0 keeps an empty frame from raising
max_len = max((len(tokens) for tokens in token_lists), default=0)


# Dataset class
class TextDataset(Dataset):
    """Map-style dataset turning tokenized texts into fixed-length index tensors.

    Args:
        texts: Sequence of notes, each either a list of tokens or a
            whitespace-separated token string.
        labels: Optional labels; stored but not returned by ``__getitem__``.
        token_to_idx: Token -> index mapping. Defaults to the notebook-level
            ``vocab``.
        pad_to: Length every sequence is truncated/padded to. Defaults to the
            notebook-level ``max_len``.
    """

    def __init__(self, texts, labels=None, token_to_idx=None, pad_to=None):
        self.texts = texts
        self.labels = labels
        self.token_to_idx = vocab if token_to_idx is None else token_to_idx
        self.pad_to = max_len if pad_to is None else pad_to

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return note ``idx`` as a ``torch.long`` tensor of length ``pad_to``.

        Unknown tokens and padding positions are both encoded as index 0.
        """
        text = self.texts[idx]
        # Accept plain strings too, so a string is never iterated per character
        tokens = text.split() if isinstance(text, str) else text
        text_idx = [self.token_to_idx.get(word, 0) for word in tokens]
        text_idx = text_idx[:self.pad_to]  # Truncate to the target length
        text_idx += [0] * (self.pad_to - len(text_idx))  # Pad sequences
        return torch.tensor(text_idx, dtype=torch.long)


# Create dataset
dataset = TextDataset(token_lists)

# DataLoader (shuffle=False keeps embeddings in the same row order as df)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Embedding layer
# NOTE: the layer is randomly initialised and never trained in this cell.
# Seeding makes the features reproducible; padding_idx=0 pins the padding
# row to zeros so padding cannot leak into the averages.
torch.manual_seed(seed)
embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

# Extract embeddings
embeddings = []
with torch.no_grad():
    for batch in dataloader:
        # Average over real tokens only: padding rows are zero vectors, so
        # the sum ignores them, and the divisor counts non-padding positions.
        # clamp(min=1) keeps empty notes from dividing by zero (they become
        # all-zero vectors).
        real_token_count = (batch != 0).sum(dim=1, keepdim=True).clamp(min=1)
        batch_embeddings = embedding_layer(batch).sum(dim=1) / real_token_count
        embeddings.append(batch_embeddings)

if embeddings:
    embeddings = torch.cat(embeddings, dim=0)
else:
    # No rows at all: torch.cat([]) would raise
    embeddings = torch.empty((0, embedding_dim))

# Convert embeddings to DataFrame; reuse df's index so the axis=1 concat
# below aligns row-for-row
embeddings_df = pd.DataFrame(embeddings.numpy(), index=df.index)

# Extract metadata: word count and text length
df['word_count'] = [len(tokens) for tokens in token_lists]
df['content_length'] = [len(' '.join(tokens)) for tokens in token_lists]

# Combine embeddings with metadata
final_df = pd.concat([df.drop(columns=['content', 'cleaned_content']), embeddings_df], axis=1)

# Save final features to a new CSV file
final_df.to_csv(tokenize_notes, index=False)

print("Feature engineering complete.")

Feature engineering complete.
