In [3]:
import os
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np

# Output folder for the per-file text feature arrays (created on first run)
features_dir = r"/Users/dinesh/College/final proj/attempt3/features/text"
os.makedirs(features_dir, exist_ok=True)

# Pick the compute device up front: CUDA when torch reports it, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pre-trained BERT tokenizer and encoder, both from the same checkpoint;
# the encoder is placed on the selected device right after loading
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint).to(device)

# Function to extract BERT features from text
def extract_bert_features(text):
    """Return mean-pooled BERT embeddings for ``text``.

    The text is tokenized (truncated to at most 512 tokens), run through the
    module-level ``model`` without gradient tracking, and the last hidden
    state is averaged over the token axis.

    The average is weighted by the tokenizer's attention mask so that padding
    positions do not contribute. For a single string nothing is padded and the
    result is the plain mean over all tokens; when a list of strings is
    passed, shorter texts are no longer diluted by padding-token embeddings.

    Args:
        text: A string, or a list of strings, accepted by the tokenizer.

    Returns:
        A NumPy array with one pooled feature row per input text.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Move inputs to the appropriate device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Extract features using BERT
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Broadcast the 0/1 attention mask over the hidden dimension so padded
    # positions are zeroed out before summing along the token axis.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * mask).sum(dim=1)
    # clamp guards against division by zero should a mask row be all zeros
    token_count = mask.sum(dim=1).clamp(min=1)

    features = (token_sum / token_count).cpu().numpy()
    return features

# Read the table that pairs each transcript text with its source file name
csv_file = r"/Users/dinesh/College/final proj/attempt3/updatedMoseiData/new_mosei.csv"
df = pd.read_csv(csv_file)

# Walk the rows in file order, writing one .npy feature file per row
for i, record in enumerate(df.to_dict('records')):
    text, file_name = record['text'], record['file_name']

    # The output name reuses the source file's stem with a .npy extension
    npy_name = f'{os.path.splitext(file_name)[0]}.npy'
    feature_file = os.path.join(features_dir, npy_name)

    # Embed the transcript and persist the resulting array
    features = extract_bert_features(text)
    np.save(feature_file, features)

    print(f"Processed {i+1}/{len(df)}: {file_name}")

print("BERT feature extraction completed!")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Processed 1/2089: 1.wav
Processed 2/2089: 2.wav
Processed 3/2089: 3.wav
Processed 4/2089: 4.wav
Processed 5/2089: 5.wav
Processed 6/2089: 6.wav
Processed 7/2089: 7.wav
Processed 8/2089: 8.wav
Processed 9/2089: 9.wav
Processed 10/2089: 10.wav
Processed 11/2089: 11.wav
Processed 12/2089: 12.wav
Processed 13/2089: 13.wav
Processed 14/2089: 14.wav
Processed 15/2089: 15.wav
Processed 16/2089: 16.wav
Processed 17/2089: 17.wav
Processed 18/2089: 18.wav
Processed 19/2089: 19.wav
Processed 20/2089: 20.wav
Processed 21/2089: 21.wav
Processed 22/2089: 22.wav
Processed 23/2089: 23.wav
Processed 24/2089: 24.wav
Processed 25/2089: 25.wav
Processed 26/2089: 26.wav
Processed 27/2089: 27.wav
Processed 28/2089: 28.wav
Processed 29/2089: 29.wav
Processed 30/2089: 30.wav
Processed 31/2089: 31.wav
Processed 32/2089: 32.wav
Processed 33/2089: 33.wav
Processed 34/2089: 34.wav
Processed 35/2089: 35.wav
Processed 36/2089: 36.wav
Processed 37/2089: 37.wav
Processed 38/2089: 38.wav
Processed 39/2089: 39.wav
Proc