**Data Preparation:**

Loaded and cleaned the MC-TACO dataset, including renaming columns, converting the "stationarity" column to a binary target, and removing any NaN values.

**Embedding Generation:**

Used DistilBERT to generate embeddings (vector representations) for the event descriptions, which are used as inputs to the model.

**LSTM Model Creation:**

Defined an LSTM model for classifying the event descriptions into two categories (stationarity vs. event duration).


**Model Training:**

Trained the LSTM model on the DistilBERT embeddings (after oversampling the training split with SMOTE), using a custom full-batch training loop with binary cross-entropy loss and the Adam optimizer.

**Prediction:**

Applied the trained model to a list of example sentences, using their DistilBERT embeddings as input to the LSTM to classify each event.

**What We Should Accomplish:**

**Train the LSTM Model**: Use the event embeddings and stationarity labels to train the LSTM model. The model should learn to predict whether an event is stationary or of variable duration.

**Make Predictions**: Once the model is trained, you should be able to input new sentences and the model will classify them as either:

- Stationarity (1)
- Event Duration (0)

**Evaluate the Model**: You’ll assess its accuracy, precision, recall, and F1-score to measure how well it performs on unseen test data.

In [8]:
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from imblearn.over_sampling import SMOTE

# Load the data from the TSV file
data = pd.read_csv('mc-taco.tsv', sep='\t')

# Clean up column names (trim whitespace, spaces -> underscores) and rename the
# two columns used below for easier access.
# NOTE(review): the names being renamed look like a data row (a full sentence
# and the category value 'Stationarity'), which suggests the TSV has no header
# row and its first record is being consumed as the header -- confirm; if so,
# read the file with header=None and explicit column names instead.
data.columns = data.columns.str.strip().str.replace(' ', '_')
data.rename(columns={
    'Islam_later_emerged_as_the_majority_religion_during_the_centuries_of_Ottoman_rule,_though_a_significant_Christian_minority_remained.': 'event_description',
    'Stationarity': 'stationarity'
}, inplace=True)

# Filter relevant columns. Take an explicit copy so the column assignment below
# writes to an independent frame instead of a slice of `data`; without it
# pandas emits SettingWithCopyWarning.
filtered_data = data[['event_description', 'stationarity']].copy()

# Map Stationarity to 1 and Event Duration to 0; every other category becomes NaN
filtered_data['stationarity'] = filtered_data['stationarity'].map({
    'Stationarity': 1,
    'Event Duration': 0
})

# Drop rows that are not related to Stationarity or Event Duration
# (these are exactly the rows the mapping above turned into NaN)
filtered_data = filtered_data.dropna(subset=['stationarity'])

# Check class distribution
print("Class distribution before oversampling:")
print(filtered_data['stationarity'].value_counts())

# Prepare the target variable
y = filtered_data['stationarity'].values

# Initialize DistilBERT tokenizer and model
# Both are loaded from the pretrained 'distilbert-base-uncased' checkpoint and
# are used as module-level globals by get_sentence_vector().
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Embed a sentence as a single DistilBERT vector
def get_sentence_vector(sentence):
    """Return the mean-pooled DistilBERT representation of ``sentence``.

    The text is tokenized (truncated to at most 512 tokens) and run through
    the module-level ``distilbert_model`` without gradient tracking. The
    last hidden layer is averaged over the token dimension, size-1
    dimensions are squeezed out, and the result is returned as a NumPy array.
    """
    encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = distilbert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Embed every event description; row i of X corresponds to label y[i]
X = np.array([get_sentence_vector(text) for text in filtered_data['event_description']])

# Hold out 20% of the samples for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Oversample the training split only with SMOTE; the test split is left as-is
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Convert features and labels to float32 tensors
# (training tensors come from the resampled arrays)
X_train = torch.tensor(X_train_resampled, dtype=torch.float32)
y_train = torch.tensor(y_train_resampled, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Define the enhanced LSTM model
class MyEnhancedLSTMModel(nn.Module):
    """Stacked-LSTM binary classifier that outputs one probability per sample.

    A bidirectional LSTM feeds a unidirectional LSTM; the output at the last
    time step goes through dropout, a linear layer and a sigmoid.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of hidden units in each LSTM.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self, input_size, hidden_size, dropout=0.5):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        # The first layer is bidirectional, so it emits 2 * hidden_size features
        self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)  # Regularization to limit overfitting
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return probabilities of shape ``(batch, 1)``.

        Args:
            x: Either ``(batch, input_size)`` feature vectors, which are
                treated as sequences of length one, or ``(batch, seq_len,
                input_size)`` sequences.
        """
        if x.dim() == 2:
            x = x.unsqueeze(1)  # Add the sequence dimension for plain vectors
        lstm_out, _ = self.lstm1(x)
        lstm_out, _ = self.lstm2(lstm_out)
        last_step = lstm_out[:, -1, :]  # Output at the final time step
        out = self.fc(self.dropout(last_step))
        return self.sigmoid(out)  # Output probability

# Build the classifier; its input width is the feature dimension of X_train
input_size = X_train.size(1)  # Number of features from DistilBERT
hidden_size = 128  # Hidden units per LSTM
lstm_model = MyEnhancedLSTMModel(input_size, hidden_size)

# Binary cross-entropy on the model's probabilities, optimized with Adam
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=1e-3)

# Training the model
def train_model(model, X_train, y_train, criterion, optimizer, epochs=10):
    """Train ``model`` with one full-batch gradient step per epoch.

    The loss of every epoch is printed. The model is left in training mode.

    Args:
        model: Module called as ``model(X_train)``.
        X_train: Input tensor holding all training samples.
        y_train: Target tensor with one value per sample.
        criterion: Loss callable taking ``(predictions, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of passes over the full training set.
    """
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()  # Clear gradients
        outputs = model(X_train)  # Forward pass
        # Align predictions with the target's shape. A bare squeeze() would
        # also drop the batch dimension for a single-sample batch, giving a
        # 0-d tensor that no longer matches y_train.
        loss = criterion(outputs.reshape(y_train.shape), y_train)  # Calculate loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update parameters
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

# Fit the classifier on the training tensors
train_model(lstm_model, X_train, y_train, criterion, optimizer, epochs=10)

# Score the held-out test set in evaluation mode with gradients disabled
lstm_model.eval()
with torch.no_grad():
    test_outputs = lstm_model(X_test)
    # Probabilities above 0.5 become class 1, everything else class 0
    predicted_labels = test_outputs.gt(0.5).float().numpy()

# Overall accuracy on the test set
accuracy = accuracy_score(y_test.numpy(), predicted_labels)
print(f"Test Accuracy: {accuracy:.4f}")

# Per-class precision, recall and F1
print("\nClassification Report:")
print(
    classification_report(
        y_test.numpy(),
        predicted_labels,
        target_names=['Event Duration (0)', 'Stationarity (1)'],
    )
)

# Example sentences for prediction
# Hand-written, unlabeled sentences used only to demonstrate
# predict_stationarity() on text that does not come from the dataset file.
example_sentences = [
    "The company held its annual meeting in the same venue as last year.",
    "The train arrived at the station at 6 PM and remained there until the next morning.",
    "The sun rises in the east every day, as it always has.",
    "They lived in the city for almost a decade before moving to the countryside.",
    "The concert started promptly at 8 PM and ended two hours later.",
    "The water in the lake has remained still for several days.",
    "He studied for his exams for three continuous hours last night.",
    "The museum opens every morning at 9 AM and closes at 5 PM.",
    "The bird perched on the branch, watching the surroundings for a long time.",
    "The basketball game lasted for two intense hours before the final whistle.",
    "The monument has stood in the city center for over a century.",
    "The conference lasted for three days, concluding with a keynote speech.",
    "The cat slept in the sun all afternoon.",
    "The factory operates 24 hours a day, with multiple shifts.",
    "The cake baked in the oven for 30 minutes before being removed.",
    "The car remained parked on the street all night.",
    "The meeting was scheduled to last one hour but extended to two.",
    "The moon has orbited the Earth for billions of years.",
    "The dog ran around the park for nearly an hour before stopping.",
    "The rain fell continuously for two days without any breaks.",
    "The clock has hung on the wall since the house was built.",
    "The flowers bloomed in the garden for the entire summer.",
    "The construction of the new building took two years to complete.",
    "The river has flowed through the valley for centuries.",
    "The party lasted until midnight, with music and dancing.",
    "The road remained closed for three days due to heavy snowfall.",
    "The plane flew across the Atlantic in eight hours.",
    "The fire burned in the fireplace for several hours before dying out.",
    "The store remains open 24/7 throughout the entire year.",
    "The professor lectured for 90 minutes without taking a break.",
    "The stars have shined in the night sky for as long as anyone can remember.",
    "The bus ride took an hour to reach its destination.",
    "The boat drifted along the river for days without any direction.",
    "The seminar continued for three hours, followed by a Q&A session.",
    "The statue has stood in the plaza for over a hundred years.",
    "The engine ran continuously for five hours before shutting down.",
    "The ice cream melted within minutes after being left in the sun.",
    "The tree grew slowly over the course of many decades.",
    "The soccer match lasted 90 minutes, with extra time added.",
    "The phone remained on the desk for several days without being touched."
]

# Function to predict the stationarity of new sentences
def predict_stationarity(model, new_sentences, threshold=0.5):
    """Classify sentences as Stationarity (1.0) or Event Duration (0.0).

    Args:
        model: Trained classifier returning one probability per input row.
        new_sentences: A single sentence string or an iterable of sentences.
        threshold: Probabilities strictly above this value are labeled 1.0.

    Returns:
        A float32 NumPy array of 0.0/1.0 labels shaped like the model output,
        or an empty ``(0, 1)`` array when no sentences are given.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(new_sentences, str):
        new_sentences = [new_sentences]
    else:
        new_sentences = list(new_sentences)

    # Nothing to embed: return early rather than feed the model an empty tensor
    if not new_sentences:
        return np.empty((0, 1), dtype=np.float32)

    # Convert new sentences to DistilBERT embeddings
    new_X = np.array([get_sentence_vector(sentence) for sentence in new_sentences])
    new_X_tensor = torch.tensor(new_X, dtype=torch.float32)

    # Make predictions
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        predictions = model(new_X_tensor)

    # Apply threshold to get binary predictions
    predicted_labels = (predictions > threshold).float().numpy()

    return predicted_labels

# Classify the example sentences with the trained model
predictions = predict_stationarity(lstm_model, example_sentences)

# Print each sentence next to its predicted class
for sentence, prediction in zip(example_sentences, predictions):
    if prediction == 1:
        label = 'Stationarity (1)'
    else:
        label = 'Event Duration (0)'
    print(f"Sentence: \"{sentence}\" -> Prediction: {label}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['stationarity'] = filtered_data['stationarity'].map({


Class distribution before oversampling:
stationarity
0.0    1112
1.0     272
Name: count, dtype: int64
Epoch [1/10], Loss: 0.6943
Epoch [2/10], Loss: 0.6917
Epoch [3/10], Loss: 0.6898
Epoch [4/10], Loss: 0.6875
Epoch [5/10], Loss: 0.6849
Epoch [6/10], Loss: 0.6817
Epoch [7/10], Loss: 0.6765
Epoch [8/10], Loss: 0.6706
Epoch [9/10], Loss: 0.6638
Epoch [10/10], Loss: 0.6559
Test Accuracy: 0.6101

Classification Report:
                    precision    recall  f1-score   support

Event Duration (0)       0.86      0.61      0.72       223
  Stationarity (1)       0.27      0.59      0.37        54

          accuracy                           0.61       277
         macro avg       0.57      0.60      0.54       277
      weighted avg       0.75      0.61      0.65       277

Sentence: "The company held its annual meeting in the same venue as last year." -> Prediction: Event Duration (0)
Sentence: "The train arrived at the station at 6 PM and remained there until the next morning." -> Pred