# Creating models

# Lexicon models

## Creating a pattern.nl lexicon model

In [726]:
import pandas as pd
from pattern.nl import sentiment
from sklearn.metrics import classification_report
from sklearn.utils import resample

# Function to calculate sentiment score for each text
def calculate_sentiment(text):
    sentiment_tuple = sentiment(text)
    polarity_score = sentiment_tuple[0]
    if polarity_score < 0:
        return 0  # Negative sentiment
    elif polarity_score > 0:
        return 2  # Positive sentiment
    else:
        return 1  # Neutral sentiment

# Apply the sentiment calculation function to each row in the DataFrame
df_1960s['predicted_sentiment'] = df_1960s['text'].apply(calculate_sentiment)
df_1970s['predicted_sentiment'] = df_1970s['text'].apply(calculate_sentiment)
df_1980s['predicted_sentiment'] = df_1980s['text'].apply(calculate_sentiment)
df_1990s['predicted_sentiment'] = df_1990s['text'].apply(calculate_sentiment)

print(df_1960s[['text', 'labels', 'predicted_sentiment']])

# Function to balance classes by resampling
def balance_classes(df):
    # Separate majority and minority classes
    df_negative = df[df['predicted_sentiment'] == 0]
    df_neutral = df[df['predicted_sentiment'] == 1]
    df_positive = df[df['predicted_sentiment'] == 2]

    # Upsample minority class (neutral) to match majority class (negative)
    df_neutral_upsampled = resample(df_neutral, replace=True, n_samples=len(df_negative), random_state=42)

    # Combine majority class with upsampled minority class and majority class (positive)
    df_balanced = pd.concat([df_negative, df_neutral_upsampled, df_positive])

    return df_balanced

# Balance classes for each decade
df_1960s_balanced = balance_classes(df_1960s)
df_1970s_balanced = balance_classes(df_1970s)
df_1980s_balanced = balance_classes(df_1980s)
df_1990s_balanced = balance_classes(df_1990s)

# Generate classification reports for balanced datasets
generate_classification_report(df_1960s_balanced, "1960s (Balanced)")
generate_classification_report(df_1970s_balanced, "1970s (Balanced)")
generate_classification_report(df_1980s_balanced, "1980s (Balanced)")
generate_classification_report(df_1990s_balanced, "1990s (Balanced)")

                                                   text  labels  \
0     DEN HAAG , 15 okt . Minister Andriessen ( Econ...       2   
1     Om de kolen uit de mijnen bij de verbruiker te...       2   
2     kolen willen de centrales niet. Olie is ten sl...       2   
3     ( Van onze Haagse redactie ) i DEN HAAG , 15 n...       2   
4     Dit en andere argumenten overwegend komt de st...       2   
...                                                 ...     ...   
1083  Aandeel olie en aardgas zal stijgen tot 33 en ...       1   
1084  op Curacao zal nu een groter kwantum moeten ra...       0   
1085  ESSEN , 21 okt . De Westduitse steenkolenmijne...       1   
1086  ROTTERDAM , dinsdag De onderhandelingen die re...       2   
1087  verwacht , dat de vraag naar olie verder- , za...       1   

      predicted_sentiment  
0                       0  
1                       2  
2                       2  
3                       0  
4                       0  
...                   ...  

# Machine learning models

## Creating logistic regression model

In [678]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Define a function to train and evaluate the logistic regression model
def train_and_evaluate_model(X_train, y_train, X_test, y_test):
    # Vectorize the text data
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Encode the labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_vec.toarray(), dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_vec.toarray(), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

    # Define the logistic regression model
    class LogisticRegression(nn.Module):
        def __init__(self, input_size, output_size):
            super(LogisticRegression, self).__init__()
            self.linear = nn.Linear(input_size, output_size)
        
        def forward(self, x):
            out = self.linear(x)
            return out

    # Set hyperparameters
    input_size = X_train_tensor.shape[1]
    output_size = len(label_encoder.classes_)
    learning_rate = 0.001
    num_epochs = 10
    batch_size = 64

    # Initialize the model, criterion, and optimizer
    model = LogisticRegression(input_size, output_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss}")

    # Evaluation
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        target_names = [str(label) for label in label_encoder.classes_]
        print(classification_report(y_test_tensor, predicted, target_names=target_names))
        
# Train and evaluate the model for df_1960s
X_train_1960s, X_test_1960s, y_train_1960s, y_test_1960s = train_test_split(df_1960s['text'], df_1960s['labels'], test_size=0.2, random_state=42)
print("Classification Report for df_1960s:")
train_and_evaluate_model(X_train_1960s, y_train_1960s, X_test_1960s, y_test_1960s)

# Train and evaluate the model for df_1970s
X_train_1970s, X_test_1970s, y_train_1970s, y_test_1970s = train_test_split(df_1970s['text'], df_1970s['labels'], test_size=0.2, random_state=42)
print("Classification Report for df_1970s:")
train_and_evaluate_model(X_train_1970s, y_train_1970s, X_test_1970s, y_test_1970s)

# Train and evaluate the model for df_1980s
X_train_1980s, X_test_1980s, y_train_1980s, y_test_1980s = train_test_split(df_1980s['text'], df_1980s['labels'], test_size=0.2, random_state=42)
print("\nClassification Report for df_1980s:")
train_and_evaluate_model(X_train_1980s, y_train_1980s, X_test_1980s, y_test_1980s)

# Train and evaluate the model for df_1990s
X_train_1990s, X_test_1990s, y_train_1990s, y_test_1990s = train_test_split(df_1990s['text'], df_1990s['labels'], test_size=0.2, random_state=42)
print("\nClassification Report for df_1990s:")
train_and_evaluate_model(X_train_1990s, y_train_1990s, X_test_1990s, y_test_1990s)

Classification Report for df_1960s:
Epoch 1, Loss: 14.412978708744049
Epoch 2, Loss: 10.815499722957611
Epoch 3, Loss: 8.943640410900116
Epoch 4, Loss: 7.6798500418663025
Epoch 5, Loss: 6.74244287610054
Epoch 6, Loss: 6.0745609402656555
Epoch 7, Loss: 5.565873354673386
Epoch 8, Loss: 5.1498523354530334
Epoch 9, Loss: 4.80529323220253
Epoch 10, Loss: 4.440998762845993
              precision    recall  f1-score   support

           0       0.54      0.35      0.42        43
           1       0.57      0.53      0.55        58
           2       0.67      0.78      0.72       117

    accuracy                           0.63       218
   macro avg       0.59      0.55      0.57       218
weighted avg       0.62      0.63      0.62       218

Classification Report for df_1970s:
Epoch 1, Loss: 12.724644601345062
Epoch 2, Loss: 9.331070601940155
Epoch 3, Loss: 8.013776659965515
Epoch 4, Loss: 7.003912419080734
Epoch 5, Loss: 6.363601893186569
Epoch 6, Loss: 5.8573519587516785
Epoch 7, Loss

## Creating random forest model

In [685]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

df_1960s = pd.read_csv('1960s_merged.csv')
df_1970s = pd.read_csv('1970s_merged.csv')
df_1980s = pd.read_csv('1980s_merged.csv')
df_1990s = pd.read_csv('1990s_merged.csv')

# Define a function to train and evaluate the RandomForestClassifier
def train_and_evaluate_rf_model(X_train, y_train, X_test, y_test):
    # Vectorize the text data
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Initialize the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    rf_model.fit(X_train_vec, y_train)

    # Evaluate the model
    y_pred = rf_model.predict(X_test_vec)

    # Print classification report
    print(classification_report(y_test, y_pred))

# Define a function to train and evaluate the RandomForestClassifier for any dataset
def train_and_evaluate_rf_model_generic(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Call the previous function to train and evaluate the model
    train_and_evaluate_rf_model(X_train, y_train, X_test, y_test)

# Train and evaluate the model for df_1960s
print("Classification Report for df_1960s:")
train_and_evaluate_rf_model_generic(df_1960s['text'], df_1960s['labels'])

# Train and evaluate the model for df_1970s
print("\nClassification Report for df_1970s:")
train_and_evaluate_rf_model_generic(df_1970s['text'], df_1970s['labels'])

# Train and evaluate the model for df_1980s
print("\nClassification Report for df_1980s:")
train_and_evaluate_rf_model_generic(df_1980s['text'], df_1980s['labels'])

# Train and evaluate the model for df_1990s
print("\nClassification Report for df_1990s:")
train_and_evaluate_rf_model_generic(df_1990s['text'], df_1990s['labels'])

Classification Report for df_1960s:
              precision    recall  f1-score   support

           0       0.50      0.21      0.30        43
           1       0.64      0.40      0.49        58
           2       0.60      0.85      0.70       117

    accuracy                           0.60       218
   macro avg       0.58      0.48      0.50       218
weighted avg       0.59      0.60      0.57       218


Classification Report for df_1970s:
              precision    recall  f1-score   support

           0       0.51      0.45      0.48        69
           1       0.36      0.15      0.21        34
           2       0.59      0.75      0.66       102

    accuracy                           0.55       205
   macro avg       0.49      0.45      0.45       205
weighted avg       0.52      0.55      0.53       205


Classification Report for df_1980s:
              precision    recall  f1-score   support

           0       0.59      0.69      0.64       102
           1       

## Creating a Bertje model

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report

df_1960s = pd.read_csv('1960s_merged.csv')
df_1970s = pd.read_csv('1970s_merged.csv')
df_1980s = pd.read_csv('1980s_merged.csv')
df_1990s = pd.read_csv('1990s_merged.csv')

# Function to train and evaluate the BERTje-based classification model
def train_and_evaluate_bertje_model(train_texts, train_labels, val_texts, val_labels, num_labels):
    tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

# Tokenize the texts
    train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
    val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)

# Create PyTorch datasets
    train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                                  torch.tensor(train_encodings['attention_mask']),
                                  torch.tensor(train_labels.tolist()))
    val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                                torch.tensor(val_encodings['attention_mask']),
                                torch.tensor(val_labels.tolist()))

# Load the pre-trained BERTje model
    model = BertForSequenceClassification.from_pretrained("GroNLP/bert-base-dutch-cased", num_labels=num_labels)

# Define training parameters with low computing cost
    batch_size = 16
    epochs = 2
    learning_rate = 2e-5

# Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
    val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))

# Define optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

# Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}', unit='batch'):
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[2]}
            model.zero_grad()
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss}")

# Evaluation
    model.eval()
    val_loss = 0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[2]}
            outputs = model(**inputs)
            loss = outputs.loss
            val_loss += loss.item()
            logits = outputs.logits
            val_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            val_labels.extend(inputs['labels'].cpu().tolist())
    avg_val_loss = val_loss / len(val_loader)
    print(f"Average validation loss: {avg_val_loss}")

# Classification report
    report = classification_report(val_labels, val_preds, output_dict=True)
    return report

# Split the datasets and train/evaluate the model for each decade
reports = {}
for decade, df in zip(["1960s", "1970s", "1980s", "1990s"], [df_1960s, df_1970s, df_1980s, df_1990s]):
    print(f"Training and evaluating model for {decade}:")
    train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['labels'], test_size=0.2, random_state=42)
    num_labels = len(df['labels'].unique())
    report = train_and_evaluate_bertje_model(train_texts, train_labels, val_texts, val_labels, num_labels)
    reports[decade] = report

# Print the classification reports
for decade, report in reports.items():
    print(f"\nClassification Report for {decade}:")
    print(pd.DataFrame(report).transpose())

Training and evaluating model for 1960s:


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/55 [00:00<?, ?batch/s]

Average training loss: 1.0353288813070818


Epoch 2:   0%|          | 0/55 [00:00<?, ?batch/s]

Average training loss: 0.9263863130049272
Average validation loss: 0.9673236097608294
Training and evaluating model for 1970s:


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/52 [00:00<?, ?batch/s]

Average training loss: 0.9514348048430222


Epoch 2:   0%|          | 0/52 [00:00<?, ?batch/s]

Average training loss: 0.8171293420287279
Average validation loss: 0.8979719831393316
Training and evaluating model for 1980s:


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/66 [00:00<?, ?batch/s]

Average training loss: 1.0038481562426596


Epoch 2:   0%|          | 0/66 [00:00<?, ?batch/s]

Average training loss: 0.8331479601787798
Average validation loss: 0.8821979831246769
Training and evaluating model for 1990s:


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/38 [00:00<?, ?batch/s]

Average training loss: 0.9090703270937267


Epoch 2:   0%|          | 0/38 [00:00<?, ?batch/s]

Average training loss: 0.773032585256978
Average validation loss: 0.7887689232826233

Classification Report for 1960s:
              precision    recall  f1-score     support
0              0.500000  0.116279  0.188679   43.000000
1              0.421053  0.137931  0.207792   58.000000
2              0.560847  0.905983  0.692810  117.000000
accuracy       0.545872  0.545872  0.545872    0.545872
macro avg      0.493966  0.386731  0.363094  218.000000
weighted avg   0.511652  0.545872  0.464330  218.000000

Classification Report for 1970s:
              precision    recall  f1-score     support
0              0.553846  0.521739  0.537313   69.000000
1              0.000000  0.000000  0.000000   34.000000
2              0.597122  0.813725  0.688797  102.000000
accuracy       0.580488  0.580488  0.580488    0.580488
macro avg      0.383656  0.445155  0.408703  205.000000
weighted avg   0.483521  0.580488  0.523570  205.000000

Classification Report for 1980s:
              precision    re

## Creating support vector machine (SVM) model

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Define a function to train and evaluate the SVM classifier
def train_and_evaluate_svm_model(X_train, y_train, X_test, y_test):
    # Initialize the TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)

    # Transform the text data
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Initialize SVM classifier
    svm_model = SVC(kernel='linear')

    # Train the SVM model
    svm_model.fit(X_train_tfidf, y_train)

    # Predictions on the test set
    y_pred = svm_model.predict(X_test_tfidf)

    # Generate classification report
    print(classification_report(y_test, y_pred))

# Define a function to train and evaluate the SVM classifier for any dataset
def train_and_evaluate_svm_model_generic(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Call the previous function to train and evaluate the model
    train_and_evaluate_svm_model(X_train, y_train, X_test, y_test)

# Train and evaluate the model for df_1960s
print("Classification Report for df_1960s:")
train_and_evaluate_svm_model_generic(df_1960s['text'], df_1960s['labels'])

# Train and evaluate the model for df_1970s
print("\nClassification Report for df_1970s:")
train_and_evaluate_svm_model_generic(df_1970s['text'], df_1970s['labels'])

# Train and evaluate the model for df_1980s
print("\nClassification Report for df_1980s:")
train_and_evaluate_svm_model_generic(df_1980s['text'], df_1980s['labels'])

# Train and evaluate the model for df_1990s
print("\nClassification Report for df_1990s:")
train_and_evaluate_svm_model_generic(df_1990s['text'], df_1990s['labels'])

Classification Report for df_1960s:
              precision    recall  f1-score   support

           0       0.58      0.33      0.42        43
           1       0.66      0.43      0.52        58
           2       0.67      0.89      0.76       117

    accuracy                           0.66       218
   macro avg       0.64      0.55      0.57       218
weighted avg       0.65      0.66      0.63       218


Classification Report for df_1970s:
              precision    recall  f1-score   support

           0       0.64      0.59      0.62        69
           1       0.33      0.03      0.05        34
           2       0.64      0.87      0.74       102

    accuracy                           0.64       205
   macro avg       0.54      0.50      0.47       205
weighted avg       0.59      0.64      0.59       205


Classification Report for df_1980s:
              precision    recall  f1-score   support

           0       0.56      0.75      0.64       102
           1       

## Creating Naive Bayes model

In [725]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Define a function to train and evaluate the Multinomial Naive Bayes classifier
def train_and_evaluate_nb_model(X_train, y_train, X_test, y_test):
    # Initialize the TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=10000)

    # Vectorize the text data
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    
    # Apply SMOTE to handle class imbalance
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vec, y_train)

    # Train the Naive Bayes model
    naive_bayes_model = MultinomialNB()
    naive_bayes_model.fit(X_train_resampled, y_train_resampled)

    # Predictions on the test set
    y_pred = naive_bayes_model.predict(X_test_vec)

    # Generate the classification report
    report = classification_report(y_test, y_pred)

    # Print the classification report
    print("Classification Report:\n", report)

# Define a function to train and evaluate the Multinomial Naive Bayes classifier for the dataset
def train_and_evaluate_nb_model_generic(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Call the previous function to train and evaluate the model
    train_and_evaluate_nb_model(X_train, y_train, X_test, y_test)
    
# Train and evaluate the model for df_1960s
print("Classification Report for df_1960s:")
train_and_evaluate_nb_model_generic(df_1960s['text'], df_1960s['labels'])

# Train and evaluate the model for df_1970s
print("Classification Report for df_1970s:")
train_and_evaluate_nb_model_generic(df_1970s['text'], df_1970s['labels'])

# Train and evaluate the model for df_1980s
print("\nClassification Report for df_1980s:")
train_and_evaluate_nb_model_generic(df_1980s['text'], df_1980s['labels'])

# Train and evaluate the model for df_1990s
print("\nClassification Report for df_1990s:")
train_and_evaluate_nb_model_generic(df_1990s['text'], df_1990s['labels'])

Classification Report for df_1960s:
Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.72      0.53        43
           1       0.53      0.57      0.55        58
           2       0.80      0.56      0.66       117

    accuracy                           0.60       218
   macro avg       0.59      0.62      0.58       218
weighted avg       0.66      0.60      0.61       218

Classification Report for df_1970s:
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.59      0.57        69
           1       0.21      0.26      0.24        34
           2       0.68      0.58      0.62       102

    accuracy                           0.53       205
   macro avg       0.48      0.48      0.48       205
weighted avg       0.55      0.53      0.54       205


Classification Report for df_1980s:
Classification Report:
               precision    recall  f1-score   support

  

# Deep learning models

## Creating Convolutional Neural Network

In [720]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer

df_1960s = pd.read_csv('1960s_merged.csv')
df_1970s = pd.read_csv('1970s_merged.csv')
df_1980s = pd.read_csv('1980s_merged.csv')
df_1990s = pd.read_csv('1990s_merged.csv')

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_seq_length = 512

# Function to tokenize text
def tokenize_text(text):
    tokens = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=max_seq_length)
    return tokens[:max_seq_length] + [0] * (max_seq_length - len(tokens))

# Define the CNN model
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)  # [batch_size, seq_len, emb_dim]
        x = x.unsqueeze(1)  # [batch_size, 1, seq_len, emb_dim]
        conved = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

# Hyperparameters
vocab_size = tokenizer.vocab_size
embedding_dim = 128
num_filters = 100
filter_sizes = [3, 4, 5]
output_dim = 3
dropout = 0.5
num_epochs = 5
batch_size = 16

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to preprocess data and train the model
def train_and_evaluate_cnn_model(df):
    # Split the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['labels'], test_size=0.2, random_state=42)

    # Tokenize the text data
    X_train_tokens = [tokenize_text(text) for text in X_train]
    X_test_tokens = [tokenize_text(text) for text in X_test]

    # Convert token lists to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_tokens, dtype=torch.long).to(device)
    X_test_tensor = torch.tensor(X_test_tokens, dtype=torch.long).to(device)

    # Convert y_train and y_test to tensors
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long).to(device)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.long).to(device)

    # Initialize the model
    model = CNN(vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Train the model
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        classification_rep = classification_report(y_test_tensor.cpu().numpy(), predicted.cpu().numpy(), target_names=['Negative', 'Neutral', 'Positive'])
        print("Classification Report:\n", classification_rep)

# Train and evaluate the model for each dataset
print("Classification Report for df_1960s:")
train_and_evaluate_cnn_model(df_1960s)

print("\nClassification Report for df_1970s:")
train_and_evaluate_cnn_model(df_1970s)

print("\nClassification Report for df_1980s:")
train_and_evaluate_cnn_model(df_1980s)

print("\nClassification Report for df_1990s:")
train_and_evaluate_cnn_model(df_1990s)

Classification Report for df_1960s:
Epoch 1/5, Loss: 1.7358075380325317
Epoch 2/5, Loss: 1.291062593460083
Epoch 3/5, Loss: 1.312917709350586
Epoch 4/5, Loss: 1.3171759843826294
Epoch 5/5, Loss: 1.2926428318023682


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        43
     Neutral       0.75      0.10      0.18        58
    Positive       0.55      0.98      0.70       117

    accuracy                           0.56       218
   macro avg       0.43      0.36      0.30       218
weighted avg       0.49      0.56      0.43       218


Classification Report for df_1970s:
Epoch 1/5, Loss: 1.662696123123169
Epoch 2/5, Loss: 1.3022541999816895
Epoch 3/5, Loss: 1.2186795473098755
Epoch 4/5, Loss: 1.2529387474060059
Epoch 5/5, Loss: 1.202208399772644


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

    Negative       0.61      0.29      0.39        69
     Neutral       0.00      0.00      0.00        34
    Positive       0.55      0.93      0.69       102

    accuracy                           0.56       205
   macro avg       0.39      0.41      0.36       205
weighted avg       0.48      0.56      0.48       205


Classification Report for df_1980s:
Epoch 1/5, Loss: 1.6154649257659912
Epoch 2/5, Loss: 1.249168872833252
Epoch 3/5, Loss: 1.3288295269012451
Epoch 4/5, Loss: 1.2654683589935303
Epoch 5/5, Loss: 1.153156042098999
Classification Report:
               precision    recall  f1-score   support

    Negative       0.55      0.58      0.56       102
     Neutral       1.00      0.02      0.04        44
    Positive       0.56      0.74      0.64       118

    accuracy                           0.56       264
   macro avg       0.70      0.45      0.41       264
weighted avg       0.63      0