<a href="https://colab.research.google.com/github/saruman18/GermaParlTEI/blob/main/Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# on machines (or Colab runtimes) without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# import libraries
import pandas as pd
from collections import Counter
import numpy as np
from tqdm import tqdm
import pickle
import torch as torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import AutoModel, AutoTokenizer

ModuleNotFoundError: ignored

In [None]:
# Clear PyTorch's CUDA memory cache before any model is loaded
# (presumably to free memory left over from earlier runs in the same kernel).
torch.cuda.empty_cache()

In [None]:
# Load the speech data frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load "BT16.pkl" from a trusted source.
df = pd.read_pickle("BT16.pkl")

In [None]:
# Treat CDU and CSU as one party: both labels are rewritten to 'CDU/CSU'.
df['Party'] = df['Party'].replace(to_replace=['CDU', 'CSU'], value='CDU/CSU')

In [None]:
# Keep only the rows whose 'year' is 2006.
df = df.loc[df['year'].eq(2006)]

In [None]:
# Draw a stratified random sample: (up to) the same number of rows per party.

strata = df['Party'].unique()
num_strata = len(strata)
if num_strata == 0:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError("No rows left to sample from: the filtered frame has no parties.")

total_sample = 10000
per_stratum = total_sample // num_strata

sampled_data = []

for stratum in strata:
    stratum_data = df[df['Party'] == stratum]
    # A party with fewer rows than the quota contributes all of its rows;
    # without this cap, .sample() raises ValueError for that party.
    sample_size = min(per_stratum, len(stratum_data))
    # Fixed random_state keeps the sample reproducible across runs.
    sampled_stratum = stratum_data.sample(sample_size, random_state=32)
    sampled_data.append(sampled_stratum)

df = pd.concat(sampled_data)
df = df.reset_index(drop=True)

# Show the resulting number of rows per party.
df.Party.value_counts()

Party
SPD          2000
FDP          2000
CDU/CSU      2000
DIE LINKE    2000
GRUENE       2000
Name: count, dtype: int64

In [None]:
#df= df.sample(n=10000, random_state=32)

In [None]:
# List the columns available in the sampled frame.
print(df.columns)

Index(['Number', 'What', 'Description', 'Speaker', 'SpeakerNumber',
       'SpeechText', 'Party', 'Role', 'ParliamentaryGroup', 'SessionNo',
       'LegislativePeriod', 'Interjections', 'Date', 'year', 'month',
       'tokens'],
      dtype='object')


In [None]:
# Preview the 'tokens' column, which is the text later passed to the tokenizer.
df['tokens']

0            Das anhand erreichten 0,35 Prozent belegen .
1       Als vorhin Stärkung Landwirtschaft geredet , m...
2       Auf Seite Darstellungen scheinen immer mehr he...
3       Wir , , , Grünen entnehme jedenfalls wortgleic...
4       Sie Gelegenheit , Erfahrungen profitieren , vi...
                              ...                        
9995    Ich inhaltliche Anmerkung Sache , Kollege Aman...
9996    Es offenkundig : Wir brauchen mehr qualitativ ...
9997    Ich danke natürlich Kolleginnen Kollegen , daf...
9998    Aber bleibt wahr : Jede staatliche Regelung Sc...
9999    Wir daher Individualbesteuerung , Frauen Gehal...
Name: tokens, Length: 10000, dtype: object

# Dataset

In [None]:
class PoliticalPartyDataset(Dataset):
    """Torch dataset that yields tokenized texts with integer party labels.

    Args:
        dataframe: Frame with a 'tokens' column (text) and a 'Party' column.
        tokenizer: Callable used to tokenize one text. It is invoked with
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'`` and must return a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Map party names to integer labels in order of first appearance.
        # NOTE(review): the ids therefore depend on the row order of `dataframe`.
        self.party_to_label = {party: label for label, party in enumerate(dataframe['Party'].unique())}

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the tokenized text and label for the row at position ``index``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer output
            without its leading batch dimension) and 'label' (a 0-d
            ``torch.long`` tensor).

        Raises:
            IndexError: If ``index`` is out of range for the frame.
        """
        row = self.dataframe.iloc[index]

        # str() guards against non-string cells in the 'tokens' column.
        text = str(row['tokens'])
        label = self.party_to_label[row['Party']]

        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Remove only the leading batch dimension (assumes the tokenizer returns
        # (1, max_length) tensors for a single text). A bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

NameError: ignored

In [None]:
# Load the German Bert tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

# The sampled frame `df` is the data source for the dataset.
dataframe = df

# Every text is padded or truncated to this many tokens by the dataset.
max_length = 512

# Instantiate the dataset
dataset = PoliticalPartyDataset(dataframe, tokenizer, max_length)
# Sanity check: the dataset length is the number of rows in the frame.
print(len(dataset))

10000


# Dataloader

In [None]:
# Create the data loader.
# shuffle=False: this loader is only used to extract embeddings (no training),
# so keeping the dataset order makes the output reproducible and lets row i of
# the saved embeddings be matched with row i of `df`.
data_loader = DataLoader(dataset, batch_size=32, shuffle=False)

# Get the embeddings

In [None]:
# Load the pretrained model (same checkpoint name as the tokenizer) and move it to `device`.
bert_model = AutoModel.from_pretrained("dbmdz/bert-base-german-cased").to(device)

In [None]:
# Show which device the model was moved to.
print(device)

cuda


In [None]:
def get_embeddings(bert_model, dataloader):
    """Run ``bert_model`` over ``dataloader`` and collect its hidden states.

    Args:
        bert_model: Model whose output exposes ``last_hidden_state``; it is
            switched to eval mode and called without gradient tracking.
        dataloader: Iterable of batches, each a mapping with 'input_ids',
            'attention_mask' and 'label' tensors.

    Returns:
        dict with
            'embeddings': CPU tensor holding the ``last_hidden_state`` of all
                batches concatenated along dim 0 (an empty tensor when the
                loader yields no batches), and
            'labels': flat list with the labels in the same order.
    """
    bert_model.eval()

    # Use the device the model actually lives on instead of a notebook-level
    # global; a model without parameters is treated as a CPU model.
    first_param = next(bert_model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings_list = []
    labels_list = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing batches", leave=False):
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)

            outputs = bert_model(input_ids, attention_mask=attention_mask)
            # Move every batch to the CPU right away so that accelerator memory
            # does not grow with the size of the dataset.
            embeddings_list.append(outputs.last_hidden_state.cpu())
            labels_list.extend(batch['label'].tolist())

    if embeddings_list:
        embeddings_concatenated = torch.cat(embeddings_list, dim=0)
    else:
        # torch.cat() raises on an empty list; return an empty tensor instead.
        embeddings_concatenated = torch.empty(0)

    return {'embeddings': embeddings_concatenated, 'labels': labels_list}



In [None]:
# Run the embedding extraction over the whole data loader.
result = get_embeddings(bert_model, dataloader=data_loader)

# Persist the returned dict ('embeddings' and 'labels') with torch.save.
file_path = "sample_2006_10000.pth"
torch.save(result, file_path)

                                                                     