In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np
import random
import requests as rq
import sys
import io
from bs4 import BeautifulSoup
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import RMSprop
from collections import Counter
import keras
from keras.layers import Embedding
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline




# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/SupremeCourtNew'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

: 

: 

In [None]:
data = '/kaggle/input/supremecourtnew/SupremeCourtTranscriptNew.csv'
df = pd.read_csv(data, header=None,encoding='latin-1')
col_names = ['total_order', 'order', 'year', 'nominee', 'speaker_title', 'speaker_party', 'speaker_name','speaker_statement']
df.columns = col_names
col_names
df.head()

Unnamed: 0,total_order,order,year,nominee,speaker_title,speaker_party,speaker_name,speaker_statement
0,Total Order,Order,Year,Hearing,Title,Speaker (Party)(or nominated by),Speaker and title,Statements
1,1,1,2018,Brett M. Kavanaugh,Chairman,R,Senator Chuck Grassley (IA),Chairman GRASSLEY. I welcome everyone to this ...
2,2,2,2018,Brett M. Kavanaugh,Senator,D,Senator Kamala Harris (CA),Senator HARRIS. Mr. Chairman?
3,3,3,2018,Brett M. Kavanaugh,Chairman,R,Senator Chuck Grassley (IA),Chairman GRASSLEY [continuing]. Brett Kavanaugh
4,4,4,2018,Brett M. Kavanaugh,Senator,D,Senator Kamala Harris (CA),Senator HARRIS. Mr. Chairman?


In [3]:
df['nominee'].unique()
df['speaker_statement'] = df['speaker_statement'].apply(lambda x: x.split('.', 1)[-1].strip())
df['speaker_statement'] = df['speaker_statement'].str.replace(r'\[[^\]]*]\s*([^\.]*\.)?', '', regex=True)
df.loc[df['nominee'] =='Brett M. Kavanaugh II', 'nominee'] = 'Brett M. Kavanaugh'
df.loc[df['nominee'] =='Clarence Thomas II', 'nominee'] = 'Clarence Thomas'
df.loc[df['nominee'] =='Judge Robert Bork', 'nominee'] = 'Robert Bork'
df['nominee'].unique()
df['speaker_party'].unique()
df['speaker_party'].value_counts()
unaffiliated = df['speaker_party'].isnull()
df.loc[unaffiliated, 'speaker_party'] = 'Unaffiliated'

In [None]:
#This supplemental model was made in help with the resources from Hugging Face (The provider and documentation for the BERT Model), specifically:
#BERT Documentation: https://huggingface.co/docs/transformers/en/model_doc/bert
#https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb

from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

speaker_statements = df['speaker_statement'].values
speaker_parties = df['speaker_party'].values
label_encoder = LabelEncoder()
speaker_parties_encoded = label_encoder.fit_transform(speaker_parties)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(speaker_parties)))

train_texts, val_texts, train_labels, val_labels = train_test_split(speaker_statements, speaker_parties_encoded, test_size=0.2, random_state=42)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",  #padding
            truncation=True,
            max_length=self.max_length
        )
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return inputs, label


train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()
for epoch in range(num_epochs):
    train_losses = []
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        inputs, labels = batch
        inputs = {key: value.squeeze(1).to(device) for key, value in inputs.items()}  # Squeeze the extra dimension
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {sum(train_losses)/len(train_losses):.4f}')

model.eval()
val_losses = []
val_preds = []
val_true = []
for batch in val_loader:
    inputs, labels = batch
    inputs = {key: value.squeeze(1).to(device) for key, value in inputs.items()}
    labels = labels.to(device)
    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
    val_losses.append(loss.item())
    val_preds.extend(logits.argmax(dim=1).cpu().numpy())
    val_true.extend(labels.cpu().numpy())

val_accuracy = accuracy_score(val_true, val_preds)
print(f"Validation Loss: {sum(val_losses)/len(val_losses):.4f}, Validation Accuracy: {val_accuracy:.4f}")

In [None]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    
    predicted_label_idx = logits.argmax(dim=1).item()
    predicted_label = label_encoder.classes_[predicted_label_idx]
    
    return predicted_label

input_text = "What is being done here is unprecedented, and I keep coming back to the same question I asked. What are we trying to hide? What are we hiding? What is being hidden?"
predicted_sentiment = predict_sentiment(input_text)
print("Predicted sentiment:", predicted_sentiment)