In [None]:
import pandas as pd
from ast import literal_eval
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import nltk
import re
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rezar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rezar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rezar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Data

In [None]:
path = 'dataset/' #change to your data location

# Read the downloaded tables from `path`.
df_convos = pd.read_csv(path + '/conversations.csv')
df_speakers = pd.read_csv(path + '/speakers.csv')
df_utts = pd.read_csv(path + '/utterances.csv')
df_cases = pd.read_json(path_or_buf=path + '/cases.jsonl', lines=True)

# Keep only cases from 2011 through 2018 (inclusive) whose win_side is 0 or 1.
in_year_range = df_cases['year'].between(2011, 2018)
has_binary_outcome = df_cases['win_side'].isin([0, 1])
df_cases = df_cases[in_year_range & has_binary_outcome]

In [None]:
# Class balance of the filtered cases: number of rows per win_side value.
win_side_counts = df_cases['win_side'].value_counts()
win_side_counts

1.0    400
0.0    201
Name: win_side, dtype: int64

In [None]:
# Rebuild one transcript per conversation: join all of its utterance texts with spaces
# and count how many utterances the conversation has.
utt_text_by_conv = df_utts.groupby('conversation_id')['text']
utt_per_conv = utt_text_by_conv.apply(' '.join).reset_index()
# Both results come from the same grouping, so their rows are in the same order.
utt_per_conv['num_utterances'] = utt_text_by_conv.count().to_numpy()

# Attach the combined text to the conversations dataframe
# (conversation_id in utt_per_conv matches id in df_convos).
df_convos_utt = df_convos.merge(utt_per_conv, how='left', left_on='id', right_on='conversation_id')

In [None]:
# Rebuild one transcript per case: join the text of all its conversations with spaces,
# and record how many conversations and utterances each case has.
convos_by_case = df_convos_utt.groupby('meta.case_id')
conv_per_case = convos_by_case['text'].apply(' '.join).reset_index()
# All three aggregates share the same grouping, so their rows are in the same order.
conv_per_case['num_conversations'] = convos_by_case['text'].count().to_numpy()
conv_per_case['num_utterances'] = convos_by_case['num_utterances'].sum().to_numpy()

# Attach the combined text to the cases dataframe (meta.case_id matches id in df_cases).
df_cases_convo = df_cases.merge(conv_per_case, how='left', left_on='id', right_on='meta.case_id')
df_cases_convo.head(3)

Unnamed: 0,id,year,citation,title,petitioner,respondent,docket_no,court,decided_date,url,...,win_side_detail,scdb_docket_id,votes,votes_detail,is_eq_divided,votes_side,meta.case_id,text,num_conversations,num_utterances
0,2011_11-1179,2011,567 US _,"American Tradition Partnership, Inc. v. Bullock","American Tradition Partnership, Inc.","Steve Bullock, Attorney General of Montana, et...",11-1179,Roberts Court,"Jun 25, 2012",https://www.oyez.org/cases/2011/11-1179,...,3.0,2011-073-01,"{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",,,,
1,2011_11-182,2011,567 US _,Arizona v. United States,Arizona et al.,United States,11-182,Roberts Court,"Jun 25, 2012",https://www.oyez.org/cases/2011/11-182,...,7.0,2011-075-01,"{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 0.0, 'j__antonin_scal...",2011_11-182,We'll hear argument this morning in Case 11-18...,1.0,295.0
2,2011_11-161,2011,566 US _,Armour v. City of Indianapolis,Christine Armour,City of Indianapolis,11-161,Roberts Court,"Jun 4, 2012",https://www.oyez.org/cases/2011/11-161,...,2.0,2011-062-01,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",2011_11-161,We will hear argument this morning in case 11-...,1.0,239.0


In [None]:
# Discard cases that have no transcript text, then show how many cases remain.
df_cases_convo = df_cases_convo.dropna(subset=['text'])
len(df_cases_convo)

521

In [None]:
# Preview the first five remaining cases.
df_cases_convo.head(n=5)

Unnamed: 0,id,year,citation,title,petitioner,respondent,docket_no,court,decided_date,url,...,win_side_detail,scdb_docket_id,votes,votes_detail,is_eq_divided,votes_side,meta.case_id,text,num_conversations,num_utterances
1,2011_11-182,2011,567 US _,Arizona v. United States,Arizona et al.,United States,11-182,Roberts Court,"Jun 25, 2012",https://www.oyez.org/cases/2011/11-182,...,7.0,2011-075-01,"{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 0.0, 'j__antonin_scal...",2011_11-182,We'll hear argument this morning in Case 11-18...,1.0,295.0
2,2011_11-161,2011,566 US _,Armour v. City of Indianapolis,Christine Armour,City of Indianapolis,11-161,Roberts Court,"Jun 4, 2012",https://www.oyez.org/cases/2011/11-161,...,2.0,2011-062-01,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",2011_11-161,We will hear argument this morning in case 11-...,1.0,239.0
3,2011_11-159,2011,566 US _,Astrue v. Capato,"Michael J. Astrue, Commissioner of Social Secu...",Karen K. Capato,11-159,Roberts Court,"May 21, 2012",https://www.oyez.org/cases/2011/11-159,...,4.0,2011-054-01,"{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",2011_11-159,We will hear argument first this morning in Ca...,1.0,201.0
4,2011_10-1320,2011,566 US _,Blueford v. Arkansas,Alex Blueford,Arkansas,10-1320,Roberts Court,"May 24, 2012",https://www.oyez.org/cases/2011/10-1320,...,2.0,2011-057-01,"{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 0.0, 'j__antonin_scal...",2011_10-1320,"We'll hear argument next in Case 10-1320, Blue...",1.0,191.0
6,2011_10-844,2011,566 US _,"Caraco Pharmaceutical Laboratories, Ltd. v. No...","Caraco Pharmaceutical Laboratories, Ltd., et al.","Novo Nordisk A/S, et al.",10-844,Roberts Court,"Apr 17, 2012",https://www.oyez.org/cases/2011/10-844,...,4.0,2011-047-01,"{'j__john_g_roberts_jr': 2.0, 'j__antonin_scal...","{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",0.0,"{'j__john_g_roberts_jr': 1.0, 'j__antonin_scal...",2011_10-844,We'll hear argument first this morning in Case...,1.0,210.0


## Data Preprocessing

In [None]:
# Cleaning the text
# Built once instead of on every call: the stop-word set and the stemmer do not depend
# on the document being processed.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace every run of characters other than a-z with a single space
         (this removes digits and punctuation);
      3. delete words of one to three characters;
      4. tokenize with ``nltk.word_tokenize``;
      5. drop English stop words;
      6. stem each remaining token with the Porter stemmer.

    Stemming is used rather than WordNet lemmatisation because lemmatising
    was too slow.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub('[^a-z]+', ' ', text)
    text = re.sub(r'\b\w{1,3}\b', '', text)  # removes words of length 1-3
    tokens = nltk.word_tokenize(text)
    return ' '.join(_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS)

In [None]:
# Keep only the transcript and the label for modelling.
model_columns = ['text', 'win_side']
text = df_cases_convo.loc[:, model_columns]
text.head(n=3)

Unnamed: 0,text,win_side
1,We'll hear argument this morning in Case 11-18...,0.0
2,We will hear argument this morning in case 11-...,0.0
3,We will hear argument first this morning in Ca...,1.0


In [None]:
# Write the unprocessed text and labels to 'text.csv' (without the index column).
text.to_csv(path_or_buf='text.csv', index=False)

In [None]:
# Clean every transcript and persist the result. Later cells read 'text_clean.csv',
# so it has to be written here for a fresh Restart & Run All to succeed.
text['text'] = text['text'].apply(preprocess_text) #apply preprocess
text.to_csv('text_clean.csv', index=False)
text.head(1)

Unnamed: 0,text,win_side
1,hear argument morn case arizona unit state cle...,0.0


In [None]:
# Load the cleaned text and labels from 'text_clean.csv' and preview the first row.
text = pd.read_csv(filepath_or_buffer='text_clean.csv')
text.head(n=1)

Unnamed: 0,text,win_side
0,hear argument morn case arizona unit state cle...,0.0


## Baseline

In [None]:
# Calculate the baseline for Accuracy, Precision, Recall, F1.
# Baseline = always predict class 1 (the more frequent win_side in the counts above).
# It is computed on `text`, i.e. the cases that have a transcript and are actually fed
# to the models, not on `df_cases`, which still contains the cases dropped for having
# no text.
positive_share = (text['win_side'] == 1).mean()
accuracy = positive_share        # every class-1 case is predicted correctly
precision = positive_share       # all predictions are 1; this share of them is right
recall = 1.0                     # no class-1 case is ever missed
f1 = 2 * precision * recall / (precision + recall)
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('F1: ', f1)

Accuracy:  0.6655574043261231


## Model Selection and Vectorize

In [None]:
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score


def Classifier(X_train, X_test, y_train, y_test, random_state=0):
    """Fit a fixed set of scikit-learn classifiers and score each one on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Binary labels for the training and test splits.
    random_state : int or None, default 0
        Seed passed to every estimator that accepts one, so repeated runs on the
        same split give the same scores. The default of 0 matches Perceptron's own
        default seed. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns ``classifier``, ``accuracy``,
        ``f1``, ``precision`` and ``recall``.
    """
    # MultinomialNB has no random component, so it takes no seed.
    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=random_state),
        "Naive Bayes": MultinomialNB(),
        "Linear SVC": LinearSVC(random_state=random_state),
        "Random Forest": RandomForestClassifier(random_state=random_state),
        "Perceptron": Perceptron(random_state=random_state),
    }

    results = []
    for classifier_name, classifier in classifiers.items():
        # Train on the training split, then predict the held-out split.
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        # zero_division=0 keeps the score at 0 when a metric is undefined (e.g. the
        # model never predicts the positive class) without emitting a warning.
        results.append({
            'classifier': classifier_name,
            'accuracy': accuracy_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred, zero_division=0),
            'precision': precision_score(y_test, y_pred, zero_division=0),
            'recall': recall_score(y_test, y_pred, zero_division=0)
        })
    return pd.DataFrame(results)

In [None]:
def Vectorize(vectorizer, X, y):
    """Fit ``vectorizer`` on the documents ``X`` and return ``(features, y)``.

    ``vectorizer`` must provide ``fit_transform``. ``y`` is returned unchanged so
    that features and labels can be unpacked together.
    """
    features = vectorizer.fit_transform(X)
    return features, y

## Run Model

In [None]:
# Vectorize the text using TF-IDF
# Split first and fit the vectorizer on the training documents only, so the vocabulary
# and IDF weights never see the test set. The seed and stratification make the split
# reproducible and keep the class ratio equal in both parts.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.7)
train_docs, test_docs, y_train, y_test = train_test_split(
    text['text'], text['win_side'],
    test_size=0.2, stratify=text['win_side'], random_state=42,
)
X_train, y_train = Vectorize(vectorizer, train_docs, y_train)
X_test = vectorizer.transform(test_docs)
Classifier(X_train, X_test, y_train, y_test)

Unnamed: 0,classifier,accuracy,f1,precision,recall
0,Logistic Regression,0.704762,0.822857,0.727273,0.947368
1,Naive Bayes,0.72381,0.839779,0.72381,1.0
2,Linear SVC,0.628571,0.745098,0.74026,0.75
3,Random Forest,0.714286,0.831461,0.72549,0.973684
4,Perceptron,0.628571,0.731034,0.768116,0.697368


In [None]:
# Vectorize the text using CountVectorizer
# Split first and fit the vectorizer on the training documents only, so the vocabulary
# never sees the test set. The seed and stratification make the split reproducible and
# keep the class ratio equal in both parts.
vectorizer = CountVectorizer(min_df=5, max_df=0.8)
train_docs, test_docs, y_train, y_test = train_test_split(
    text['text'], text['win_side'],
    test_size=0.2, stratify=text['win_side'], random_state=42,
)
X_train, y_train = Vectorize(vectorizer, train_docs, y_train)
X_test = vectorizer.transform(test_docs)
Classifier(X_train, X_test, y_train, y_test)

Unnamed: 0,classifier,accuracy,f1,precision,recall
0,Logistic Regression,0.580952,0.671642,0.633803,0.714286
1,Naive Bayes,0.571429,0.634146,0.65,0.619048
2,Linear SVC,0.590476,0.661417,0.65625,0.666667
3,Random Forest,0.590476,0.742515,0.596154,0.984127
4,Perceptron,0.571429,0.651163,0.636364,0.666667


In [None]:
# USING IMBLEARN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Split first and fit the vectorizer on the training documents only, so the vocabulary
# and IDF weights never see the test set. The seed and stratification make the split
# reproducible and keep the class ratio equal in both parts.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8)
train_docs, test_docs, y_train, y_test = train_test_split(
    text['text'], text['win_side'],
    test_size=0.2, stratify=text['win_side'], random_state=42,
)
X_train, y_train = Vectorize(vectorizer, train_docs, y_train)
X_test = vectorizer.transform(test_docs)

# Resample the training data only; the test split is left as it is.
samplers = {
    '--OVERSAMPLING--': RandomOverSampler(random_state=0),
    '--UNDERSAMPLING--': RandomUnderSampler(random_state=0),
}
for banner, sampler in samplers.items():
    print(banner)
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    print(Classifier(X_train_resampled, X_test, y_train_resampled, y_test))


--OVERSAMPLING--
            classifier  accuracy        f1  precision    recall
0  Logistic Regression  0.580952  0.661538   0.641791  0.682540
1          Naive Bayes  0.542857  0.606557   0.627119  0.587302
2           Linear SVC  0.533333  0.637037   0.597222  0.682540
3        Random Forest  0.580952  0.721519   0.600000  0.904762
4           Perceptron  0.580952  0.666667   0.637681  0.698413
--UNDERSAMPLING--
            classifier  accuracy        f1  precision    recall
0  Logistic Regression  0.609524  0.649573   0.703704  0.603175
1          Naive Bayes  0.561905  0.616667   0.649123  0.587302
2           Linear SVC  0.619048  0.636364   0.744681  0.555556
3        Random Forest  0.514286  0.495050   0.657895  0.396825
4           Perceptron  0.600000  0.625000   0.714286  0.555556


Over- and undersampling do not help much :-(

NEXT!
Try using a pretrained model from the `transformers` library.

https://huggingface.co/models?pipeline_tag=text-classification&sort=downloads

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments


# Load your data
data = pd.read_csv('text_clean.csv')

# Split your data into training and testing sets.
# The split is seeded so that the reported metrics can be reproduced.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'], data['win_side'],
    test_size=0.2, stratify=data['win_side'], random_state=42,
)

# Initialize the DistilBERT tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


# Tokenize the text data
# NOTE(review): truncation=True cuts every document to the tokenizer's maximum length,
# so presumably only the beginning of each transcript reaches the model - confirm this
# is intended.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

# Create PyTorch Class from dataset
class SCOTUSDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-sample sequence of values;
    ``labels`` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are cast to int64 after the tensor is built.
        sample['labels'] = torch.tensor(self.labels[idx]).long()
        return sample

train_dataset = SCOTUSDataset(train_encodings, train_labels.tolist())
test_dataset = SCOTUSDataset(test_encodings, test_labels.tolist())

# Initialize the DistilBERT model for sequence classification (two classes: win_side 0 / 1)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2, #changed to 2 because the GPU
    per_device_eval_batch_size=4,
    # The run is only ~624 optimizer steps long. A fixed 500-step warm-up kept the
    # learning rate rising for most of training; a ratio scales with the run length.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    # Checkpoints must be written at the evaluation steps for load_best_model_at_end
    # to have something to restore. With save_steps=1000 (more than the total number
    # of steps) no checkpoint was ever saved.
    save_steps=100,
    save_total_limit=2,  # keep disk usage bounded
    load_best_model_at_end=True,
)

# Create the Trainer
# NOTE(review): eval_dataset is the test split, so the "best" checkpoint is selected
# on the same data the final metrics are reported on - consider a separate validation
# split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model
predictions = trainer.predict(test_dataset)
true_labels = predictions.label_ids  # ground-truth labels as stored in test_dataset
y_pred = (predictions.predictions.argmax(-1)).tolist()

# Calculate performance metrics
# zero_division=0 keeps a score at 0, without a warning, when it is undefined.
accuracy = accuracy_score(true_labels, y_pred)
precision = precision_score(true_labels, y_pred, zero_division=0)
recall = recall_score(true_labels, y_pred, zero_division=0)
f1 = f1_score(true_labels, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

  0%|          | 0/624 [00:00<?, ?it/s]

{'loss': 0.6994, 'learning_rate': 1e-05, 'epoch': 0.48}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6757726669311523, 'eval_runtime': 2.6986, 'eval_samples_per_second': 38.909, 'eval_steps_per_second': 10.005, 'epoch': 0.48}
{'loss': 0.6696, 'learning_rate': 2e-05, 'epoch': 0.96}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6764814853668213, 'eval_runtime': 2.7597, 'eval_samples_per_second': 38.048, 'eval_steps_per_second': 9.784, 'epoch': 0.96}
{'loss': 0.7036, 'learning_rate': 3e-05, 'epoch': 1.44}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6685644388198853, 'eval_runtime': 2.7916, 'eval_samples_per_second': 37.613, 'eval_steps_per_second': 9.672, 'epoch': 1.44}
{'loss': 0.7099, 'learning_rate': 4e-05, 'epoch': 1.92}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7087978720664978, 'eval_runtime': 2.7839, 'eval_samples_per_second': 37.716, 'eval_steps_per_second': 9.698, 'epoch': 1.92}
{'loss': 0.6922, 'learning_rate': 5e-05, 'epoch': 2.4}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.7302305698394775, 'eval_runtime': 2.7669, 'eval_samples_per_second': 37.949, 'eval_steps_per_second': 9.758, 'epoch': 2.4}
{'loss': 0.6753, 'learning_rate': 9.67741935483871e-06, 'epoch': 2.88}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.6546196341514587, 'eval_runtime': 2.7631, 'eval_samples_per_second': 38.001, 'eval_steps_per_second': 9.772, 'epoch': 2.88}
{'train_runtime': 136.1508, 'train_samples_per_second': 9.166, 'train_steps_per_second': 4.583, 'train_loss': 0.6893020134705764, 'epoch': 3.0}


  0%|          | 0/27 [00:00<?, ?it/s]

Accuracy: 0.6381
Precision: 0.6381
Recall: 1.0000
F1-Score: 0.7791


Still below the baseline. The data preparation should be redone, or another method should be tried.