In [75]:
import pandas as pd
from datasets import load_dataset
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Data import

In [81]:
# Load the 'darrow-ai/USClassActions' dataset; the result is indexed by split name below
dataset = load_dataset('darrow-ai/USClassActions')

In [82]:
# Convert the 'train' split to a pandas DataFrame for the cleaning steps that follow
df = dataset['train'].to_pandas()

In [83]:
# Preview the first five rows to check the columns that were loaded
df.head()

Unnamed: 0,id,target_text,verdict
0,99002,Knowing and/or Willful Violation of the Telep...,lose
1,138068,17. Defendants operate an electrical construc...,win
2,319755,(Fair Labor Standards Act Violations) (Viola...,win
3,155208,"37. As set forth below, the proposed Class sa...",win
4,390338,24. Named Plaintiffs and Class Members were em...,win


### Cleaning

In [84]:
# Discard the identifier column; only the text and the verdict are kept

df = df.drop(columns='id')

In [85]:
# Count missing values in each column

df.isna().sum()

target_text    4
verdict        0
dtype: int64

In [86]:
# Drop every row containing a null (per the count above: 4 rows with a missing target_text)

df = df.dropna()

### Word pre-processing

In [89]:
# Inspect the first document (positional index 0) to see what the raw text looks like

df['target_text'].iloc[0]

" Knowing and/or Willful Violation of the Telephone Consumer Protection Act (47 U.S.C.  227, et seq.) on behalf of Autodialer Class    Knowing and/or Willful Violation of the Telephone Consumer Protection Act (47 U.S.C.  227, et seq. and 47 C.F.R. §§ 64.1200(d)) on behalf of the NDNC Class    Statutory Violations of the Telephone Consumer Protection Act (47 U.S.C. 227, et seq.)   on behalf of the Autodialer Class.    Violation of the Telephone Consumer Protection Act (47 U.S.C. 227, et seq. and 47 C.F.R.  §§ 64.1200(d)) on behalf of the NDNC Class  Case: 2:18-cv-00377-GCS-EPD Doc #: 1 Filed: 04/24/18 Page: 14 of 18  PAGEID #: 14 - 15 -   27.  Plaintiff Blevins is the owner and sole proprietor of Interior Creations, a small  business that performs home remodeling and improvement projects.    28.  Plaintiff Blevins is the exclusive user of the telephone assigned the number  Case: 2:18-cv-00377-GCS-EPD Doc #: 1 Filed: 04/24/18 Page: 7 of 18  PAGEID #: 7 - 8 -  ending in 3503 and the accou

In [80]:
# Download the NLTK resources used below: the stop-word corpus and the 'punkt' tokenizer data
# NOTE(review): recent NLTK releases look up 'punkt_tab' for word_tokenize — confirm against the installed version

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryantung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ryantung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [90]:
# Define function to clean & tokenize case facts summaries

def clean_text(text):
    """Normalize a raw document and return its tokens without English stop-words.

    Args:
        text: Raw document string.

    Returns:
        list[str]: Lowercased tokens, in their original order, with every
        token found in NLTK's English stop-word list removed.
    """
    # Replace every non-word character with a space, then lowercase. Periods
    # become spaces too, so abbreviations such as "U.S.C." split into
    # single-letter tokens.
    text = re.sub(r'\W', ' ', text).lower()

    # Tokenize the normalized text
    tokens = word_tokenize(text)

    # Build the stop-word set once per call instead of calling
    # stopwords.words() for every token; membership tests then hit a set
    # rather than scanning a list.
    english_stopwords = set(stopwords.words('english'))

    return [word for word in tokens if word not in english_stopwords]

In [91]:
# Clean and tokenize every case-facts summary into a new column

df['cleaned_facts'] = df['target_text'].map(clean_text)

In [92]:
# Find additional stop-words: flatten the token lists and show the 25 most frequent tokens.
# Series.explode() takes no column argument; its only parameter is ignore_index.

df['cleaned_facts'].explode().value_counts().head(25)

cleaned_facts
plaintiff      63507
class          56863
defendant      45965
members        32151
c              24727
defendants     23222
website        17621
u              15606
action         14511
b              14444
services       12380
law            12307
plaintiffs     11954
1              11364
2              10027
herein          9308
behalf          9301
access          9292
hours           9144
new             9144
23              9007
information     8408
violation       8270
pay             8235
set             8188
Name: count, dtype: int64

In [93]:
# Declare corpus-specific stop-words, hand-picked from the most frequent tokens listed above
# (party/procedural terms plus stray single letters and digits)

new_stopwords = ['plaintiff', 'class', 'defendant', 'c', 'defendants', 'u',
                 'action', 'b', 'law', 'plaintiffs', '1', '2', 'herein', 'behalf']

In [94]:
# Define function to remove new stop-words

def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not in a stop-word collection.

    Args:
        text: Iterable of string tokens (one tokenized document).
        stop_words: Optional iterable of tokens to remove. When omitted, the
            module-level ``new_stopwords`` list is used, so existing
            single-argument calls behave as before.

    Returns:
        list: The remaining tokens, in their original order.
    """
    if stop_words is None:
        # Resolved at call time so later edits to new_stopwords are picked up
        stop_words = new_stopwords

    # A set gives constant-time membership tests instead of a list scan per token
    blocked = set(stop_words)

    return [word for word in text if word not in blocked]

In [95]:
# Strip the corpus-specific stop-words from each tokenized document

df['cleaned_facts'] = df['cleaned_facts'].map(remove_stopwords)

### Data export

In [102]:
# Write one JSON record per line (JSON Lines); the path is relative to the notebook's working directory
df.to_json('../data/cleaned_class_action.json', orient='records', lines=True)