# Combine the datasets

This notebook outlines the steps taken to combine the datasets produced in the previous steps into a single final dataset. It also splits that dataset into train and test sets for downstream tasks.

# Import libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Load the data

In [2]:
# Locations of the three intermediate CSV files this notebook combines.
opinions_csv = "outputs/1b.opinions_cleaned.csv"
statements_csv = "outputs/2a.queries_generated.csv"
questions_csv = "outputs/2b.questions_generated.csv"

# Read each file into its own DataFrame, in the same order as before.
df = pd.read_csv(opinions_csv)
stmt = pd.read_csv(statements_csv)
qstn = pd.read_csv(questions_csv)

In [3]:
# Inspect the column labels of the cleaned opinions table.
df.columns

Index(['opinion_id', 'opinion', 'opinion_source', 'opinion_word_count',
       'opinion_date_created', 'opinion_type', 'opinion_extracted_by_ocr',
       'opinion_per_curiam', 'cluster_id', 'cluster_judges',
       'cluster_nature_of_suit', 'cluster_source', 'cluster_blocked',
       'cluster_precedential_status', 'cluster_citation_count',
       'cluster_case_name', 'cluster_case_name_short',
       'cluster_case_name_full', 'cluster_summary', 'cluster_history',
       'cluster_headmatter', 'cluster_headnotes', 'cluster_posture',
       'cluster_arguments', 'cluster_cross_reference', 'cluster_disposition',
       'cluster_syllabus', 'docket_id', 'docket_number', 'docket_view_count',
       'court_id', 'court_jurisdiction', 'court_in_use', 'court_short_name',
       'court_full_name'],
      dtype='object')

In [4]:
# Inspect the column labels of the generated statement-query table.
stmt.columns

Index(['opinion_id', 'opinion', 'opinion_4omini_tokens', 'input_opinion',
       'relevant_query_stmt', 'irrelevant_query_stmt'],
      dtype='object')

In [5]:
# Inspect the column labels of the generated question-query table.
qstn.columns

Index(['opinion_id', 'opinion', 'opinion_4omini_tokens', 'input_opinion',
       'relevant_query_qstn', 'irrelevant_query_qstn'],
      dtype='object')

# Combine the datasets

In [6]:
# Attach the generated statement- and question-style queries to each opinion.
# Only the columns that are new relative to `df` are pulled from `stmt` and
# `qstn`, so the shared `opinion` text is not duplicated with _x/_y suffixes.
stmt_cols = ["opinion_id", "opinion_4omini_tokens", 'relevant_query_stmt', 'irrelevant_query_stmt']
qstn_cols = ["opinion_id", 'relevant_query_qstn', 'irrelevant_query_qstn']

# how="left" keeps every row of `df`. validate="many_to_one" makes pandas raise
# a MergeError if `opinion_id` is repeated in `stmt` or `qstn`; without it such
# duplicates would silently multiply opinions in the combined dataset.
combined = (
    df
    .merge(stmt[stmt_cols], how="left", on="opinion_id", validate="many_to_one")
    .merge(qstn[qstn_cols], how="left", on="opinion_id", validate="many_to_one")
)

# Invariant: the joins only add columns, never rows.
assert len(combined) == len(df), "left joins must not change the number of opinions"
len(combined)

953

In [7]:
# Inspect the column labels of the merged frame.
combined.columns

Index(['opinion_id', 'opinion', 'opinion_source', 'opinion_word_count',
       'opinion_date_created', 'opinion_type', 'opinion_extracted_by_ocr',
       'opinion_per_curiam', 'cluster_id', 'cluster_judges',
       'cluster_nature_of_suit', 'cluster_source', 'cluster_blocked',
       'cluster_precedential_status', 'cluster_citation_count',
       'cluster_case_name', 'cluster_case_name_short',
       'cluster_case_name_full', 'cluster_summary', 'cluster_history',
       'cluster_headmatter', 'cluster_headnotes', 'cluster_posture',
       'cluster_arguments', 'cluster_cross_reference', 'cluster_disposition',
       'cluster_syllabus', 'docket_id', 'docket_number', 'docket_view_count',
       'court_id', 'court_jurisdiction', 'court_in_use', 'court_short_name',
       'court_full_name', 'opinion_4omini_tokens', 'relevant_query_stmt',
       'irrelevant_query_stmt', 'relevant_query_qstn',
       'irrelevant_query_qstn'],
      dtype='object')

In [8]:
# Reorder the columns: move `opinion_4omini_tokens` to sit right after `opinion_word_count`

In [9]:
# Move the token-count column from its post-merge position to position 4.
token_col = 'opinion_4omini_tokens'
columns = list(combined.columns)

# Take the column out of its current slot (ValueError if it is absent) ...
columns.pop(columns.index(token_col))
# ... and splice it back in at index 4.
columns[4:4] = [token_col]

combined = combined[columns]
combined.columns

Index(['opinion_id', 'opinion', 'opinion_source', 'opinion_word_count',
       'opinion_4omini_tokens', 'opinion_date_created', 'opinion_type',
       'opinion_extracted_by_ocr', 'opinion_per_curiam', 'cluster_id',
       'cluster_judges', 'cluster_nature_of_suit', 'cluster_source',
       'cluster_blocked', 'cluster_precedential_status',
       'cluster_citation_count', 'cluster_case_name',
       'cluster_case_name_short', 'cluster_case_name_full', 'cluster_summary',
       'cluster_history', 'cluster_headmatter', 'cluster_headnotes',
       'cluster_posture', 'cluster_arguments', 'cluster_cross_reference',
       'cluster_disposition', 'cluster_syllabus', 'docket_id', 'docket_number',
       'docket_view_count', 'court_id', 'court_jurisdiction', 'court_in_use',
       'court_short_name', 'court_full_name', 'relevant_query_stmt',
       'irrelevant_query_stmt', 'relevant_query_qstn',
       'irrelevant_query_qstn'],
      dtype='object')

# Save the combined & cleaned data

In [10]:
# Write the combined dataset to disk without the DataFrame index column.
dataset_csv = "outputs/3.dataset.csv"
combined.to_csv(dataset_csv, index=False)

# Split the combined data into train and test sets

In [11]:
# Hold out a fixed 450 rows as the test set; the remaining rows form the
# training set. The fixed random_state makes the shuffle reproducible.
split_options = {"test_size": 450, "random_state": 42}
train_df, test_df = train_test_split(combined, **split_options)

In [12]:
# Number of rows in the training split.
len(train_df)

503

In [13]:
# Number of rows in the test split.
len(test_df)

450

In [14]:
# Persist each split to its own CSV file, leaving out the DataFrame index.
split_outputs = (
    (train_df, "outputs/3.train.csv"),
    (test_df, "outputs/3.test.csv"),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)