In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

data_dir = Path("/content/drive/MyDrive/ADA_Judiciary")

In [3]:
cases_orig = pd.read_csv(data_dir / "cases" / "cases_2018.csv")

In [4]:
print('Number of records in cases file for 2018: ' + str(len(cases_orig)))
cases_orig.head()

Number of records in cases file for 2018: 13724299


Unnamed: 0,ddl_case_id,year,state_code,dist_code,court_no,cino,judge_position,female_defendant,female_petitioner,female_adv_def,female_adv_pet,type_name,purpose_name,disp_name,date_of_filing,date_of_decision,date_first_list,date_last_list,date_next_list
0,01-01-01-201900000012018,2018,1,1,1,MHNB030000022018,chief judicial magistrate,0 male,0 male,-9999,0,1943,2975.0,33,2018-01-01,2018-02-07,2018-01-01,2018-02-07,2018-02-07
1,01-01-01-201900000022018,2018,1,1,1,MHNB030000032018,chief judicial magistrate,0 male,0 male,-9999,0,1943,3315.0,52,2018-01-01,2018-02-01,2018-01-01,2018-02-01,2018-02-01
2,01-01-01-201900000032018,2018,1,1,1,MHNB030000042018,chief judicial magistrate,0 male,0 male,-9999,0,1943,5877.0,52,2018-01-01,2018-02-01,2018-01-01,2018-02-01,2018-02-01
3,01-01-01-201900000042018,2018,1,1,1,MHNB030000052018,chief judicial magistrate,0 male,0 male,-9999,0,1943,840.0,52,2018-01-01,2018-02-01,2018-01-01,2018-02-01,2018-02-01
4,01-01-01-201900000052018,2018,1,1,1,MHNB030000062018,chief judicial magistrate,-9998 unclear,0 male,-9999,1,1943,840.0,5,2018-01-01,2018-01-09,2018-01-01,2018-01-09,2018-01-09


In [5]:
cases_orig.dtypes

Unnamed: 0,0
ddl_case_id,object
year,int64
state_code,int64
dist_code,int64
court_no,int64
cino,object
judge_position,object
female_defendant,object
female_petitioner,object
female_adv_def,int64


In [6]:
missing_values = cases_orig.isna().sum()
total_rows = cases_orig.shape[0]
# print column names and number of missing values
for col, num_missing in missing_values.items():
    percent_missing = round(num_missing/total_rows*100)
    print(f"Column '{col}' has {num_missing} missing values which is {percent_missing}% of total")

Column 'ddl_case_id' has 0 missing values which is 0% of total
Column 'year' has 0 missing values which is 0% of total
Column 'state_code' has 0 missing values which is 0% of total
Column 'dist_code' has 0 missing values which is 0% of total
Column 'court_no' has 0 missing values which is 0% of total
Column 'cino' has 0 missing values which is 0% of total
Column 'judge_position' has 0 missing values which is 0% of total
Column 'female_defendant' has 0 missing values which is 0% of total
Column 'female_petitioner' has 0 missing values which is 0% of total
Column 'female_adv_def' has 0 missing values which is 0% of total
Column 'female_adv_pet' has 0 missing values which is 0% of total
Column 'type_name' has 0 missing values which is 0% of total
Column 'purpose_name' has 534098 missing values which is 4% of total
Column 'disp_name' has 0 missing values which is 0% of total
Column 'date_of_filing' has 0 missing values which is 0% of total
Column 'date_of_decision' has 6327528 missing valu

We will drop following features as they are just identifiers

*   ddl_case_id
*   year
*   cino

We will drop following features as for year 2018 the majority of data  is missing
*   purpose_name

We will drop following features because it is determined after case completion, so using it would cause data leakage.

*   disp_name

In [7]:
date_cols = ['date_of_filing', 'date_first_list', 'date_of_decision', 'date_last_list', 'date_next_list']

# === Chronological Consistency Checks ===
print("Chronological Logic Violations (Before Fix):")
checks_before = {
    "first_list_before_filing": (cases_orig['date_first_list'] < cases_orig['date_of_filing']).sum(),
    "decision_before_filing": (cases_orig['date_of_decision'] < cases_orig['date_of_filing']).sum(),
    "first_list_after_decision": (cases_orig['date_first_list'] > cases_orig['date_of_decision']).sum(),
}
for check, count in checks_before.items():
    print(f"{check}: {count:,} cases")

# Fix invalid first_list-before-filing cases
cases_orig.loc[cases_orig['date_first_list'] < cases_orig['date_of_filing'], 'date_first_list'] = pd.NaT

# Fix invalid decision-before-filing cases
cases_orig.loc[cases_orig['date_of_decision'] < cases_orig['date_of_filing'], 'date_of_decision'] = pd.NaT

# Fix first_list-after-decision cases
cases_orig.loc[cases_orig['date_first_list'] > cases_orig['date_of_decision'], 'date_first_list'] = pd.NaT

print("\nChronological Logic Violations (After Fix):")
checks_after = {
    "first_list_before_filing": (cases_orig['date_first_list'] < cases_orig['date_of_filing']).sum(),
    "decision_before_filing": (cases_orig['date_of_decision'] < cases_orig['date_of_filing']).sum(),
    "first_list_after_decision": (cases_orig['date_first_list'] > cases_orig['date_of_decision']).sum(),
}

for check, count in checks_after.items():
    print(f"{check}: {count:,} cases")

Chronological Logic Violations (Before Fix):
first_list_before_filing: 69,093 cases
decision_before_filing: 93,473 cases
first_list_after_decision: 95,107 cases

Chronological Logic Violations (After Fix):
first_list_before_filing: 0 cases
decision_before_filing: 0 cases
first_list_after_decision: 0 cases


In [8]:
missing_values = cases_orig.isna().sum()
total_rows = cases_orig.shape[0]
# print column names and number of missing values
for col, num_missing in missing_values.items():
    percent_missing = round(num_missing/total_rows*100)
    print(f"Column '{col}' has {num_missing} missing values which is {percent_missing}% of total")

Column 'ddl_case_id' has 0 missing values which is 0% of total
Column 'year' has 0 missing values which is 0% of total
Column 'state_code' has 0 missing values which is 0% of total
Column 'dist_code' has 0 missing values which is 0% of total
Column 'court_no' has 0 missing values which is 0% of total
Column 'cino' has 0 missing values which is 0% of total
Column 'judge_position' has 0 missing values which is 0% of total
Column 'female_defendant' has 0 missing values which is 0% of total
Column 'female_petitioner' has 0 missing values which is 0% of total
Column 'female_adv_def' has 0 missing values which is 0% of total
Column 'female_adv_pet' has 0 missing values which is 0% of total
Column 'type_name' has 0 missing values which is 0% of total
Column 'purpose_name' has 534098 missing values which is 4% of total
Column 'disp_name' has 0 missing values which is 0% of total
Column 'date_of_filing' has 0 missing values which is 0% of total
Column 'date_of_decision' has 6421001 missing valu

In [9]:
cases_orig['date_of_filing'] = pd.to_datetime(cases_orig['date_of_filing'], errors='coerce', format='mixed')
cases_orig['date_first_list'] = pd.to_datetime(cases_orig['date_first_list'], errors='coerce', format='mixed')
cases_orig['date_of_decision'] = pd.to_datetime(cases_orig['date_of_decision'], errors='coerce', format='mixed')

In [10]:
for col in ['date_of_filing', 'date_first_list']:
    invalid_mask = (cases_orig[col].dt.year < 1900) | (cases_orig[col].dt.year > 2100)
    invalid_dates = cases_orig.loc[invalid_mask, col]

    print(f"\n Invalid entries in '{col}': {len(invalid_dates)} found")
    print(invalid_dates.unique()[:20])  # show first 20 unique invalid dates


 Invalid entries in 'date_of_filing': 0 found
<DatetimeArray>
[]
Length: 0, dtype: datetime64[ns]

 Invalid entries in 'date_first_list': 145 found
<DatetimeArray>
['2108-04-09 00:00:00', '2201-04-07 00:00:00', '2108-03-08 00:00:00',
 '2108-12-26 00:00:00', '2108-12-03 00:00:00', '2109-01-21 00:00:00',
 '2201-06-24 00:00:00', '2108-12-14 00:00:00', '2109-05-04 00:00:00',
 '2201-05-08 00:00:00', '2108-12-06 00:00:00', '2118-04-06 00:00:00',
 '2118-09-07 00:00:00', '2108-06-25 00:00:00', '2108-05-23 00:00:00',
 '2108-02-09 00:00:00', '2108-02-14 00:00:00', '2108-02-27 00:00:00',
 '2108-03-17 00:00:00', '2108-05-25 00:00:00']
Length: 20, dtype: datetime64[ns]


In [11]:
# Remove unrealistic future or ancient dates
for col in ['date_of_filing', 'date_first_list']:
    cases_orig.loc[
        (cases_orig[col].dt.year < 1900) | (cases_orig[col].dt.year > 2100),
        col
    ] = pd.NaT

for col in ['date_of_filing', 'date_first_list']:
    invalid_mask = (cases_orig[col].dt.year < 1900) | (cases_orig[col].dt.year > 2100)
    print(f"{col}: {invalid_mask.sum()} invalid dates")

date_of_filing: 0 invalid dates
date_first_list: 0 invalid dates


In [12]:
cases_orig['case_duration_days'] = (cases_orig['date_of_decision'] - cases_orig['date_of_filing']).dt.days

We will drop following features as they contain future information that would leak the target:

*   date_of_decision
*   date_last_list
*   date_next_list

while keeping case_duration_days only as the target variable.

In [13]:
gender_map_str = {'0 male': 0, '1 female': 1}

print("Gender Values (Before Cleaning):")
for col in ['female_defendant', 'female_petitioner', 'female_adv_def', 'female_adv_pet']:
    print(col, cases_orig[col].unique())

# Clean string columns
for col in ['female_defendant', 'female_petitioner']:
    # Convert to string first in case there are mixed types
    cases_orig[col] = cases_orig[col].astype(str)

    # Replace common missing codes/strings with np.nan
    cases_orig[col] = cases_orig[col].replace({
        '-9998 unclear': np.nan,
        '-9999 missing name': np.nan,
        '': np.nan  # in case there are empty strings
    })

    cases_orig[col] = cases_orig[col].map(gender_map_str)

# Clean numeric columns
for col in ['female_adv_def', 'female_adv_pet']:
    cases_orig[col] = cases_orig[col].replace({-9998: np.nan, -9999: np.nan})

print("\nGender Values (After Cleaning):")
# Optional: check the unique values to confirm
for col in ['female_defendant', 'female_petitioner', 'female_adv_def', 'female_adv_pet']:
    print(col, cases_orig[col].unique())

Gender Values (Before Cleaning):
female_defendant ['0 male' '-9998 unclear' '1 female' '-9999 missing name']
female_petitioner ['0 male' '1 female' '-9998 unclear' '-9999 missing name']
female_adv_def [-9999     0 -9998     1]
female_adv_pet [    0     1 -9999 -9998]

Gender Values (After Cleaning):
female_defendant [ 0. nan  1.]
female_petitioner [ 0.  1. nan]
female_adv_def [nan  0.  1.]
female_adv_pet [ 0.  1. nan]


In [14]:
for col in ['female_defendant', 'female_petitioner', 'female_adv_def', 'female_adv_pet']:
    print(col)
    print(cases_orig.groupby(col)['case_duration_days'].mean(), '\n')

female_defendant
female_defendant
0.0     86.981433
1.0    107.137186
Name: case_duration_days, dtype: float64 

female_petitioner
female_petitioner
0.0     83.409304
1.0    117.815961
Name: case_duration_days, dtype: float64 

female_adv_def
female_adv_def
0.0    71.195865
1.0    94.909357
Name: case_duration_days, dtype: float64 

female_adv_pet
female_adv_pet
0.0    91.681182
1.0    72.135478
Name: case_duration_days, dtype: float64 



We will drop following features because they show weak and inconsistent predictive signal and contain many missing/unclear values.

*   female_adv_def
*   female_adv_pet

In [15]:
cases_orig['female_defendant'] = cases_orig['female_defendant'].fillna(-1).astype(int)
cases_orig['female_petitioner'] = cases_orig['female_petitioner'].fillna(-1).astype(int)
for col in ['female_defendant', 'female_petitioner']:
    print(col, cases_orig[col].unique())

female_defendant [ 0 -1  1]
female_petitioner [ 0  1 -1]


In [16]:
cases_orig["court_details"] = cases_orig["state_code"].astype(str) + "-" + cases_orig["dist_code"].astype(str) + "-" + cases_orig["court_no"].astype(str)
cases_orig["state_district"] = cases_orig["state_code"].astype(str) + "-" + cases_orig["dist_code"].astype(str)

In [17]:
columns_to_drop = ['year', 'cino', 'purpose_name', 'disp_name', 'female_adv_def', 'female_adv_pet', 'state_code', 'dist_code',
                  'court_no', 'date_of_decision', 'date_last_list', 'date_next_list']

# Create a copy of the dataframe and start preparing the same for prediction modelling
df_reduced = cases_orig.drop(columns=columns_to_drop, axis=1)


In [18]:
df_reduced.head()

Unnamed: 0,ddl_case_id,judge_position,female_defendant,female_petitioner,type_name,date_of_filing,date_first_list,case_duration_days,court_details,state_district
0,01-01-01-201900000012018,chief judicial magistrate,0,0,1943,2018-01-01,2018-01-01,37.0,1-1-1,1-1
1,01-01-01-201900000022018,chief judicial magistrate,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1
2,01-01-01-201900000032018,chief judicial magistrate,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1
3,01-01-01-201900000042018,chief judicial magistrate,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1
4,01-01-01-201900000052018,chief judicial magistrate,-1,0,1943,2018-01-01,2018-01-01,8.0,1-1-1,1-1


In [19]:
df_reduced.dtypes

Unnamed: 0,0
ddl_case_id,object
judge_position,object
female_defendant,int64
female_petitioner,int64
type_name,int64
date_of_filing,datetime64[ns]
date_first_list,datetime64[ns]
case_duration_days,float64
court_details,object
state_district,object


In [20]:
df_reduced.to_csv(data_dir / "cleaned" / "cases_reduced.csv", index=False)