In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# project files stored on Drive can be accessed through the filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Root folder of the project data on the mounted Drive; the cells below read
# from and write to its "cleaned" subfolder.
data_dir = Path("/content/drive/MyDrive/ADA_Judiciary")

In [3]:
# Load the cleaned cases table (with judge position) into memory.
cases_csv_path = data_dir / "cleaned" / "cases_cleaned_judge_position.csv"
df = pd.read_csv(cases_csv_path)

In [4]:
# Report how many rows were loaded, then preview the first rows of the table.
print(f"Number of records in cleaned cases file for 2018: {len(df)}")
df.head()

Number of records in cleaned cases file for 2018: 13724299


Unnamed: 0,female_defendant,female_petitioner,type_name,date_of_filing,date_first_list,case_duration_days,court_details,state_district,judge_category
0,0,0,1943,2018-01-01,2018-01-01,37.0,1-1-1,1-1,magistrate
1,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate
2,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate
3,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate
4,-1,0,1943,2018-01-01,2018-01-01,8.0,1-1-1,1-1,magistrate


In [5]:
# Show the dtype pandas assigned to each column when reading the CSV.
df.dtypes

Unnamed: 0,0
female_defendant,int64
female_petitioner,int64
type_name,int64
date_of_filing,object
date_first_list,object
case_duration_days,float64
court_details,object
state_district,object
judge_category,object


In [6]:
# Per-column missing-value report: absolute count of NaNs and the share of all
# rows they represent, rounded to a whole percent.
total_rows = len(df)
missing_values = df.isna().sum()
for col, num_missing in missing_values.items():
    share_of_rows = num_missing / total_rows
    percent_missing = round(share_of_rows * 100)
    print(
        f"Column '{col}' has {num_missing} missing values "
        f"which is {percent_missing}% of total"
    )

Column 'female_defendant' has 0 missing values which is 0% of total
Column 'female_petitioner' has 0 missing values which is 0% of total
Column 'type_name' has 0 missing values which is 0% of total
Column 'date_of_filing' has 0 missing values which is 0% of total
Column 'date_first_list' has 747460 missing values which is 5% of total
Column 'case_duration_days' has 6421003 missing values which is 47% of total
Column 'court_details' has 0 missing values which is 0% of total
Column 'state_district' has 0 missing values which is 0% of total
Column 'judge_category' has 0 missing values which is 0% of total


In [7]:
# Treat missing values of case_duration_days: keep a 0/1 flag recording which
# rows were originally missing, then fill the gaps with the column median.
# NOTE(review): the missing-value report for this file shows ~47% of rows lack
# case_duration_days, so nearly half of the column ends up equal to the median.
# Presumably these are cases without a recorded decision yet — confirm that
# median imputation is appropriate for the downstream analysis.

# Step 1: Create missingness flag.
# Guarded so that re-running this cell after the fill below does not overwrite
# the flag with all zeros (by then the column no longer contains any NaN).
if 'case_duration_days_missing' not in df.columns:
    df['case_duration_days_missing'] = (
        df['case_duration_days'].isna().astype(int)
    )

# Step 2: Compute median of the observed values (NaN is skipped by default).
median_value = df['case_duration_days'].median()

# Step 3: Fill missing values with the median.
df['case_duration_days'] = df['case_duration_days'].fillna(median_value)

# Step 4: Verify that no missing values remain (expected output: 0).
print(df['case_duration_days'].isna().sum())

0


In [8]:
# Preview the first rows again; the frame now also carries the
# case_duration_days_missing flag column.
df.head()

Unnamed: 0,female_defendant,female_petitioner,type_name,date_of_filing,date_first_list,case_duration_days,court_details,state_district,judge_category,case_duration_days_missing
0,0,0,1943,2018-01-01,2018-01-01,37.0,1-1-1,1-1,magistrate,0
1,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate,0
2,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate,0
3,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate,0
4,-1,0,1943,2018-01-01,2018-01-01,8.0,1-1-1,1-1,magistrate,0


In [9]:
# Create a missingness flag for date_first_list (optional).
# Guarded so that re-running this cell after the fill below does not overwrite
# the flag with all zeros (by then the column no longer contains any NaN).
if 'date_first_list_missing' not in df.columns:
    df['date_first_list_missing'] = df['date_first_list'].isna().astype(int)

# Fill missing date_first_list with the date_of_filing value of the same row
# (fillna with a Series aligns on the index).
df['date_first_list'] = df['date_first_list'].fillna(df['date_of_filing'])

# Verify
print(df['date_first_list'].isna().sum())  # Should be 0

0


In [10]:
# Columns to re-check for nulls after the imputation steps.
# NOTE(review): 'date_first_list_missing' is not part of this list although
# 'case_duration_days_missing' is — confirm whether that omission is intended.
cols_to_check = [
    'female_defendant',
    'female_petitioner',
    'type_name',
    'date_of_filing',
    'date_first_list',
    'case_duration_days',
    'court_details',
    'state_district',
    'judge_category',
    'case_duration_days_missing',
]

# Count nulls per selected column, largest count first.
missing_values = (
    df[cols_to_check]
    .isnull()
    .sum()
    .sort_values(ascending=False)
)

# Print the per-column counts.
print(missing_values)

female_defendant              0
female_petitioner             0
type_name                     0
date_of_filing                0
date_first_list               0
case_duration_days            0
court_details                 0
state_district                0
judge_category                0
case_duration_days_missing    0
dtype: int64


In [11]:
# Preview the first rows once more; both missingness flag columns are now
# present alongside the filled values.
df.head()

Unnamed: 0,female_defendant,female_petitioner,type_name,date_of_filing,date_first_list,case_duration_days,court_details,state_district,judge_category,case_duration_days_missing,date_first_list_missing
0,0,0,1943,2018-01-01,2018-01-01,37.0,1-1-1,1-1,magistrate,0,0
1,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate,0,0
2,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate,0,0
3,0,0,1943,2018-01-01,2018-01-01,31.0,1-1-1,1-1,magistrate,0,0
4,-1,0,1943,2018-01-01,2018-01-01,8.0,1-1-1,1-1,magistrate,0,0


In [12]:
# Re-inspect column dtypes after the imputation steps and before saving.
df.dtypes

Unnamed: 0,0
female_defendant,int64
female_petitioner,int64
type_name,int64
date_of_filing,object
date_first_list,object
case_duration_days,float64
court_details,object
state_district,object
judge_category,object
case_duration_days_missing,int64


In [13]:
# Persist the imputed table next to the input file, without writing the
# DataFrame index as a column.
cleaned_csv_path = data_dir / "cleaned" / "cases_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)