1. Import Libraries and Load Data

In [1]:
# Import essential data manipulation and file handling libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load your data
df = pd.read_csv("cases_cleaned.csv")

In [3]:
def map_judge_position(pos):
    pos = str(pos).lower()

    # Sessions / Criminal Judges
    if any(x in pos for x in ['district and sessions', 'sessions court', 'criminal cases']):
        return 'sessions_court'

    # Civil Judges
    elif any(x in pos for x in ['civil court', 'civil judge', 'district munsiff', 'junior division', 'senior division', 'principal civil judge']):
        return 'district_civil_judge'

    # Sub Courts / Taluka Courts / Outlying Courts
    elif any(x in pos for x in ['taluka court', 'sub court', 'outlying court']):
        return 'sub_court'

    # Magistrates
    elif any(x in pos for x in ['judicial magistrate', 'jmfc', 'chief judicial magistrate', 'metropolitan magistrate', 'magistrate']):
        return 'magistrate'

    # Family Courts
    elif 'family court' in pos:
        return 'family_court'

    # Motor Accident Claims Tribunal
    elif any(x in pos for x in ['motor accident', 'mact', 'm.a.c.t']):
        return 'motor_accident_court'

    # Industrial / Labour Courts
    elif any(x in pos for x in ['industrial court', 'labour court']):
        return 'industrial_labour_court'

    # Special Courts
    elif any(x in pos for x in ['special court', 'special magistrate', 'nia', 'pocso', 'prevention of corruption', 'economic offences']):
        return 'special_court'

    # Juvenile Courts
    elif 'juvenile court' in pos or 'juvenile board' in pos:
        return 'juvenile_court'

    # CBI / Investigation Courts
    elif 'cbi' in pos:
        return 'cbi_court'

    # Tribunals
    elif 'tribunal' in pos:
        return 'tribunal'

    # Vacation / temporary / administrative
    elif 'vacation' in pos:
        return 'vacation'

    # Cooperative Courts
    elif 'co-operative court' in pos or 'cooperative court' in pos:
        return 'cooperative_court'

    # Railway Courts
    elif 'railway' in pos:
        return 'railway_court'

    # Fast Track Courts
    elif 'fast track' in pos:
        return 'fast_track_court'

    else:
        return 'other'

# Apply mapping
df['judge_category'] = df['judge_position'].apply(map_judge_position)

# Check distribution
print(df['judge_category'].value_counts())


judge_category
magistrate                 13824308
sessions_court             10398285
district_civil_judge        7626372
other                       2251642
family_court                 982729
sub_court                    884798
motor_accident_court         178949
industrial_labour_court      127108
railway_court                121405
special_court                 22595
fast_track_court              17498
juvenile_court                13174
tribunal                      12797
cooperative_court             12668
vacation                       4909
cbi_court                      1496
Name: count, dtype: int64


In [4]:
# Mapping of judge_position to judge_category_clean
position_to_category = {
    'additional district judge': 'district_civil_judge',
    '2-additional district judge': 'district_civil_judge',
    '3-additional district judge': 'district_civil_judge',
    '4-additional district judge': 'district_civil_judge',
    '5-additional district judge': 'district_civil_judge',
    '6-additional district judge': 'district_civil_judge',
    '7-additional district judge': 'district_civil_judge',
    'additional court': 'district_civil_judge',
    'principal district judge': 'district_civil_judge',
    'district and sessional judge div.': 'district_civil_judge',
    'munsiff first class court': 'sub_court',
    'district munsiff first class': 'sub_court',
    'munsiff': 'sub_court',
    'sub court': 'sub_court',
    'taluka court': 'sub_court',
    'small cause court': 'sub_court',
    'junior division': 'district_civil_judge',
    'city civil and sessions court': 'district_civil_judge',
    'fast track court': 'fast_track_court',
    'motor accidents claims tribunal': 'motor_accident_court',
    'industrial court': 'industrial_labour_court',
    'labour court': 'industrial_labour_court',
    'co-operative court aurangabad': 'cooperative_court',
    'cooperative court': 'cooperative_court',
    'co-operative appellate court aurangabad': 'cooperative_court',
    'co-operative court nashik.': 'cooperative_court',
    'co-operative court, solapur': 'cooperative_court',
    'co-operative court nanded': 'cooperative_court',
    'co-operative court amravati': 'cooperative_court',
    'co-operative court raigad-ratnagiri at alibaug': 'cooperative_court',
    'co-operative court raigad-ratnagiri at ratnagiri': 'cooperative_court',
    'maharashtra state co-operative appellate court': 'cooperative_court',
    'co-operative appellate court, pune': 'cooperative_court',
    'co-operative trial court, pune': 'cooperative_court',
    'co-operative court, ahmednagar': 'cooperative_court',
    'co-operative court thane': 'cooperative_court',
    'co-operative court, shrirampur': 'cooperative_court',
    'the district co-operative court, satara': 'cooperative_court',
    'family court': 'family_court',
    'school tribunal': 'tribunal',
    'additional school tribunal': 'tribunal',
    'special magistrate court': 'special_court',
    'cbi court': 'cbi_court',
    'vacation': 'vacation',
    'pocso': 'special_court',
    'judicial commissioner ranchi': 'other',
    'judicial courts complex nahan, sirmaur, h.p.': 'other',
    'bagaha': 'district_civil_judge',
    'ahmedabad_metro court': 'district_civil_judge',
    'mmtc ii': 'special_court',
    'mmtc iii': 'special_court',
    'mmtc iv': 'special_court',
    'mmtc vi': 'special_court',
    'psmedkgp': 'special_court',
    'bwndcis': 'special_court'
}
df['judge_category'] = df.apply(
    lambda row: position_to_category.get(row['judge_position'], row['judge_category']),
    axis=1
)

In [5]:
# Check distribution
print(df['judge_category'].value_counts())

judge_category
magistrate                 13818466
sessions_court             10169813
district_civil_judge        8662055
sub_court                   1341808
family_court                 982729
other                        795397
special_court                217594
motor_accident_court         178949
industrial_labour_court      127108
railway_court                121405
fast_track_court              17498
cooperative_court             15535
juvenile_court                13174
tribunal                      12797
vacation                       4909
cbi_court                      1496
Name: count, dtype: int64


In [8]:
df['date_of_filing'] = pd.to_datetime(df['date_of_filing'], errors='coerce', format='mixed')
df['date_first_list'] = pd.to_datetime(df['date_first_list'], errors='coerce', format='mixed')

# From filing date
df['filing_year'] = df['date_of_filing'].dt.year
df['filing_month'] = df['date_of_filing'].dt.month
df['filing_quarter'] = df['date_of_filing'].dt.quarter
df['filing_dayofweek'] = df['date_of_filing'].dt.dayofweek  # Monday=0, Sunday=6
df['filing_dayofyear'] = df['date_of_filing'].dt.dayofyear
df['filing_weekofyear'] = df['date_of_filing'].dt.isocalendar().week

# From first listing date
df['first_list_delay'] = (df['date_first_list'] - df['date_of_filing']).dt.days
df['first_list_month'] = df['date_first_list'].dt.month
df['first_list_quarter'] = df['date_first_list'].dt.quarter

# Judicial calendar features (avoid weekends/holidays)
df['filed_on_weekend'] = df['filing_dayofweek'].isin([5, 6]).astype(int)
df['first_list_on_weekend'] = df['date_first_list'].dt.dayofweek.isin([5, 6]).astype(int)

In [10]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36480733 entries, 0 to 36480732
Data columns (total 27 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   act                    float64       
 1   section                float64       
 2   bailable_ipc           object        
 3   number_sections_ipc    float64       
 4   criminal               int64         
 5   judge_position         object        
 6   female_defendant       int64         
 7   female_petitioner      int64         
 8   type_name              float64       
 9   date_of_filing         datetime64[ns]
 10  date_first_list        datetime64[ns]
 11  case_duration_days     float64       
 12  event                  int64         
 13  court_details          object        
 14  state_district         object        
 15  judge_category         object        
 16  filing_year            int32         
 17  filing_month           int32         
 18  filing_quarter      

In [11]:
columns_to_drop = ['judge_position']

# Create a copy of the dataframe and start preparing the same for prediction modelling
df_cleaned = df.drop(columns=columns_to_drop, axis=1)

13. Save Cleaned Dataset

In [12]:
# Export cleaned dataset to CSV for further analysis
df_cleaned.to_csv("cases_engineered.csv", index=False)
print("Cleaned dataset saved to 'cases_engineered.csv'")

Cleaned dataset saved to 'cases_engineered.csv'
