In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

print("Started!")
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Try adding section and act names

Started!


In [2]:
# Columns holding the gender codes of the parties and their advocates.
gender_columns = ["female_defendant", "female_petitioner", "female_adv_def", "female_adv_pet"]

def gender_to_int(x):
    """Return the leading integer code of a gender value.

    The value is converted to a string and only its first
    whitespace-separated token is used, so both bare codes (``0``) and
    codes followed by extra text (``"1 <label>"``) are accepted.
    Float-formatted integral codes such as ``1.0`` or ``"0.0"`` are accepted
    too; pandas produces these whenever an integer column is promoted to
    float.

    Raises:
        ValueError: if the first token is not an integral number (this
            includes NaN).
        IndexError: if the value is an empty or all-whitespace string.
    """
    token = str(x).split()[0]
    try:
        return int(token)
    except ValueError:
        # int("1.0") fails, so retry through float and accept the value only
        # when it is a whole number.
        as_float = float(token)
        if not as_float.is_integer():
            raise ValueError(f"not an integer gender code: {x!r}")
        return int(as_float)

def get_year_cases(year):
    """Load one year's case records with integer gender codes.

    Reads ``csv/cases/cases/cases_<year>.csv`` restricted to the columns used
    in this notebook, then passes every column named in ``gender_columns``
    through ``gender_to_int``.
    """
    wanted_columns = [
        "ddl_case_id", "year", "state_code", "dist_code", "court_no",
        "female_defendant", "female_petitioner", "female_adv_def", "female_adv_pet",
        "type_name", "purpose_name", "disp_name",
        "date_of_filing", "date_of_decision", "date_first_list",
        "date_last_list", "date_next_list",
    ]

    # Add nrows=... to this call for a quick partial load while experimenting.
    year_cases = pd.read_csv(f"csv/cases/cases/cases_{year}.csv", usecols=wanted_columns)

    # The gender codes are not uniformly typed (strings were seen at least in
    # female_defendant), so every gender column goes through the converter.
    for gender_col in gender_columns:
        year_cases[gender_col] = year_cases[gender_col].map(gender_to_int)

    return year_cases

In [3]:
# Dispositions kept for the classification task, mapped to the winning side
# (0: petitioner; 1: defendant).
# might wanna add: , 'bail granted', 'bail refused', 'bail rejected'
good_disp = {"allowed": 0, "dismissed": 1, "convicted": 0, "plea bargaining": 0, "reject": 1, "plead guilty": 0, "withdrawn": 1, "acquitted": 1, "confession": 0, "settled": 1}

def check_defendant_won(x):
    """Return the winner code (0: petitioner, 1: defendant) for disposition ``x``.

    Raises:
        KeyError: if ``x`` is not one of the keys of ``good_disp``.
    """
    return good_disp[x]

def get_good_disp(cases):
    """Keep the cases whose disposition is in ``good_disp`` and label the winner.

    Returns a new DataFrame containing only the rows whose ``disp_name`` is a
    key of ``good_disp``, with an added ``defendant_won`` column. The input
    frame is left untouched.
    """
    # .copy() detaches the filtered rows from the input, so the column
    # assignment below is not a chained assignment on a slice.
    kept = cases.loc[cases.disp_name.isin(list(good_disp))].copy()
    # Every remaining disp_name is a key of good_disp, so the dict lookup is total.
    kept["defendant_won"] = kept.disp_name.map(good_disp)
    return kept

In [4]:
# Date columns of a case record (each listed once).
date_columns = ["date_of_filing", "date_of_decision", "date_first_list", "date_next_list", "date_last_list"]

def process_dates_and_filter(cases):
    """Parse the date columns, derive durations, filter, and split dates into parts.

    Steps:
      1. Parse every column in ``date_columns``; unparseable values become NaT.
      2. Drop rows in which any of those dates is missing.
      3. Add ``duration``, ``first_list_gap``, ``next_list_gap`` and
         ``last_list_gap`` as day counts (difference in days plus one).
      4. Keep rows with a positive ``duration`` whose decision date is not
         after 2022-12-31.
      5. Replace each date column by ``<col>_year``, ``<col>_month`` and
         ``<col>_date`` (day of month).

    Returns a new DataFrame; the input frame is not modified.
    """
    # Work on a copy so the caller's date columns are not overwritten.
    cases = cases.copy()
    for date_col in date_columns:
        cases[date_col] = pd.to_datetime(cases[date_col], errors="coerce")

    # Explicit copies after each filter keep the column assignments below
    # from being chained assignments on a slice.
    cases = cases.dropna(subset=date_columns).copy()

    # "+ 1" makes events on the same day count as one day rather than zero.
    cases["duration"] = (cases.date_of_decision - cases.date_of_filing).dt.days + 1
    cases["first_list_gap"] = (cases.date_first_list - cases.date_of_filing).dt.days + 1
    cases["next_list_gap"] = (cases.date_next_list - cases.date_first_list).dt.days + 1
    cases["last_list_gap"] = (cases.date_last_list - cases.date_next_list).dt.days + 1

    # duration > 0 means the decision is not earlier than the filing.
    # NOTE(review): the 2022-12-31 cutoff presumably removes impossible
    # future decision dates; confirm.
    is_plausible = (cases.duration > 0) & (cases.date_of_decision <= pd.to_datetime("2022-12-31"))
    cases = cases.loc[is_plausible].copy()

    # split date columns
    for dc in date_columns:
        cases[f"{dc}_year"] = cases[dc].dt.year
        cases[f"{dc}_month"] = cases[dc].dt.month
        cases[f"{dc}_date"] = cases[dc].dt.day

    return cases.drop(columns=date_columns)

In [5]:
# Lookup table linking each ddl_case_id to its filing and decision judge ids.
judge_case_key = pd.read_csv("csv/keys/keys/judge_case_merge_key.csv")

def merge_judge_id(cases):
    """Left-join the judge key onto ``cases`` by ``ddl_case_id``.

    Every case row is kept; cases without an entry in the key get missing
    values in the columns that come from the key.
    """
    return cases.merge(judge_case_key, how="left", on="ddl_case_id")

In [6]:
# Lookup table translating (year, disp_name) into the value stored in disp_name_s.
disp_key = pd.read_csv("csv/keys/keys/disp_name_key.csv")

def merge_disp_name(cases, year):
    """Replace ``disp_name`` with the matching ``disp_name_s`` from the key.

    The join uses the ``year`` and ``disp_name`` columns of ``cases``; the
    ``year`` argument is accepted but not used. Rows without a match in the
    key end up with a missing ``disp_name``.
    """
    lookup = disp_key[["year", "disp_name", "disp_name_s"]]
    merged = pd.merge(cases, lookup, how="left", on=["year", "disp_name"])
    merged["disp_name"] = merged["disp_name_s"]
    return merged.drop(columns="disp_name_s")

In [7]:
def add_population_data(cases):
    """Attach a ``population`` column to ``cases`` based on each row's state.

    The state key file supplies ``state_name`` per (``year``, ``state_code``).
    Each name is translated to a hard-coded population figure and the result
    is left-joined onto ``cases`` on ``year`` and ``state_code``.

    Raises:
        KeyError: if the state key contains a state name that has no
            population figure below.
    """
    state_key = pd.read_csv("csv/keys/keys/cases_state_key.csv")

    # NOTE(review): the source of these figures is not recorded here; confirm
    # and document it.
    population = {
        "Uttar Pradesh": 199812341,
        "Maharashtra": 112374333,
        "Bihar": 104099452,
        "West Bengal": 91276115,
        "Madhya Pradesh": 72626809,
        "Tamil Nadu": 72147030,
        "Rajasthan": 68548437,
        "Karnataka": 61095297,
        "Gujarat": 60439692,
        "Andhra Pradesh": 49577103,
        "Orissa": 41974219,
        "Telangana": 35003674,
        "Kerala": 33406061,
        "Jharkhand": 32988134,
        "Assam": 31205576,
        "Punjab": 27743338,
        "Chhattisgarh": 25545198,
        "Haryana": 25353081,
        "Jammu and Kashmir": 12541302,
        # NOTE(review): same figure as Jammu and Kashmir; presumably the count
        # for the undivided state was reused. Confirm.
        "Ladakh": 12541302,
        "Uttarakhand": 10086292,
        "Himachal Pradesh": 6864602,
        "Tripura": 3673917,
        "Meghalaya": 2966889,
        "Manipur": 2721756,
        "Nagaland": 1978502,
        "Goa": 1458545,
        "Arunachal Pradesh": 1383727,
        "Mizoram": 1091014,
        "Sikkim": 607688,
        "Delhi": 16753235,
        "Puducherry": 1244464,
        "Chandigarh": 1054686,
        "Andaman and Nicobar Islands": 380581,
        "DNH at Silvasa": 342853,
        "Diu and Daman": 243247,
        "Lakshadweep": 64429
    }

    # Report every unknown name at once instead of failing on the first one.
    unknown_states = [name for name in state_key.state_name.unique() if name not in population]
    if unknown_states:
        raise KeyError(f"no population figure for state(s): {unknown_states}")

    # One vectorised lookup instead of a boolean-mask assignment per state.
    # All names are known at this point, so the result has no missing values.
    state_key["population"] = state_key.state_name.map(population).astype("int64")

    return pd.merge(cases, state_key[["year", "state_code", "population"]], on=["year", "state_code"], how="left")

In [8]:
def _attach_judge_info(cases, judge_info, role):
    """Left-join gender and tenure of the ``role`` judge onto ``cases``.

    ``role`` is ``"decision"`` or ``"filing"``. The join matches
    ``ddl_<role>_judge_id`` in ``cases`` against ``ddl_judge_id`` in
    ``judge_info`` and adds ``female_<role>_judge`` and
    ``<role>_judge_duration``.
    """
    merged = pd.merge(cases, judge_info, left_on=f"ddl_{role}_judge_id", right_on="ddl_judge_id", how="left")
    merged[f"female_{role}_judge"] = merged.female_judge
    merged[f"{role}_judge_duration"] = merged.judge_duration
    # The generic columns brought in by the merge are no longer needed.
    return merged.drop(columns=["female_judge", "judge_duration", "ddl_judge_id"])


def add_judge_info_from_id(cases):
    """Add gender and tenure length of the decision and filing judges.

    Reads ``csv/judges_clean.csv``, keeps judges with a recorded gender, a
    tenure of at least one day and a start date on or before 1 January 2017,
    then left-joins their details onto ``cases`` twice: once through
    ``ddl_decision_judge_id`` and once through ``ddl_filing_judge_id``.
    Cases whose judge is unknown or filtered out get missing values in the
    four added columns.
    """
    # get judges and filter for judges with known genders
    judges = pd.read_csv("csv/judges_clean.csv")
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered slice.
    judges = judges[judges.female_judge.notna()].copy()
    judges["female_judge"] = judges.female_judge.apply(gender_to_int)
    # A stricter filter keeping only codes 0 and 1 was tried and is disabled:
    # judges = judges.loc[(judges.female_judge == 0) | (judges.female_judge == 1)]

    for tenure_col in ("start_date", "end_date"):
        judges[tenure_col] = pd.to_datetime(judges[tenure_col], errors="coerce")

    # Tenure in days, counting both the first and the last day.
    judges["judge_duration"] = (judges.end_date - judges.start_date).dt.days + 1
    # A missing start or end date gives a NaN duration, which fails ">= 1",
    # so such judges are dropped here as well.
    judges = judges[(judges.judge_duration >= 1) & (judges.start_date <= pd.to_datetime("01/01/2017"))]

    # merge info of decision and filing judges with cases
    judge_info = judges[["ddl_judge_id", "female_judge", "judge_duration"]]
    for role in ("decision", "filing"):
        cases = _attach_judge_info(cases, judge_info, role)

    return cases

In [9]:
year = 2015

In [10]:
# cases = get_year_cases(year)
# print("Fetched cases")
# # print(cases.columns)

# cases = process_dates_and_filter(cases)
# print("Processed dates")
# # print(cases.columns)

# cases = merge_disp_name(cases, year)
# print("Merged disposition names")
# # print(cases.columns)

# cases = get_good_disp(cases)
# print("Filtered for relevant dispositions")
# # print(cases.columns)

# cases = add_population_data(cases)
# print("Added population data of states")
# # print(cases.columns)

# cases = merge_judge_id(cases)
# print("Merged judge data")
# # print(cases.columns)

# # cases = cases.dropna(subset=["ddl_filing_judge_id", "ddl_decision_judge_id"])
# # print("Dropped cases with missing judge info")
# # print(cases.columns)

# cases = add_judge_info_from_id(cases)
# print("Added judge information")
# # print(cases.columns)

# cases.ddl_decision_judge_id = cases.ddl_decision_judge_id.fillna(-1)
# cases.ddl_filing_judge_id = cases.ddl_filing_judge_id.fillna(-1)

# print("Left with", cases.shape[0], "cases")

# cases.to_csv("processed_defendant_win_classification_cases.csv", index=False)
# print("Saved to CSV")

In [11]:
# Load the preprocessed cases from the CSV that the commented-out pipeline
# cell above writes; only the first 800000 rows are read.
cases = pd.read_csv("processed_defendant_win_classification_cases.csv", nrows=800000) # nrows=1000000

print("Read CSV")

Read CSV


In [12]:
cases.head()

Unnamed: 0,ddl_case_id,year,state_code,dist_code,court_no,female_defendant,female_petitioner,female_adv_def,female_adv_pet,type_name,...,date_last_list_month,date_last_list_date,defendant_won,population,ddl_filing_judge_id,ddl_decision_judge_id,female_decision_judge,decision_judge_duration,female_filing_judge,filing_judge_duration
0,01-01-01-201908000042015,2015,1,1,1,0,0,-9999,0,1978.0,...,3,21,1,112374333,-1.0,-1.0,,,,
1,01-01-01-201908000062015,2015,1,1,1,-9998,0,-9999,0,1978.0,...,5,28,1,112374333,-1.0,-1.0,,,,
2,01-01-01-201908000132015,2015,1,1,1,0,1,-9999,-9998,1978.0,...,8,16,1,112374333,-1.0,-1.0,,,,
3,01-01-01-201908000222015,2015,1,1,1,-9998,0,-9999,0,1978.0,...,3,26,1,112374333,3.0,3.0,0.0,831.0,0.0,831.0
4,01-01-01-201908000372015,2015,1,1,1,0,1,-9999,0,1978.0,...,3,19,1,112374333,3.0,3.0,0.0,831.0,0.0,831.0


In [13]:
cases.defendant_won.value_counts()

1    404626
0    395374
Name: defendant_won, dtype: int64

In [14]:
# cases[["date_first_list", "date_first_list_year", "date_first_list_month", "date_first_list_date"]].head()

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Identifier-like columns are not used as features.
cases = cases.drop(columns=["ddl_case_id", "year"])

# Every missing value, in every column, becomes the sentinel -1.
cases = cases.fillna(-1)

# One-hot encode the state code and the party/advocate gender codes, dropping
# the first level of each. type_name is left as a plain column (encoding it
# was tried and disabled).
for encoded_group in (["state_code"], gender_columns):
    cases = pd.get_dummies(cases, columns=encoded_group, drop_first=True)

# Month/day parts, year parts, the state population and the raw judge ids are
# all excluded from the feature set.
month_and_day_parts = [
    'date_of_filing_month', 'date_of_filing_date',
    'date_of_decision_month', 'date_of_decision_date',
    'date_first_list_month', 'date_first_list_date',
    'date_next_list_month', 'date_next_list_date',
    'date_last_list_month', 'date_last_list_date',
]
other_excluded = [
    'date_of_filing_year', 'date_of_decision_year', 'date_first_list_year',
    'date_next_list_year', 'date_last_list_year', 'population',
    'ddl_filing_judge_id', 'ddl_decision_judge_id',
]
cases = cases.drop(columns=month_and_day_parts + other_excluded)
# 'decision_judge_duration' and 'filing_judge_duration' are kept as features.

# Judge gender is encoded after the fill above, so -1 (missing) is one of its levels.
oneHot = ['female_decision_judge', 'female_filing_judge']
cases = pd.get_dummies(cases, columns=oneHot, drop_first=True)

cases = shuffle(cases, random_state=42)

In [16]:
# Hold out 20% of the rows as the test set (fixed seed for repeatability).
train_data, test_data = train_test_split(cases, test_size=0.2, random_state=42)
# Rebind `cases` to a zero-row slice (columns kept), presumably to release
# memory. Re-running earlier cells is required to get the data back.
cases = cases.iloc[0:0]

In [17]:
def split_X_Y(data):
    """Separate the feature columns from the ``defendant_won`` target.

    ``disp_name`` is removed from the features together with the target,
    since the target is derived directly from it.

    Returns:
        ``(X, Y)``: a new feature DataFrame and the ``defendant_won`` Series
        of ``data``. ``data`` itself is not modified.
    """
    features = data.drop(columns=["defendant_won", "disp_name"])
    target = data["defendant_won"]
    return features, target

In [18]:
# Split each partition into features and target, then rebind the partition to
# a zero-row slice (presumably to release memory).
X_train, Y_train = split_X_Y(train_data)
train_data = train_data.iloc[0:0]
X_test, Y_test = split_X_Y(test_data)
test_data = test_data.iloc[0:0]

In [19]:
# for col in X_train.columns:
#     print(col, "------------------------")
#     print(X_train[col].isna().sum())

## Trying simple baseline classifiers (logistic regression, decision tree, random forest, voting ensemble)

In [20]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same scaling
# to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier


# Baseline classifiers, all seeded with 42.
log_clf = LogisticRegression(random_state=42, max_iter=1200000)
# Forest of shallow trees (depth capped at 4).
rnd_clf = RandomForestClassifier(max_depth=4, random_state=42)
# NOTE(review): svm_clf is created but is neither part of the voting ensemble
# below nor fitted in the evaluation loop of this notebook.
svm_clf = SVC(random_state=42)
# Single decision tree with no depth limit set.
dt_clf = DecisionTreeClassifier(random_state=42)

# Majority-vote ensemble over the logistic regression, the forest and the tree.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('dt', dt_clf)], 
    voting='hard')

In [34]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Fit each classifier on the training set and report train/test accuracy plus
# the test confusion matrix.
for clf in (voting_clf, log_clf, rnd_clf, dt_clf):
    # ct = int(X_train.shape[0] * 0.01)
    print("-------------")
    print(clf.__class__.__name__)
    clf.fit(X_train, Y_train)
    # Predict on both splits so train and test accuracy can be compared.
    Y_train_pred, Y_test_pred = clf.predict(X_train), clf.predict(X_test)
    print("Train Accuracy:", accuracy_score(Y_train, Y_train_pred))
    print("Test Accuracy:", accuracy_score(Y_test, Y_test_pred))
    print("Test confusion matrix:\n", confusion_matrix(Y_test, Y_test_pred))
    

-------------
VotingClassifier
Train Accuracy: 0.8650734375
Test Accuracy: 0.84421875
Test confusion matrix:
 [[62479 16761]
 [ 8164 72596]]
-------------
LogisticRegression
Train Accuracy: 0.784628125
Test Accuracy: 0.78459375
Test confusion matrix:
 [[61944 17296]
 [17169 63591]]
-------------
RandomForestClassifier
Train Accuracy: 0.8153734375
Test Accuracy: 0.8155125
Test confusion matrix:
 [[57195 22045]
 [ 7473 73287]]
-------------
DecisionTreeClassifier
Train Accuracy: 0.995490625
Test Accuracy: 0.8814125
Test confusion matrix:
 [[70208  9032]
 [ 9942 70818]]


In [35]:
# X_train.columns

In [36]:
judge_case_key.head()

Unnamed: 0,ddl_case_id,ddl_filing_judge_id,ddl_decision_judge_id
0,01-01-01-201900000022018,5.0,5.0
1,01-01-01-201900000032017,5.0,5.0
2,01-01-01-201900000032018,94.0,94.0
3,01-01-01-201900000042016,3.0,5.0
4,01-01-01-201900000042018,156.0,156.0


In [37]:
import tensorflow as tf
from tensorflow import keras

# model = keras.models.Sequential()
# # model.add(keras.layers.Dense(10000, activation="relu", input_shape=(X_train.shape[1],))) # , kernel_regularizer=keras.regularizers.l2(0.1)
# # model.add(keras.layers.Dense(10000, activation="relu", input_shape=(X_train.shape[1],)))
# model.add(keras.layers.Dense(10000, activation="relu", input_shape=(X_train.shape[1],)))
# model.add(keras.layers.Dense(10000, activation="relu")
# # model.add(keras.layers.Dense(10, activation="relu"))
# # model.add(keras.layers.Dense(20, activation="relu", input_shape=(X_train.shape[1],))) # , kernel_regularizer=keras.regularizers.l2(0.1)

# # for i in range(30):
#     # model.add(keras.layers.Dense(20, activation="relu"))

# # model.add(keras.layers.Dropout(rate=0.2))
# # model.add(keras.layers.Dense(20, activation="relu"))
# # model.add(keras.layers.Dropout(rate=0.2))
# # model.add(keras.layers.Dense(20, activation="relu"))
# # model.add(keras.layers.Dropout(rate=0.2))
# # model.add(keras.layers.Dense(20, activation="relu"))
# # model.add(keras.layers.Dropout(rate=0.2))
# # model.add(keras.layers.Dense(1000, activation="relu"))
# # model.add(keras.layers.Dense(000, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)))
# # model.add(keras.layers.Dense(200, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)))
# # model.add(keras.layers.Dense(200, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)))
# # model.add(keras.layers.Dropout(rate=0.2))
# model.add(keras.layers.Dense(1, activation="sigmoid"))

# model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [38]:
# history = model.fit(X_train, np.array(Y_train), epochs=5)

In [39]:
# model.evaluate(X_test, np.array(Y_test))

In [40]:
# print(cases.purpose_name.max())
# print(cases.type_name.max())

In [41]:
# model2 = keras.models.Sequential()
# # model.add(keras.layers.Dense(10000, activation="relu", input_shape=(X_train.shape[1],))) # , kernel_regularizer=keras.regularizers.l2(0.1)
# # model.add(keras.layers.Dense(10000, activation="relu", input_shape=(X_train.shape[1],)))
# model2.add(keras.layers.Dense(100, activation="relu", input_shape=(X_train.shape[1],)))
# for i in range(8):
#     model2.add(keras.layers.Dense(20, activation="relu"))
# model2.add(keras.layers.Dense(1, activation="sigmoid"))

# model2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# history2 = model2.fit(X_train, np.array(Y_train), epochs=5)

In [42]:
# model2.evaluate(X_test, np.array(Y_test))

In [43]:
# model2.fit(X_train, np.array(Y_train), epochs=5)

In [52]:
# Fully connected binary classifier: three hidden layers of 1000 ReLU units
# (the first one declares the input width, two more are added in the loop)
# followed by a single sigmoid output unit.
model3 = keras.models.Sequential()
model3.add(keras.layers.Dense(1000, activation="relu", input_shape=(X_train.shape[1],)))
for i in range(2):
    model3.add(keras.layers.Dense(1000, activation="relu"))
model3.add(keras.layers.Dense(1, activation="sigmoid"))

# Binary cross-entropy loss with the Adam optimizer; accuracy is the reported metric.
model3.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# First fit: 5 epochs. Later cells call fit() again on this same model object.
# NOTE(review): no TensorFlow random seed is set in the visible cells, so
# results may differ between runs; confirm.
history3 = model3.fit(X_train, np.array(Y_train), epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [53]:
# history3 = model3.fit(X_train, np.array(Y_train), epochs=5)

In [54]:
model3.evaluate(X_test, np.array(Y_test))



[0.283814936876297, 0.8828374743461609]

In [55]:
history3 = model3.fit(X_train, np.array(Y_train), epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [56]:
model3.evaluate(X_test, np.array(Y_test))



[0.26997125148773193, 0.8906999826431274]

In [58]:
history3 = model3.fit(X_train, np.array(Y_train), epochs=40)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [74]:
model3.evaluate(X_test, np.array(Y_test))



[0.3147026300430298, 0.886343777179718]

In [65]:
from sklearn.metrics import f1_score
pred = model3.predict(X_test)
print(pred)

[[2.2241357e-01]
 [2.0969109e-01]
 [9.8998767e-01]
 ...
 [7.0563681e-02]
 [1.0911323e-06]
 [4.1314462e-07]]


In [73]:
# Turn the sigmoid outputs into hard 0/1 labels in one vectorised step.
# The assignment is in place, so `pred` keeps its shape and dtype exactly as
# the former element-by-element loop left it.
pred[:] = pred >= 0.5

# scikit-learn metrics take (y_true, y_pred) in that order; ravel() flattens
# the single-column prediction array to the 1-D shape of Y_test.
print(f1_score(Y_test, pred.ravel()))
print(accuracy_score(Y_test, pred.ravel()))

0.8875170873822439
0.88634375
