In [1]:
import pandas as pd
# Read golden_sample.csv into `df`; the bare name on the last line renders the frame.
df = pd.read_csv(filepath_or_buffer="golden_sample.csv")
df


Unnamed: 0,statement_id,witness_id,stage,raw_text,cleaned_text,entity_person,entity_time,entity_location,entity_event,text_similarity_score,time_inconsistency_flag,location_inconsistency_flag,entity_contradiction_flag,risk_level,preparedness_level
0,S44,W04,examination,How are you related to the victim? The victim ...,how are you related to the victim? the victim ...,,,,,,,,,,
1,S34,W04,cross_examination,"Did you inform the police immediately? No, I i...","did you inform the police immediately? no, i i...",,,,,,,,,,
2,S68,W08,cross_examination,Can you identify the accused with certainty? I...,can you identify the accused with certainty? i...,,,,,,,,,,
3,S98,W08,examination,How are you related to the victim? The victim ...,how are you related to the victim? the victim ...,,,,,,,,,,
4,S110,W10,examination,At what time did the incident occur? The incid...,at what time did the incident occur? the incid...,,,,,,,,,,
5,S276,W18,examination,What articles were seized from the accused? A ...,what articles were seized from the accused? a ...,,,,,,,,,,


In [2]:
import re

def extract_time(text):
    """Return every clock-time mention (e.g. "7:30", "7:30pm", "7:30 pm") found in `text`.

    Parameters
    ----------
    text : str or missing value
        Statement text to scan. Anything `pd.isna` treats as missing
        (None, NaN, ...) yields an empty list.

    Returns
    -------
    list of str
        The matched substrings, in order of appearance. Empty when nothing matches.
    """
    if pd.isna(text):
        return []
    # The whitespace before the meridiem sits INSIDE the optional group, so it is
    # consumed only when an AM/PM marker really follows. With `\s*` outside the
    # group, "7:30 in the evening" matched as "7:30 " (trailing space), which made
    # the same clock time look like two different values when compared later.
    return re.findall(r'\b\d{1,2}:\d{2}(?:\s*(?:AM|PM|am|pm))?\b', text)

# Run the time extractor over each cleaned statement and store the resulting lists.
df["entity_time"] = df["cleaned_text"].map(extract_time)

# Show the extracted times next to the text they came from.
df.loc[:, ["statement_id", "cleaned_text", "entity_time"]]


Unnamed: 0,statement_id,cleaned_text,entity_time
0,S44,how are you related to the victim? the victim ...,[]
1,S34,"did you inform the police immediately? no, i i...",[]
2,S68,can you identify the accused with certainty? i...,[]
3,S98,how are you related to the victim? the victim ...,[]
4,S110,at what time did the incident occur? the incid...,[7:30 pm]
5,S276,what articles were seized from the accused? a ...,[]


In [3]:
# Start from an all-False column so the cell gives the same result when re-run.
df["time_inconsistency_flag"] = False

for wid in df["witness_id"].unique():
    # Build the row mask once; it is used for both reading and writing.
    is_witness = df["witness_id"] == wid

    # Keep only the statements in which this witness actually mentions a time.
    time_lists = [tlist for tlist in df.loc[is_witness, "entity_time"] if len(tlist) > 0]

    # Compare individual time values rather than whole per-statement tuples:
    # with tuples, ("7:30 pm", "7:30 pm") and ("7:30 pm",) counted as two
    # different answers even though the witness gave the same time.
    distinct_times = {t for tlist in time_lists for t in tlist}

    # A contradiction needs at least two time-bearing statements that do not
    # agree on a single time; the flag is applied to every row of that witness.
    if len(time_lists) >= 2 and len(distinct_times) > 1:
        df.loc[is_witness, "time_inconsistency_flag"] = True

df[["statement_id", "witness_id", "entity_time", "time_inconsistency_flag"]]


Unnamed: 0,statement_id,witness_id,entity_time,time_inconsistency_flag
0,S44,W04,[],False
1,S34,W04,[],False
2,S68,W08,[],False
3,S98,W08,[],False
4,S110,W10,[7:30 pm],False
5,S276,W18,[],False


In [4]:
# Translate the boolean flag into a label: flagged rows are MEDIUM, the rest LOW.
df["risk_level"] = [
    "MEDIUM" if flagged else "LOW"
    for flagged in df["time_inconsistency_flag"]
]

df.loc[:, ["statement_id", "time_inconsistency_flag", "risk_level"]]


Unnamed: 0,statement_id,time_inconsistency_flag,risk_level
0,S44,False,LOW
1,S34,False,LOW
2,S68,False,LOW
3,S98,False,LOW
4,S110,False,LOW
5,S276,False,LOW


In [5]:
def map_preparedness(risk):
    """Convert a risk label into a preparedness label.

    "LOW" risk gives "HIGH_PREPAREDNESS", "MEDIUM" risk gives
    "MEDIUM_PREPAREDNESS", and any other value gives "LOW_PREPAREDNESS".
    """
    if risk == "LOW":
        return "HIGH_PREPAREDNESS"
    if risk == "MEDIUM":
        return "MEDIUM_PREPAREDNESS"
    # Everything that is neither LOW nor MEDIUM lands here.
    return "LOW_PREPAREDNESS"

# Derive the preparedness label for every row from its risk label.
df["preparedness_level"] = df["risk_level"].map(map_preparedness)

df.loc[:, ["statement_id", "risk_level", "preparedness_level"]]


Unnamed: 0,statement_id,risk_level,preparedness_level
0,S44,LOW,HIGH_PREPAREDNESS
1,S34,LOW,HIGH_PREPAREDNESS
2,S68,LOW,HIGH_PREPAREDNESS
3,S98,LOW,HIGH_PREPAREDNESS
4,S110,LOW,HIGH_PREPAREDNESS
5,S276,LOW,HIGH_PREPAREDNESS


STRUCTURED WITNESS STATEMENTS DATASET

In [6]:
# Read the full statements file into `df_full` and preview the first rows.
# NOTE(review): list-like columns (e.g. entity_time shown as "[]") presumably arrive
# from CSV as plain strings, not Python lists — confirm before treating them as lists.
df_full = pd.read_csv(filepath_or_buffer="structured_witness_statements.csv")
df_full.head()


Unnamed: 0,statement_id,witness_id,stage,raw_text,cleaned_text,entity_person,entity_time,entity_location,entity_event,text_similarity_score,time_inconsistency_flag,location_inconsistency_flag,entity_contradiction_flag,risk_level,preparedness_level
0,S1,W01,examination,Where were you at the time of the incident? I ...,where were you at the time of the incident? i ...,[],[],[],[],,,,,,
1,S2,W02,examination,At what time did the incident occur? The incid...,at what time did the incident occur? the incid...,[],[],[],[],,,,,,
2,S3,W03,examination,"Did you see the accused at the spot? Yes, I sa...","did you see the accused at the spot? yes, i sa...",[],[],[],[],,,,,,
3,S4,W04,cross_examination,"Was it dark at that time? Yes, it was getting ...","was it dark at that time? yes, it was getting ...",[],[],[],[],,,,,,
4,S5,W05,cross_examination,Can you identify the accused with certainty? I...,can you identify the accused with certainty? i...,[],[],[],[],,,,,,


In [7]:
# --- Entity Time Extraction ---
# Overwrites whatever entity_time held after loading with freshly extracted lists.
df_full["entity_time"] = df_full["cleaned_text"].apply(extract_time)

# --- Time Inconsistency Flag ---
# Start from an all-False column so the cell gives the same result when re-run.
df_full["time_inconsistency_flag"] = False

for wid in df_full["witness_id"].unique():
    # Build the row mask once; it is used for both reading and writing.
    is_witness = df_full["witness_id"] == wid

    # Keep only the statements in which this witness actually mentions a time.
    time_lists = [tlist for tlist in df_full.loc[is_witness, "entity_time"] if len(tlist) > 0]

    # Compare individual time values rather than whole per-statement tuples:
    # with tuples, ("7:30 pm", "7:30 pm") and ("7:30 pm",) counted as two
    # different answers even though the witness gave the same time.
    distinct_times = {t for tlist in time_lists for t in tlist}

    # A contradiction needs at least two time-bearing statements that do not
    # agree on a single time; the flag is applied to every row of that witness.
    if len(time_lists) >= 2 and len(distinct_times) > 1:
        df_full.loc[is_witness, "time_inconsistency_flag"] = True

# --- Risk Level ---
# Flagged rows are MEDIUM risk, everything else LOW.
df_full["risk_level"] = df_full["time_inconsistency_flag"].apply(
    lambda x: "MEDIUM" if x else "LOW"
)

# --- Preparedness Level ---
df_full["preparedness_level"] = df_full["risk_level"].apply(map_preparedness)

df_full[["statement_id", "witness_id", "entity_time", "time_inconsistency_flag", "risk_level", "preparedness_level"]].head()


Unnamed: 0,statement_id,witness_id,entity_time,time_inconsistency_flag,risk_level,preparedness_level
0,S1,W01,[],True,MEDIUM,MEDIUM_PREPAREDNESS
1,S2,W02,[7:30 pm],True,MEDIUM,MEDIUM_PREPAREDNESS
2,S3,W03,[],True,MEDIUM,MEDIUM_PREPAREDNESS
3,S4,W04,[],True,MEDIUM,MEDIUM_PREPAREDNESS
4,S5,W05,[],True,MEDIUM,MEDIUM_PREPAREDNESS


In [8]:
# Clear the column first so re-running the cell does not keep stale flags.
df_full["time_inconsistency_flag"] = False

for wid in df_full["witness_id"].unique():
    witness_mask = df_full["witness_id"] == wid

    # Pool every explicit time mention made by this witness, de-duplicated;
    # statements with an empty list contribute nothing.
    unique_times = {
        mention
        for tlist in df_full.loc[witness_mask, "entity_time"]
        for mention in tlist
    }

    # Flag all of the witness's rows once two or more different times show up.
    if len(unique_times) >= 2:
        df_full.loc[witness_mask, "time_inconsistency_flag"] = True

df_full.loc[:, ["statement_id", "witness_id", "entity_time", "time_inconsistency_flag"]].head()


Unnamed: 0,statement_id,witness_id,entity_time,time_inconsistency_flag
0,S1,W01,[],True
1,S2,W02,[7:30 pm],True
2,S3,W03,[],True
3,S4,W04,[],True
4,S5,W05,[],True


In [9]:
# Clear the column first so re-running the cell does not keep stale flags.
df_full["time_inconsistency_flag"] = False

for wid in df_full["witness_id"].unique():
    witness_mask = df_full["witness_id"] == wid

    # Time lists from the statements that explicitly mention at least one time.
    mentioned = [
        tlist
        for tlist in df_full.loc[witness_mask, "entity_time"]
        if len(tlist) > 0
    ]

    # Fewer than TWO time-bearing statements: nothing to compare, move on.
    if len(mentioned) < 2:
        continue

    # Flag every row of the witness when those statements hold more than one distinct time.
    distinct = {mention for tlist in mentioned for mention in tlist}
    if len(distinct) > 1:
        df_full.loc[witness_mask, "time_inconsistency_flag"] = True

df_full.loc[:, ["statement_id", "witness_id", "entity_time", "time_inconsistency_flag"]].head()


Unnamed: 0,statement_id,witness_id,entity_time,time_inconsistency_flag
0,S1,W01,[],True
1,S2,W02,[7:30 pm],True
2,S3,W03,[],True
3,S4,W04,[],True
4,S5,W05,[],True


In [10]:
# First ten distinct witness ids that have at least one flagged row.
df_full.loc[df_full["time_inconsistency_flag"], "witness_id"].unique()[:10]


array(['W01', 'W02', 'W03', 'W04', 'W05', 'W06', 'W07', 'W14', 'W18',
       'W17'], dtype=object)

In [11]:
# Begin with every row unflagged
df_full["time_inconsistency_flag"] = False

# Walk through the statements one witness at a time
for wid, subset in df_full.groupby("witness_id"):

    # rows of this witness whose extracted time list is non-empty
    has_time = subset["entity_time"].apply(lambda x: len(x) > 0)
    time_rows = subset[has_time]

    # with fewer than TWO such statements there is nothing to compare
    if len(time_rows) < 2:
        continue

    # every distinct time mentioned across those statements
    distinct_times = {
        mention
        for tlist in time_rows["entity_time"]
        for mention in tlist
    }

    # flag ONLY the time-bearing rows, and only when the times disagree
    if len(distinct_times) > 1:
        df_full.loc[time_rows.index, "time_inconsistency_flag"] = True

# Inspect the outcome
df_full.loc[:, ["statement_id", "witness_id", "entity_time", "time_inconsistency_flag"]].head()


Unnamed: 0,statement_id,witness_id,entity_time,time_inconsistency_flag
0,S1,W01,[],False
1,S2,W02,[7:30 pm],True
2,S3,W03,[],False
3,S4,W04,[],False
4,S5,W05,[],False


In [12]:
# Translate the boolean flag into a label: flagged rows are MEDIUM, the rest LOW.
df_full["risk_level"] = [
    "MEDIUM" if flagged else "LOW"
    for flagged in df_full["time_inconsistency_flag"]
]

df_full.loc[:, ["statement_id", "witness_id", "time_inconsistency_flag", "risk_level"]].head()


Unnamed: 0,statement_id,witness_id,time_inconsistency_flag,risk_level
0,S1,W01,False,LOW
1,S2,W02,True,MEDIUM
2,S3,W03,False,LOW
3,S4,W04,False,LOW
4,S5,W05,False,LOW


In [13]:
# NOTE(review): this repeats the map_preparedness already defined earlier in the
# notebook with the same mapping; consider keeping a single definition.
def map_preparedness(risk):
    """Map a risk label to its preparedness label.

    "LOW" -> "HIGH_PREPAREDNESS", "MEDIUM" -> "MEDIUM_PREPAREDNESS",
    anything else -> "LOW_PREPAREDNESS".
    """
    return (
        "HIGH_PREPAREDNESS" if risk == "LOW"
        else "MEDIUM_PREPAREDNESS" if risk == "MEDIUM"
        else "LOW_PREPAREDNESS"
    )

# Derive the preparedness label for every row from its risk label.
df_full["preparedness_level"] = df_full["risk_level"].map(map_preparedness)

df_full.loc[:, ["statement_id", "witness_id", "risk_level", "preparedness_level"]].head()


Unnamed: 0,statement_id,witness_id,risk_level,preparedness_level
0,S1,W01,LOW,HIGH_PREPAREDNESS
1,S2,W02,MEDIUM,MEDIUM_PREPAREDNESS
2,S3,W03,LOW,HIGH_PREPAREDNESS
3,S4,W04,LOW,HIGH_PREPAREDNESS
4,S5,W05,LOW,HIGH_PREPAREDNESS
