In [1]:
# Install dependencies with the %pip magic so the packages land in the
# environment of the running kernel (a bare `!pip` uses whichever pip is first
# on the shell PATH, which may belong to a different interpreter).
%pip install spacy pandas




In [2]:
import pandas as pd
import spacy
import re

#print("Libraries imported successfully")


Libraries imported successfully


In [1]:
# Download the small English spaCy pipeline (en_core_web_sm). The installer's
# own notice says a kernel/runtime restart may be needed before the package
# can be loaded.
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import spacy

# Load the small English pipeline once; `nlp` is the module-level pipeline
# object that the spaCy-based extract_*_entities functions call.
nlp = spacy.load("en_core_web_sm")
#print("SpaCy model loaded successfully")


SpaCy model loaded successfully


In [2]:
import pandas as pd

# Read the witness Q&A dataset into `df`.
# NOTE(review): the absolute "/content/..." path looks environment-specific
# (presumably Colab) — adjust it when running elsewhere.
df = pd.read_csv("/content/witness_qna_Dataset - Witness_QA (1).csv")

#print("Dataset loaded successfully")
# Show the first five rows as plain text to confirm the load worked.
print(df.head())


Dataset loaded successfully
  case_id witness_id              stage  \
0    C001        W01        examination   
1    C001        W02        examination   
2    C001        W03        examination   
3    C001        W04  cross_examination   
4    C001        W05  cross_examination   

                                       question  \
0   Where were you at the time of the incident?   
1          At what time did the incident occur?   
2          Did you see the accused at the spot?   
3                     Was it dark at that time?   
4  Can you identify the accused with certainty?   

                                              answer risk_label  
0        I was present near the place of occurrence.        low  
1            The incident occurred at about 7:30 PM.        low  
2            Yes, I saw the accused near the victim.        low  
3                          Yes, it was getting dark.        low  
4  I believe it was him, but I cannot be complete...       high  


In [3]:
# TEMPORARY cleaned_text for testing only
# Joins each row's question and answer with a single space; missing values are
# replaced by "" first so the concatenation never yields NaN.
df["cleaned_text"] = (
    df["question"].fillna("") + " " + df["answer"].fillna("")
)

# Quick look at the combined text alongside its identifying columns.
print(df[["witness_id", "stage", "cleaned_text"]].head())


  witness_id              stage  \
0        W01        examination   
1        W02        examination   
2        W03        examination   
3        W04  cross_examination   
4        W05  cross_examination   

                                        cleaned_text  
0  Where were you at the time of the incident? I ...  
1  At what time did the incident occur? The incid...  
2  Did you see the accused at the spot? Yes, I sa...  
3  Was it dark at that time? Yes, it was getting ...  
4  Can you identify the accused with certainty? I...  


In [4]:
# Initialize entity columns as EMPTY LISTS
# The comprehension builds a separate list object for every row, so rows do
# not share (and accidentally co-mutate) one list.
df["entity_person"] = [[] for _ in range(len(df))]
df["entity_time"] = [[] for _ in range(len(df))]
df["entity_location"] = [[] for _ in range(len(df))]
df["entity_event"] = [[] for _ in range(len(df))]

# Confirm the four placeholder columns exist and are empty.
print(df[[
    "entity_person",
    "entity_time",
    "entity_location",
    "entity_event"
]].head())


  entity_person entity_time entity_location entity_event
0            []          []              []           []
1            []          []              []           []
2            []          []              []           []
3            []          []              []           []
4            []          []              []           []


In [5]:
def extract_person_entities(text):
    """Return the unique PERSON entities that the spaCy pipeline finds in ``text``.

    Args:
        text: Text to analyse. Non-string values (for example NaN/None coming
            from a pandas column) are treated as containing no entities.

    Returns:
        list[str]: Texts of entities labelled ``PERSON``, de-duplicated and in
        order of first appearance.
    """
    # nlp() only accepts strings; a missing cell should simply yield no entities.
    if not isinstance(text, str):
        return []

    doc = nlp(text)
    persons = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # dict.fromkeys removes duplicates while keeping first-seen order.
    # list(set(...)) returned the names in a hash-dependent order that can
    # change from one kernel start to the next.
    return list(dict.fromkeys(persons))


# Run the PERSON extractor on every row's cleaned_text; each cell of
# entity_person becomes the list returned for that row.
df["entity_person"] = df["cleaned_text"].apply(extract_person_entities)

# Preview the first five rows next to their source text.
df[["cleaned_text", "entity_person"]].head()


Unnamed: 0,cleaned_text,entity_person
0,Where were you at the time of the incident? I ...,[]
1,At what time did the incident occur? The incid...,[]
2,"Did you see the accused at the spot? Yes, I sa...",[]
3,"Was it dark at that time? Yes, it was getting ...",[]
4,Can you identify the accused with certainty? I...,[]


In [7]:
import re


In [10]:
def extract_time_entities(text):
    """Return the unique time expressions found in ``text``.

    Two sources are combined: entities the spaCy pipeline labels ``TIME`` and
    clock times matched by a regular expression (e.g. ``7 PM``, ``7:30 pm``).
    The two sources can overlap, so a spaCy span such as ``about 7:30 PM`` and
    the regex hit ``7:30 PM`` may both be returned.

    Args:
        text: Text to analyse. Non-string values (for example NaN/None coming
            from a pandas column) are treated as containing no times.

    Returns:
        list[str]: De-duplicated matches in order of first appearance, spaCy
        entities first, then regex matches.
    """
    # Both nlp() and re.findall() need a string; a missing cell yields no times.
    if not isinstance(text, str):
        return []

    # 1) spaCy-based TIME entities
    times = [ent.text for ent in nlp(text).ents if ent.label_ == "TIME"]

    # 2) Regex-based clock times: 1-2 digit hour, optional ":MM", optional
    #    single whitespace, then an AM/PM marker in all-upper or all-lower case.
    time_pattern = r'\b\d{1,2}(?::\d{2})?\s?(?:AM|PM|am|pm)\b'
    times.extend(re.findall(time_pattern, text))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    # list(set(...)) produced a hash-dependent order that can change from one
    # kernel start to the next.
    return list(dict.fromkeys(times))



In [11]:
# Run the time extractor on every row's cleaned_text; each cell of entity_time
# becomes the list returned for that row.
df["entity_time"] = df["cleaned_text"].apply(extract_time_entities)
# Preview the first five rows next to their source text.
df[["cleaned_text", "entity_time"]].head()



Unnamed: 0,cleaned_text,entity_time
0,Where were you at the time of the incident? I ...,[]
1,At what time did the incident occur? The incid...,"[about 7:30 PM, 7:30 PM]"
2,"Did you see the accused at the spot? Yes, I sa...",[]
3,"Was it dark at that time? Yes, it was getting ...",[]
4,Can you identify the accused with certainty? I...,[]


In [12]:
def extract_location_entities(text):
    """Return the unique location entities that the spaCy pipeline finds in ``text``.

    An entity counts as a location when its label is ``GPE``, ``LOC`` or
    ``FAC``.

    Args:
        text: Text to analyse. Non-string values (for example NaN/None coming
            from a pandas column) are treated as containing no entities.

    Returns:
        list[str]: Matching entity texts, de-duplicated and in order of first
        appearance.
    """
    # nlp() only accepts strings; a missing cell should simply yield no entities.
    if not isinstance(text, str):
        return []

    location_labels = ("GPE", "LOC", "FAC")
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in location_labels]

    # dict.fromkeys removes duplicates while keeping first-seen order.
    # list(set(...)) produced a hash-dependent order that can change from one
    # kernel start to the next.
    return list(dict.fromkeys(locations))


# Run the location extractor on every row's cleaned_text; each cell of
# entity_location becomes the list returned for that row.
df["entity_location"] = df["cleaned_text"].apply(extract_location_entities)

# Preview the first five rows next to their source text.
df[["cleaned_text", "entity_location"]].head()


Unnamed: 0,cleaned_text,entity_location
0,Where were you at the time of the incident? I ...,[]
1,At what time did the incident occur? The incid...,[]
2,"Did you see the accused at the spot? Yes, I sa...",[]
3,"Was it dark at that time? Yes, it was getting ...",[]
4,Can you identify the accused with certainty? I...,[]


In [14]:
# Simple, explainable event keywords
EVENT_KEYWORDS = [
    "incident",
    "assault",
    "attack",
    "fight",
    "stab",
    "theft",
    "robbery",
    "murder",
    "presence"
]

def extract_event_entities(text):
    """Return the event keywords from ``EVENT_KEYWORDS`` that occur in ``text``.

    Matching is case-insensitive and anchored at the start of a word, so
    inflected forms still count ("stabbed", "attacked", "fighting") while a
    keyword buried inside an unrelated word does not ("stab" in "establish",
    "incident" in "coincidentally"). Words that merely begin with a keyword
    (e.g. "stable") still match; this is a keyword heuristic, not a parser.

    Args:
        text: Text to analyse. Non-string values (for example NaN/None coming
            from a pandas column) are treated as containing no events.

    Returns:
        list[str]: Matching keywords, de-duplicated, in the order they are
        declared in ``EVENT_KEYWORDS``.
    """
    # A missing cell has no .lower(); treat it as "no events" instead of crashing.
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    events = [
        keyword
        for keyword in EVENT_KEYWORDS
        # \b pins the keyword to a word start; re.escape keeps any future
        # keyword containing regex metacharacters literal.
        if re.search(r"\b" + re.escape(keyword), text_lower)
    ]

    # Declaration order is kept so the output is stable between runs;
    # dict.fromkeys only guards against a keyword being listed twice.
    return list(dict.fromkeys(events))


# Run the keyword-based event extractor on every row's cleaned_text; each cell
# of entity_event becomes the list returned for that row.
df["entity_event"] = df["cleaned_text"].apply(extract_event_entities)

# Preview the first five rows next to their source text.
df[["cleaned_text", "entity_event"]].head()


Unnamed: 0,cleaned_text,entity_event
0,Where were you at the time of the incident? I ...,[incident]
1,At what time did the incident occur? The incid...,[incident]
2,"Did you see the accused at the spot? Yes, I sa...",[]
3,"Was it dark at that time? Yes, it was getting ...",[]
4,Can you identify the accused with certainty? I...,[]
