# Member 3 – Statement Comparison & Inconsistency Detection

This notebook implements the **Analytical Intelligence Layer** of the Witness Preparation Framework.

The purpose of this module is to:
- Compare witness statements across stages and witnesses
- Identify potential inconsistencies using similarity and rule-based logic
- Highlight cross-examination risks and preparedness indicators

⚠️ This analysis is **assistive and heuristic**, not predictive or judgmental.


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
data_path = "/content/witness_qna_Dataset - Witness_QA.csv"
df = pd.read_csv(data_path)


In [3]:
df.head()


Unnamed: 0,case_id,witness_id,stage,question,answer,risk_label
0,C001,W01,examination,Where were you at the time of the incident?,I was present near the place of occurrence.,low
1,C001,W02,examination,At what time did the incident occur?,The incident occurred at about 7:30 PM.,low
2,C001,W03,examination,Did you see the accused at the spot?,"Yes, I saw the accused near the victim.",low
3,C001,W04,cross_examination,Was it dark at that time?,"Yes, it was getting dark.",low
4,C001,W05,cross_examination,Can you identify the accused with certainty?,"I believe it was him, but I cannot be complete...",high


In [4]:
df.columns


Index(['case_id', 'witness_id', 'stage', 'question', 'answer', 'risk_label'], dtype='object')

In [5]:
len(df)


543

In [6]:
df["statement_text"] = df["question"].astype(str) + " " + df["answer"].astype(str)


In [7]:
df["statement_text"].iloc[0]


'Where were you at the time of the incident? I was present near the place of occurrence.'

In [8]:
df["stage"].unique()


array(['examination', 'cross_examination'], dtype=object)

In [9]:
df["stage"].value_counts()


Unnamed: 0_level_0,count
stage,Unnamed: 1_level_1
examination,441
cross_examination,102


In [10]:
df["witness_id"].value_counts().head()


Unnamed: 0_level_0,count
witness_id,Unnamed: 1_level_1
W01,67
W03,60
W02,56
W06,52
W07,42


In [11]:
df[df["witness_id"] == "W01"][["stage", "statement_text"]].head()


Unnamed: 0,stage,statement_text
0,examination,Where were you at the time of the incident? I ...
10,examination,At what time did the incident occur? The incid...
20,examination,"Did you see the accused at the spot? Yes, I sa..."
30,cross_examination,"Was it dark at that time? Yes, it was getting ..."
40,cross_examination,Can you identify the accused with certainty? I...


In [12]:
# pick one witness
witness_df = df[df["witness_id"] == "W01"]

# separate by stage
exam_df = witness_df[witness_df["stage"] == "examination"]
cross_df = witness_df[witness_df["stage"] == "cross_examination"]

len(exam_df), len(cross_df)


(53, 14)

In [13]:
exam_stmt = exam_df.iloc[0]["statement_text"]
cross_stmt = cross_df.iloc[0]["statement_text"]

exam_stmt, cross_stmt


('Where were you at the time of the incident? I was present near the place of occurrence.',
 'Was it dark at that time? Yes, it was getting dark.')

In [14]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([exam_stmt, cross_stmt])

similarity_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
similarity_score


np.float64(0.11389471295191682)

In [15]:
df["answer_text"] = df["answer"].astype(str)


In [18]:
exam_ans = exam_df.iloc[0]["answer"]
cross_ans = cross_df.iloc[0]["answer"]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([exam_ans, cross_ans])

answer_similarity = cosine_similarity(
    tfidf_matrix[0:1], tfidf_matrix[1:2]
)[0][0]

answer_similarity

np.float64(0.09349477497536716)

In [25]:
def assign_topic(text):
    text = text.lower()

    if any(word in text for word in ["time", "dark", "night", "morning", "evening", "pm", "am", ":"]):
        return "time"
    if any(word in text for word in ["place", "near", "location", "spot", "area"]):
        return "location"
    if any(word in text for word in ["see", "saw", "identify", "recognize"]):
        return "identification"

    return "other"


In [26]:
assign_topic(exam_ans)


'location'

In [27]:
assign_topic(cross_ans)


'time'

In [28]:
exam_topic = assign_topic(exam_ans)
cross_topic = assign_topic(cross_ans)

exam_topic == cross_topic


False

In [30]:
df["answer_topic"] = df["answer"].astype(str).apply(assign_topic)


In [31]:
df[["answer", "answer_topic"]].head()


Unnamed: 0,answer,answer_topic
0,I was present near the place of occurrence.,location
1,The incident occurred at about 7:30 PM.,time
2,"Yes, I saw the accused near the victim.",location
3,"Yes, it was getting dark.",time
4,"I believe it was him, but I cannot be complete...",other


In [32]:
# topics of our sample answers
exam_topic = assign_topic(exam_ans)
cross_topic = assign_topic(cross_ans)

# eligibility check
eligible_for_comparison = (exam_topic == cross_topic)
eligible_for_comparison


False

In [33]:
# filter only witness W01
w1 = df[df["witness_id"] == "W01"]

# keep only rows where topic is 'time'
time_rows = w1[w1["answer_topic"] == "time"]

# see a few rows
time_rows[["stage", "answer"]].head()


Unnamed: 0,stage,answer
10,examination,The incident occurred at about 7:30 PM.
30,cross_examination,"Yes, it was getting dark."
60,cross_examination,"No, I informed the police the next morning."
100,examination,The incident occurred at about 7:30 PM.
120,cross_examination,"Yes, it was getting dark."


In [34]:
# pick one time-related exam and cross answer
time_exam = time_rows[time_rows["stage"] == "examination"].iloc[0]["answer"]
time_cross = time_rows[time_rows["stage"] == "cross_examination"].iloc[0]["answer"]

time_exam, time_cross


('The incident occurred at about 7:30 PM.', 'Yes, it was getting dark.')

In [35]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([time_exam, time_cross])

valid_similarity = cosine_similarity(
    tfidf_matrix[0:1], tfidf_matrix[1:2]
)[0][0]

valid_similarity


np.float64(0.0)

In [36]:
def has_explicit_time(text):
    text = text.lower()
    return any(word in text for word in ["am", "pm", ":", "morning", "evening", "night"])


In [37]:
has_explicit_time(time_exam)


True

In [38]:
has_explicit_time(time_cross)


False

In [40]:
if has_explicit_time(time_exam) and has_explicit_time(time_cross):
    print("CHECK TIMES")
else:
    print("NO TIME INCONSISTENCY")



NO TIME INCONSISTENCY


In [41]:
def detect_time_inconsistency(ans1, ans2):
    if has_explicit_time(ans1) and has_explicit_time(ans2):
        return "CHECK TIMES"
    else:
        return "NO TIME INCONSISTENCY"


In [42]:
detect_time_inconsistency(time_exam, time_cross)


'NO TIME INCONSISTENCY'

In [43]:
def extract_simple_time(text):
    text = text.lower()
    for token in text.split():
        if "am" in token or "pm" in token or ":" in token:
            return token
    return None


In [44]:
extract_simple_time(time_exam)


'7:30'

In [45]:
extract_simple_time(time_cross)


In [46]:
print(extract_simple_time(time_cross))


None


In [47]:
def is_time_inconsistent(ans1, ans2):
    # both must mention explicit time
    if not (has_explicit_time(ans1) and has_explicit_time(ans2)):
        return False

    t1 = extract_simple_time(ans1)
    t2 = extract_simple_time(ans2)

    # if either time could not be extracted safely
    if t1 is None or t2 is None:
        return False

    # inconsistency only if times are different
    return t1 != t2


In [48]:
is_time_inconsistent(time_exam, time_cross)


False

In [49]:
def has_location(text):
    text = text.lower()
    return any(word in text for word in [
        "place", "near", "at", "inside", "outside", "spot", "area", "location"
    ])


In [50]:
has_location(exam_ans)


True

In [51]:
has_location(cross_ans)


False

In [52]:
def extract_simple_location(text):
    text = text.lower()
    words = text.split()

    for i, word in enumerate(words):
        if word in ["place", "spot", "area", "location"]:
            # return the word and one word before it (if exists)
            return " ".join(words[max(0, i-1):i+1])
        if word in ["inside", "outside", "near", "at"]:
            return word
    return None


In [53]:
extract_simple_location(exam_ans)


'near'

In [54]:
extract_simple_location(cross_ans)


In [55]:
print(extract_simple_location(cross_ans))


None


In [56]:
def is_location_inconsistent(ans1, ans2):
    # both must mention location
    if not (has_location(ans1) and has_location(ans2)):
        return False

    loc1 = extract_simple_location(ans1)
    loc2 = extract_simple_location(ans2)

    # if either location could not be extracted safely
    if loc1 is None or loc2 is None:
        return False

    # inconsistency only if locations are different
    return loc1 != loc2


In [57]:
is_location_inconsistent(exam_ans, cross_ans)


False

In [58]:
def has_identification(text):
    text = text.lower()
    return any(word in text for word in [
        "see", "saw", "seen", "identify", "identified",
        "recognize", "recognised", "accused", "person", "him", "her"
    ])


In [59]:
has_identification(df.iloc[2]["answer"])


True

In [60]:
has_identification(time_exam)


False

In [61]:
def identification_type(text):
    text = text.lower()

    # negative or uncertain identification
    if any(word in text for word in [
        "could not identify", "cannot identify", "not sure",
        "not certain", "couldn't recognize", "cannot recognize"
    ]):
        return "negative"

    # positive identification
    if any(word in text for word in [
        "saw", "identified", "recognized", "clearly saw", "sure it was"
    ]):
        return "positive"

    return "unknown"


In [62]:
identification_type(df.iloc[2]["answer"])


'positive'

In [63]:
identification_type(df.iloc[4]["answer"])


'unknown'

In [64]:
def is_identification_inconsistent(ans1, ans2):
    if not (has_identification(ans1) and has_identification(ans2)):
        return False

    id1 = identification_type(ans1)
    id2 = identification_type(ans2)

    # flag only clear contradiction
    if id1 == "positive" and id2 == "negative":
        return True
    if id1 == "negative" and id2 == "positive":
        return True

    return False


In [65]:
is_identification_inconsistent(
    df.iloc[2]["answer"],   # positive identification
    df.iloc[4]["answer"]    # unknown / uncertain
)


False

In [66]:
def assign_risk_level(ans1, ans2):
    # highest priority: identification inconsistency
    if is_identification_inconsistent(ans1, ans2):
        return "HIGH"

    # medium risk: time or location inconsistency
    if is_time_inconsistent(ans1, ans2) or is_location_inconsistent(ans1, ans2):
        return "MEDIUM"

    # no inconsistency detected
    return "LOW"


In [67]:
assign_risk_level(time_exam, time_cross)


'LOW'

In [71]:
def assign_risk_level(ans1, ans2):
    # highest priority: identification inconsistency
    if is_identification_inconsistent(ans1, ans2):
        return "HIGH"

    # medium risk: time or location inconsistency
    if is_time_inconsistent(ans1, ans2) or is_location_inconsistent(ans1, ans2):
        return "MEDIUM"

    # clarification needed: vague or uncertain answers
    if is_vague(ans1) or is_vague(ans2):
        return "CLARIFICATION_NEEDED"

    # no issues detected
    return "LOW"



In [72]:
assign_risk_level(
    "I do not remember the details clearly.",
    "I am not sure about the incident."
)


'CLARIFICATION_NEEDED'

In [73]:
def is_vague(text):
    text = text.lower()
    return any(phrase in text for phrase in [
        "not sure", "do not remember", "don't remember",
        "cannot recall", "can't recall", "no idea",
        "uncertain", "not clear"
    ])


In [74]:
def assign_preparedness_level(risk_level):
    if risk_level == "LOW":
        return "HIGH_PREPAREDNESS"

    if risk_level == "CLARIFICATION_NEEDED":
        return "MEDIUM_PREPAREDNESS"

    # MEDIUM or HIGH risk
    return "LOW_PREPAREDNESS"


In [75]:
assign_preparedness_level("CLARIFICATION_NEEDED")


'MEDIUM_PREPAREDNESS'

In [76]:
QUESTION_BANK = {
    "time": [
        "Can you clarify the exact time of the incident?",
        "Earlier you mentioned a different time. Can you explain the discrepancy?",
        "At what precise time did the incident occur?",
        "Are you certain about the time you just mentioned?",
        "Did you mention the same time in your earlier statement?",
        "What were you doing immediately before the incident occurred?"
    ],

    "location": [
        "Can you specify your exact location at the time of the incident?",
        "Earlier you mentioned a different place. Can you clarify this?",
        "Where exactly were you positioned during the incident?",
        "How far were you from the place of occurrence?",
        "Did you mention the same location in your earlier statement?",
        "Were there any landmarks near your location?"
    ],

    "identification": [
        "How certain are you about identifying the accused?",
        "Earlier you gave a different account regarding identification. Can you explain?",
        "Did you clearly see the person you are referring to?",
        "Were there any conditions affecting your ability to identify the accused?",
        "Have you consistently identified the same person throughout?",
        "Can you explain why your identification differs across statements?"
    ],

    "clarification": [
        "Could you please provide more details regarding this point?",
        "Can you clarify what you meant by that statement?",
        "Are you certain about the information you just provided?",
        "Is there anything you would like to add to make this clearer?",
        "Do you recall any additional details now?",
        "Would you like to clarify your earlier response?"
    ]
}


In [77]:
def suggest_questions(ans1, ans2):
    # identification inconsistency → highest priority
    if is_identification_inconsistent(ans1, ans2):
        return QUESTION_BANK["identification"]

    # time inconsistency
    if is_time_inconsistent(ans1, ans2):
        return QUESTION_BANK["time"]

    # location inconsistency
    if is_location_inconsistent(ans1, ans2):
        return QUESTION_BANK["location"]

    # vague answers → clarification
    if is_vague(ans1) or is_vague(ans2):
        return QUESTION_BANK["clarification"]

    # no issues → no suggested questions
    return []


In [78]:
suggest_questions(
    "The incident occurred at about 7:30 PM.",
    "The incident occurred around 9:00 PM."
)


['Can you clarify the exact time of the incident?',
 'Earlier you mentioned a different time. Can you explain the discrepancy?',
 'At what precise time did the incident occur?',
 'Are you certain about the time you just mentioned?',
 'Did you mention the same time in your earlier statement?',
 'What were you doing immediately before the incident occurred?']

In [79]:
def suggest_questions(ans1, ans2, top_n=5):
    if is_identification_inconsistent(ans1, ans2):
        return QUESTION_BANK["identification"][:top_n]

    if is_time_inconsistent(ans1, ans2):
        return QUESTION_BANK["time"][:top_n]

    if is_location_inconsistent(ans1, ans2):
        return QUESTION_BANK["location"][:top_n]

    if is_vague(ans1) or is_vague(ans2):
        return QUESTION_BANK["clarification"][:top_n]

    return []


In [80]:
suggest_questions(
    "The incident occurred at about 7:30 PM.",
    "The incident occurred around 9:00 PM.",
    top_n=3
)


['Can you clarify the exact time of the incident?',
 'Earlier you mentioned a different time. Can you explain the discrepancy?',
 'At what precise time did the incident occur?']

In [81]:
# example answers
ans1 = "The incident occurred at about 7:30 PM."
ans2 = "The incident occurred around 9:00 PM."

risk = assign_risk_level(ans1, ans2)
preparedness = assign_preparedness_level(risk)
questions = suggest_questions(ans1, ans2, top_n=5)

final_output = {
    "risk_level": risk,
    "preparedness_level": preparedness,
    "suggested_questions": questions
}

final_output


{'risk_level': 'MEDIUM',
 'preparedness_level': 'LOW_PREPAREDNESS',
 'suggested_questions': ['Can you clarify the exact time of the incident?',
  'Earlier you mentioned a different time. Can you explain the discrepancy?',
  'At what precise time did the incident occur?',
  'Are you certain about the time you just mentioned?',
  'Did you mention the same time in your earlier statement?']}