In [5]:
import os

# show current folder
print("Current directory:", os.getcwd())

# list repo folders (to confirm you're in the repo environment)
print("Top-level files/folders:", os.listdir(".")[:20])


Current directory: /content
Top-level files/folders: ['.config', 'witness-preparation-framework', 'sample_data']


In [4]:
!git clone https://github.com/shri4806/witness-preparation-framework.git


Cloning into 'witness-preparation-framework'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 56 (delta 16), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (56/56), 49.83 KiB | 3.83 MiB/s, done.
Resolving deltas: 100% (16/16), done.


In [6]:
import os

os.listdir(".")


['.config', 'witness-preparation-framework', 'sample_data']

In [7]:
import os

# move into repo
os.chdir("witness-preparation-framework")

# confirm location
print("Current directory:", os.getcwd())

# list contents
print("Repo contents:", os.listdir("."))


Current directory: /content/witness-preparation-framework
Repo contents: ['README.md', 'model', 'LICENSE', 'data', '.git', '.gitignore']


In [8]:
import os

# move into your assigned folder
os.chdir("model/text_processing")

# confirm location
print("Current directory:", os.getcwd())

# list files inside your folder
print("Files here:", os.listdir("."))


Current directory: /content/witness-preparation-framework/model/text_processing
Files here: ['README.md']


In [10]:
# open one witness file and show first 20 lines
file_path = "../../data/witness_qna/" + os.listdir("../../data/witness_qna")[0]

print("Opening file:", file_path)
print("-" * 50)

with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
    for i in range(20):
        print(f.readline().strip())

Opening file: ../../data/witness_qna/README.md
--------------------------------------------------
This folder contains witness-style question and answer data.

The dataset is used to simulate witness testimonies for statement comparison, variation analysis, and inconsistency detection within the framework.



















In [11]:
import os

# go back to repo root
os.chdir("/content/witness-preparation-framework")

# list data folders
print("Data folders:", os.listdir("data"))

# list witness_qna contents
print("Witness QnA files:", os.listdir("data/witness_qna"))


Data folders: ['README.md', 'ipc_sections', 'witness_qna']
Witness QnA files: ['README.md', 'witness_qna_Dataset - Witness_QA.csv']


In [12]:
import pandas as pd
import os

csv_path = "data/witness_qna/witness_qna_Dataset - Witness_QA.csv"

df = pd.read_csv(csv_path)

print("CSV loaded ✅")
print("Shape (rows, cols):", df.shape)
print("\nColumns:\n", df.columns)
print("\nSample rows:\n")
display(df.head(5))


CSV loaded ✅
Shape (rows, cols): (543, 6)

Columns:
 Index(['case_id', 'witness_id', 'stage', 'question', 'answer', 'risk_label'], dtype='object')

Sample rows:



Unnamed: 0,case_id,witness_id,stage,question,answer,risk_label
0,C001,W01,examination,Where were you at the time of the incident?,I was present near the place of occurrence.,low
1,C001,W02,examination,At what time did the incident occur?,The incident occurred at about 7:30 PM.,low
2,C001,W03,examination,Did you see the accused at the spot?,"Yes, I saw the accused near the victim.",low
3,C001,W04,cross_examination,Was it dark at that time?,"Yes, it was getting dark.",low
4,C001,W05,cross_examination,Can you identify the accused with certainty?,"I believe it was him, but I cannot be complete...",high


In [13]:
import pandas as pd

# Load original CSV
df = pd.read_csv("data/witness_qna/witness_qna_Dataset - Witness_QA.csv")

# Create statement_id
df["statement_id"] = ["S" + str(i+1) for i in range(len(df))]

# Combine question + answer into raw_text
df["raw_text"] = df["question"].fillna("") + " " + df["answer"].fillna("")

# Create cleaned_text (simple for now)
df["cleaned_text"] = (
    df["raw_text"]
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# Create final dataframe with locked schema
final_df = pd.DataFrame({
    "statement_id": df["statement_id"],
    "witness_id": df["witness_id"],
    "stage": df["stage"],
    "raw_text": df["raw_text"],
    "cleaned_text": df["cleaned_text"],

    # Member 2 fields (empty)
    "entity_person": [[] for _ in range(len(df))],
    "entity_time": [[] for _ in range(len(df))],
    "entity_location": [[] for _ in range(len(df))],
    "entity_event": [[] for _ in range(len(df))],

    # Member 3 fields (None)
    "text_similarity_score": [None] * len(df),
    "time_inconsistency_flag": [None] * len(df),
    "location_inconsistency_flag": [None] * len(df),
    "entity_contradiction_flag": [None] * len(df),
    "risk_level": [None] * len(df),
    "preparedness_level": [None] * len(df)
})

display(final_df.head())


Unnamed: 0,statement_id,witness_id,stage,raw_text,cleaned_text,entity_person,entity_time,entity_location,entity_event,text_similarity_score,time_inconsistency_flag,location_inconsistency_flag,entity_contradiction_flag,risk_level,preparedness_level
0,S1,W01,examination,Where were you at the time of the incident? I ...,where were you at the time of the incident? i ...,[],[],[],[],,,,,,
1,S2,W02,examination,At what time did the incident occur? The incid...,at what time did the incident occur? the incid...,[],[],[],[],,,,,,
2,S3,W03,examination,"Did you see the accused at the spot? Yes, I sa...","did you see the accused at the spot? yes, i sa...",[],[],[],[],,,,,,
3,S4,W04,cross_examination,"Was it dark at that time? Yes, it was getting ...","was it dark at that time? yes, it was getting ...",[],[],[],[],,,,,,
4,S5,W05,cross_examination,Can you identify the accused with certainty? I...,can you identify the accused with certainty? i...,[],[],[],[],,,,,,


In [14]:
index=False



In [15]:
import os

# go back to your folder
os.chdir("/content/witness-preparation-framework/model/text_processing")

# create outputs folder if not exists
os.makedirs("outputs", exist_ok=True)

# save CSV
final_df.to_csv("outputs/structured_witness_statements.csv", index=False)

print("Saved ✅")
print(os.listdir("outputs"))


Saved ✅
['structured_witness_statements.csv']


In [16]:
with open("README.md", "r", encoding="utf-8") as f:
    print(f.read())


This module handles witness document processing and structuring.

Responsibilities:
- Segment witness documents into statements
- Handle Q&A-style text
- Clean and normalize text for analysis



In [17]:
import os
print(os.getcwd())


/content/witness-preparation-framework/model/text_processing


In [18]:
readme_text = """
# Witness Text Processing & Structuring (Member 1)

## Goal
Convert raw witness Q&A documents into clean, structured statement segments that are easy to analyze in later modules.

This module performs:
- Witness statement structuring (row-level segments)
- Stage tagging (examination / cross_examination / re_examination)
- Text cleaning and normalization for NLP readiness

## Input
Dataset path (read-only):
- `data/witness_qna/witness_qna_Dataset - Witness_QA.csv`

Key input fields used:
- `witness_id`
- `stage`
- `question`
- `answer`

## Output
Saved to:
- `model/text_processing/outputs/structured_witness_statements.csv`

Output follows the **COMMON DATA FORMAT (FINAL & LOCKED)**.
Each row represents **ONE witness statement segment** with these populated fields:
- `statement_id`
- `witness_id`
- `stage`
- `raw_text`
- `cleaned_text`

All entity and analysis fields are included but left empty for downstream modules:
- entity fields: `[]`
- analysis fields: `None`

## Processing Logic (High Level)
1. Load witness Q&A CSV.
2. Create `statement_id` for each row.
3. Construct `raw_text` by combining question + answer without modifying wording.
4. Create `cleaned_text` by applying lightweight normalization:
   - lowercasing
   - whitespace cleanup
   - stripping extra spaces
5. Export structured output CSV.

## One-line viva defense
“This module prepares unstructured courtroom documents by segmenting and normalizing witness statements into structured text units suitable for downstream analysis.”
"""

with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_text)

print("README updated ✅")


README updated ✅
