In [9]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data files (only need to do this once)
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
file_path = r"C:\Users\sulai\Downloads\final_accident_data.csv"
df = pd.read_csv(file_path)

import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data files (only need to do this once)
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation except for specific characters (e.g., ':')
    text = text.translate(str.maketrans('', '', string.punctuation.replace(':', '')))
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Keep numbers and FIR numbers (e.g., P21, P18)
    tokens = [word for word in tokens if re.match(r'\d+|p\d+|:\d+|[a-zA-Z]+', word)]
    
    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

# Apply the cleaning function to the unstructured text column
df['Cleaned Text'] = df['Unstructured Text'].apply(clean_text)

# Save the cleaned data back to the same CSV file
df.to_csv(file_path, index=False)

# Example of cleaned text
print(df['Cleaned Text'].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sulai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sulai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sulai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sulai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    01 dec 2021 05:30 pm accident occurred thiruva...
1    31 dec 2024 06:30 accident occurred thiruvanan...
2    24 dec 2024 08:45 accident occurred thiruvanan...
3    01 jan 2023 02:15 pm accident occurred thiruva...
4    17 jan 2024 05:45 pm accident occurred thiruva...
Name: Cleaned Text, dtype: object


# Text Preprocessing for Accident Reports

## Overview
This script processes accident report data by cleaning and structuring the text for further analysis.

## Steps Involved

### 1. Loading the Dataset
- Reads a CSV file (`final_accident_data.csv`) containing accident reports.
- Assumes the dataset has a column named **"Unstructured Text"**, which contains raw accident descriptions.

### 2. Cleaning the Text
- **Converts to Lowercase**: Ensures uniformity by making all words lowercase.
- **Removes Punctuation**: Eliminates unnecessary symbols except for colons (`:`).
- **Tokenization**: Breaks the text into individual words (tokens).
- **Removes Stopwords**: Filters out common words (e.g., "the", "is", "and") that do not contribute much meaning.
- **Retains Relevant Numbers and FIR Patterns**: Keeps numbers, specific patterns like `P21`, and time-related values (`:15`).
- **Reconstructs Cleaned Text**: Joins the processed words back into a cleaned sentence.

### 3. Saving and Displaying Results
- Stores the cleaned text in a new column (**"Cleaned Text"**) in the same dataset.
- Saves the updated CSV file.
- Prints a few examples of cleaned text for verification.

## Purpose
This preprocessing prepares the text for further NLP tasks like classification, summarization, or chatbot integration by reducing noise while preserving key information.


In [11]:
# Add <start> and <end> tokens to the cleaned text and recommendation columns
df['Cleaned Text'] = df['Cleaned Text'].apply(lambda x: '<start> ' + x + ' <end>')
df['Recommendation'] = df['Recommendation'].apply(lambda x: '<start> ' + x + ' <end>')

# Adding Start and End Tokens to Processed Text

## Overview
This step enhances the dataset by adding special tokens (`<start>` and `<end>`) to mark the beginning and end of sentences in two key columns: **"Cleaned Text"** and **"Recommendation"**.

## Purpose of Start and End Tokens
- **Improves NLP Models**: Helps models understand sentence boundaries.
- **Useful for Seq2Seq Models**: Essential for training models like LSTMs or Transformers in text generation tasks.
- **Enhances Data Consistency**: Ensures structured input for processing.

## Implementation Details
- The **"Cleaned Text"** column is updated by adding:
  - `<start>` at the beginning.
  - `<end>` at the end.
- The **"Recommendation"** column is processed similarly.

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer for input (Cleaned Text) and output (Recommendation)
tokenizer = Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(df['Cleaned Text'].tolist() + df['Recommendation'].tolist())

# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(df['Cleaned Text'])
output_sequences = tokenizer.texts_to_sequences(df['Recommendation'])

# Padding sequences
max_len_input = 200  # Max length for input sequences
max_len_output = 50  # Max length for output sequences
input_padded = pad_sequences(input_sequences, maxlen=max_len_input, padding='post', truncating='post')
output_padded = pad_sequences(output_sequences, maxlen=max_len_output, padding='post', truncating='post')

In [15]:
from sklearn.model_selection import train_test_split

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(input_padded, output_padded, test_size=0.2, random_state=42)

# Tokenization and Sequence Preparation for NLP Model

## Overview
This step processes textual data by tokenizing, converting to sequences, padding, and splitting into training and testing datasets.

## Steps Involved:

### 1. **Tokenization**
- A `Tokenizer` is created to convert words into numerical representations.
- The special token `<unk>` is used for out-of-vocabulary (OOV) words.
- The tokenizer is trained on both the **"Cleaned Text"** (input) and **"Recommendation"** (output) columns.

### 2. **Text to Sequence Conversion**
- Each sentence in **"Cleaned Text"** and **"Recommendation"** is transformed into a sequence of numerical tokens.

### 3. **Padding Sequences**
- Since different sentences have different lengths, padding ensures consistency.
- **Maximum sequence length**:
  - **Input text:** 200 tokens
  - **Output text:** 50 tokens
- Padding is applied **post-sequence**, ensuring shorter sequences do not affect model processing.

### 4. **Splitting Dataset**
- The data is divided into **training (80%)** and **testing (20%)** sets.
- Ensures proper evaluation of the NLP model.

In [17]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
import pickle

# Load GloVe embeddings
glove_file = r"C:\Users\sulai\Downloads\NLP\glove.6B.100d.txt"
embeddings_index = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
embedding_dim = 100  # Dimension of GloVe embeddings
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

# Define the model
embedding_dim = 100  # Embedding dimension
lstm_units = 256  # Number of LSTM units
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size

# Encoder
encoder_inputs = Input(shape=(max_len_input,))
enc_emb = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder
decoder_inputs = Input(shape=(max_len_output - 1,))
dec_emb = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

# Prepare decoder input and output sequences
decoder_input_data = output_padded[:, :-1]  # Exclude the last token
decoder_output_data = output_padded[:, 1:]  # Exclude the first token

# Train the model
history = model.fit(
    [X_train, decoder_input_data], 
    decoder_output_data, 
    epochs=10, 
    batch_size=64, 
    validation_split=0.2
)

# Save the model and tokenizer
model.save('lstm_model.h5')
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

Epoch 1/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 378ms/step - loss: 4.9468 - val_loss: 0.4753
Epoch 2/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 363ms/step - loss: 0.4415 - val_loss: 0.3441
Epoch 3/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 370ms/step - loss: 0.3241 - val_loss: 0.2419
Epoch 4/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 352ms/step - loss: 0.2234 - val_loss: 0.1485
Epoch 5/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 327ms/step - loss: 0.1361 - val_loss: 0.0926
Epoch 6/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 330ms/step - loss: 0.0890 - val_loss: 0.0640
Epoch 7/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 328ms/step - loss: 0.0625 - val_loss: 0.0489
Epoch 8/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 320ms/step - loss: 0.0479 - val_loss: 0.0381
Epoch 9/10
[1m39/39[0m [32m━━



# LSTM-Based Sequence-to-Sequence Model for Text Generation

## Overview
This step builds and trains an **LSTM-based sequence-to-sequence model** for accident report analysis.  
It includes **GloVe embeddings**, an **encoder-decoder architecture**, and **sequence preparation**.

---

## **1. Loading Pretrained GloVe Embeddings**
- GloVe (Global Vectors for Word Representation) embeddings are used to create word vectors.
- The embeddings are loaded from a file (`glove.6B.100d.txt`) and stored in a dictionary.
- An **embedding matrix** is created to map each word in the vocabulary to its corresponding GloVe vector.

---

## **2. Defining the Encoder-Decoder Architecture**
### **Encoder**
- **Input:** Preprocessed accident report text (`Cleaned Text`).
- **Embedding Layer:** Uses **pretrained GloVe vectors**.
- **LSTM Layer:** Processes text and captures **contextual information**.
- **Hidden State & Cell State:** Passed to the decoder.

### **Decoder**
- **Input:** Recommendations (`Recommendation`).
- **Embedding Layer:** Uses the same **GloVe embedding matrix**.
- **LSTM Layer:** Generates sequential outputs with **attention to the encoder’s state**.
- **Dense Layer:** Outputs predictions using a **softmax activation function**.

---

## **3. Model Summary**


In [43]:
# Assuming you have trained your model and it is stored in the variable `model`
model.save('lstm_model.keras')

In [21]:
# Define encoder and decoder models for inference
encoder_model_inf = Model(encoder_inputs, [state_h, state_c])

decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(dec_emb, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model_inf = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Inference Setup for Sequence-to-Sequence Model

## **Overview**
Once the LSTM-based **encoder-decoder model** is trained, we need to set up **separate models for inference**.  
Inference models allow us to generate predictions **one word at a time**, using the encoder's output states as input for the decoder.

---

## **1. Defining the Encoder Model for Inference**
- The **encoder** takes an input sequence and generates **hidden states** (`state_h`) and **cell states** (`state_c`).
- These states will be used to initialize the **decoder** during inference.

```python
encoder_model_inf = Model(encoder_inputs, [state_h, state_c])


In [41]:
import json

patterns = {
    'District': r'\b(thiruvananthapuram city)\b',
    'PS Name': r'\b(vattiyoorkavu|vanchiyoor)\b',
    'Date Report': r'\b(\d{2} [a-z]{3} \d{4})\b',
    'Date Accident': r'\b(\d{2} [a-z]{3} \d{4})\b',
    'Time Accident': r'\b(\d{1,2}:\d{2} (?:am|pm))\b',
    'Accident type': r'\baccident type (\w+ injury|fatal)\b',
    'Death': r'\b(\d+) fatalities\b',
    'Grievous': r'\b(\d+) grievous injuries\b',
    'Minor': r'\b(\d+) minor injuries\b',
    'Pedestrian': r'\b(\d+) pedestrian\b',
    'Cyclist': r'\b(\d+) cyclist\b',
    'Place of Occurance': r'\baccident took place (\w+)\b',
    'Type Area': r'\b(\w+) area\b',
    'City/Town/Village': r'\b(\w+) (?:city|town|village)\b',
    'Lanes Road': r'\b(\w+) lanes\b',
    'Divider': r'\bdivider (\w+)\b',
    'Spot Accident': r'\bspot classification (\w+)\b',
    'Weather': r'\bweather conditions (\w+)\b',
    'Collision': r'\bcollision (\w+)\b',
    'Type Road': r'\broad type (\w+)\b',
    'Road Features': r'\bfeatures (\w+)\b',
    'Visibility': r'\bvisibility (\w+)\b',
    'Traffic Control': r'\btraffic control scene (\w+)\b',
    'Accussed Vehicle': r'\baccident involved (\w+)\b',
    'Victim Vehicle': r'\b(\w+) vehicle\b',
    'Recommendation': r'\brecommendation (.+?)\b'
}

with open('regex_patterns.json', 'w') as f:
    json.dump(patterns, f)

In [39]:
def extract_info(text, patterns):
    info = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            try:
                info[key] = match.group(1).strip()  # Capture the actual data
            except IndexError:
                print(f"Error in pattern for {key}: {pattern}")
                print(f"Text: {text}")
                info[key] = None
        else:
            info[key] = None
    return info

# Apply the extraction function to each row in the DataFrame
df['Summary'] = df['Cleaned Text'].apply(lambda x: extract_info(x, patterns))

# Format the extracted information into a summary
def format_summary(summary):
    formatted_summary = []
    for key, value in summary.items():
        if value:
            formatted_summary.append(f"{key}: {value}")
    return "\n".join(formatted_summary)

# Apply the formatting function to each summary
df['Formatted Summary'] = df['Summary'].apply(format_summary)

# Adjust display settings to show full content
pd.set_option('display.max_colwidth', None)

# Print the summaries
print(df['Formatted Summary'].head())

0              District: thiruvananthapuram city\nPS Name: vattiyoorkavu\nDate Report: 01 dec 2021\nDate Accident: 01 dec 2021\nTime Accident: 05:30 pm\nAccident type: minor injury\nDeath: 0\nGrievous: 0\nMinor: 2\nPlace of Occurance: moothakunnam\nCity/Town/Village: thiruvananthapuram\nSpot Accident: near\nWeather: sunnyclear\nType Road: national\nRoad Features: straight\nVisibility: road\nTraffic Control: uncontrolled\nAccussed Vehicle: tipper\nRecommendation: improve
1                                               District: thiruvananthapuram city\nPS Name: vanchiyoor\nDate Report: 31 dec 2024\nDate Accident: 31 dec 2024\nAccident type: fatal\nDeath: 1\nGrievous: 0\nMinor: 0\nPlace of Occurance: kavilnada\nCity/Town/Village: thiruvananthapuram\nSpot Accident: pedestrian\nWeather: sunnyclear\nType Road: national\nRoad Features: straight\nVisibility: road\nTraffic Control: uncontrolled\nAccussed Vehicle: motor\nRecommendation: improve
2    District: thiruvananthapuram city\nPS Name: v

In [35]:
# Display the first few rows of the cleaned text
print(df['Cleaned Text'].head())

0                                                        <start> 01 dec 2021 05:30 pm accident occurred thiruvananthapuram city jurisdiction vattiyoorkavu police station accident type minor injury 0 fatalities 0 grievous injuries 2 minor injuries accident took place moothakunnam rural weather conditions sunnyclear good visibility road type national highway features straight road accident involved tipper motor cycle spot classification near bus stop traffic control scene uncontrolled recommendation improve general road safety awareness enforcement <end>
1    <start> 31 dec 2024 06:30 accident occurred thiruvananthapuram city jurisdiction vanchiyoor police station accident type fatal 1 fatalities 0 grievous injuries 0 minor injuries accident took place kavilnada rural weather conditions sunnyclear good visibility road type national highway features straight road accident involved motor cycle motor cycle spot classification pedestrian crossing traffic control scene uncontrolled recommenda

In [27]:
# Example of how to format the summary
def format_summary(summary):
    formatted_summary = []
    for key, value in summary.items():
        if value:
            formatted_summary.append(f"{key}: {value}")
    return "\n".join(formatted_summary)

# Apply the formatting function to each summary
df['Formatted Summary'] = df['Summary'].apply(format_summary)

# Display the first few summaries
print(df['Formatted Summary'].head())

0    Accident type: accident type\nGrievous: grievo...
1    Accident type: accident type\nGrievous: grievo...
2    Accident type: accident type\nGrievous: grievo...
3    Accident type: accident type\nGrievous: grievo...
4    Accident type: accident type\nGrievous: grievo...
Name: Formatted Summary, dtype: object


In [29]:
import pandas as pd

# Set the display option to show more characters in each cell
pd.set_option('display.max_colwidth', None)  # Set to None to display the full content

# Now print the summaries
print(df['Formatted Summary'].head())

0                                                        Accident type: accident type\nGrievous: grievous\nMinor: minor\nWeather: weather\nVisibility: visibility\nTraffic Control: traffic control\nRecommendation: recommendation
1                                Accident type: accident type\nGrievous: grievous\nMinor: minor\nPedestrian: pedestrian\nWeather: weather\nVisibility: visibility\nTraffic Control: traffic control\nRecommendation: recommendation
2                                                        Accident type: accident type\nGrievous: grievous\nMinor: minor\nWeather: weather\nVisibility: visibility\nTraffic Control: traffic control\nRecommendation: recommendation
3    Accident type: accident type\nGrievous: grievous\nMinor: minor\nWeather: weather\nType Road: type road\nRoad Features: road features\nVisibility: visibility\nTraffic Control: traffic control\nRecommendation: recommendation
4                                                        Accident type: accident type\nG