In [15]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.cluster import KMeans
from tensorflow.keras.layers import Input, Reshape, Conv1D, Conv1DTranspose, Dense, MaxPooling1D,Flatten, UpSampling1D,Embedding, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn import metrics
from tensorflow.keras.utils import normalize, to_categorical
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam, RMSprop, Adagrad, SGD
from tensorflow.keras.losses import MeanSquaredError
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import re
import string
import sys
import nltk
import joblib
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer
from keras.callbacks import EarlyStopping,  ModelCheckpoint, ReduceLROnPlateau

In [16]:
from tensorflow.keras.models import load_model
import pickle
# Load the CAE model
autoencoder = load_model("Final_cae_model_Result16.keras")

In [17]:
# Load the pre-trained model from disk
loaded_kmeans = joblib.load('kmeans_model_final8.pkl')

In [18]:
import PyPDF2
import json

# Open the PDF file
with open('report.pdf', 'rb') as pdf_file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Get the total number of pages
    num_pages = len(pdf_reader.pages)

    # Convert PDF to JSON
    data = []
    for page in pdf_reader.pages:
        data.append(page.extract_text())
   


with open('pdf_to_json.json', 'w') as f:
    json.dump(data, f)

# Load JSON data
with open('pdf_to_json.json') as f:
    pdf_data = json.load(f)

# Extract comments
comments = []
keyword = "Comment"
min_comment_length = 20
for page in pdf_data:
    if keyword in page:
        comments_on_page = page.split(keyword)[1:]
        for comment in comments_on_page:
            comment = comment.strip()
            if '\n' in comment:
                comment = comment.split('\n')[0]
            if ' C:\\' in comment:
                continue
            comment = re.sub(r'^\d+\s*', '', comment)
            if comment not in comments and len(comment) >= min_comment_length:
                comments.append(comment)

print(comments)

['Car-To-Car Rear braking: Approach to braking target with 50 km/h.', 'The distance defined via Alias leads to a distance of 40 m between front bumper of ASM vehicle and rear bumper of fellow vehicle.', 'Evaluation CustomVerifying, whether the normalized score of the AEB system exceeds the value 0.5. The randomly chosen value is not part of the', 'Verifying, whether the normalized score of the AEB system exceeds the value 0.5. The randomly chosen value is not part of the NCAP assessment protocol.', 'Verifying, whether the lateral deviation is within tolerance.Test: CCRTest', 'The distance defined via Alias leads to a distance of 12 m between front bumper of ASM vehicle and rear bumper of fellow vehicle.', 'Verifying, whether the lateral deviation is within tolerance.Concrete Test Case: 1']


In [19]:
max_length = 48
"""Function for preprocessing the text."""
stemmer = SnowballStemmer('english')
def preprocess_text(text, stop_words):
    
    # Check if the text is not a string (e.g., NaN) and return an empty string in such cases
    if not isinstance(text, str):
        return ''
    
     # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', str(text))
    text = re.sub(r'\d+', '', text)

    # Tokenize text and remove stop words
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text
 

In [20]:
tokenizer = Tokenizer()
stop_words = set(stopwords.words('english'))
comments_pre = [preprocess_text(comment, stop_words) for comment in comments]
print(comments_pre)
# Convert the external text data into sequences of integers
X_external = tokenizer.texts_to_sequences(comments_pre)

# Pad the sequences to have the same length
encoded_external = pad_sequences(X_external, padding='post', maxlen=max_length)
new_seq= np.expand_dims(encoded_external, axis=-1)
reconstructed_text= autoencoder.predict(new_seq)
exp_reconstructed_text = np.reshape(reconstructed_text, newshape=(reconstructed_text.shape[0], -1))
y_ext_pred = loaded_kmeans.predict(exp_reconstructed_text)


['cartocar rear braking approach braking target kmh', 'distance defined via alias leads distance front bumper asm vehicle rear bumper fellow vehicle', 'evaluation customverifying whether normalized score aeb system exceeds value randomly chosen value part', 'verifying whether normalized score aeb system exceeds value randomly chosen value part ncap assessment protocol', 'verifying whether lateral deviation within tolerancetest ccrtest', 'distance defined via alias leads distance front bumper asm vehicle rear bumper fellow vehicle', 'verifying whether lateral deviation within toleranceconcrete test case']


In [36]:
# Define a dictionary to map cluster labels to text labels
label_map = {0: 'Hardware Failure', 1: 'Software Failure', 2:'Sensor Failure', 3:'network Failure'}

# Convert the cluster labels from numbers to text
text_labels = [label_map[label] for label in y_ext_pred]

# Print the cluster labels as text
for i in range(len(comments)):
    print('{}: {}'.format(comments[i], text_labels[i]))

Car-To-Car Rear braking: Approach to braking target with 50 km/h.: Sensor Failure
The distance defined via Alias leads to a distance of 40 m between front bumper of ASM vehicle and rear bumper of fellow vehicle.: Sensor Failure
Evaluation CustomVerifying, whether the normalized score of the AEB system exceeds the value 0.5. The randomly chosen value is not part of the: Sensor Failure
Verifying, whether the normalized score of the AEB system exceeds the value 0.5. The randomly chosen value is not part of the NCAP assessment protocol.: Sensor Failure
Verifying, whether the lateral deviation is within tolerance.Test: CCRTest: Sensor Failure
The distance defined via Alias leads to a distance of 12 m between front bumper of ASM vehicle and rear bumper of fellow vehicle.: Sensor Failure
Verifying, whether the lateral deviation is within tolerance.Concrete Test Case: 1: Sensor Failure
