In [1]:
import logging
import pandas as pd
import numpy as np

from numpy import random
import gensim

import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline

In [30]:
df = pd.read_csv('severity_dataset.csv', delimiter=';', names=['ID', 'Guideword', 'Hazard', 'Detailed_scenario', 'Hazardous_event', 'Severity', 'Severity_rationale'])

df = df[pd.notnull(df['Severity'])]
# shuffle the DataFrame rows
df = df.sample(frac=1).reset_index(drop=True)

# print(df.head(10))
# Calculate the total number of words in the 'Detailed scenario' column
df['Detailed_scenario'].apply(lambda x: len(x.split(' '))).sum()

4156

In [31]:
def print_plot(index):
    example = df[df.index == index][['Detailed_scenario', 'Severity']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Severity:', example[1])
print_plot(10)

"Approaching a T-intersection with yield signs, a car is in the adjacent lane"
Severity: S2


In [32]:
# Download necessary NLTK data
nltk.download('stopwords')

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
    text: a string
    
    return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # delete stopwords from text
    return text

# Assuming df is already defined as shown in your previous code
df['Detailed_scenario'] = df['Detailed_scenario'].apply(clean_text)
print(df.head(10))
# print(df.head(10)['Severity'])
# print(df.head(10)['Detailed scenario'])


    ID   Guideword                                             Hazard  \
0    7  Commission  CAEM produces a lateral motion request when th...   
1  182  Commission  CAEM produces a lateral motion request when th...   
2   35    Omission  CAEM fails to produce a lateral motion request...   
3  158  Commission  CAEM produces a lateral motion request when th...   
4   23    Omission  CAEM fails to produce a lateral motion request...   
5  148  Commission  CAEM produces a lateral motion request when th...   
6   93    Omission  CAEM fails to produce a lateral motion request...   
7   16  Commission  CAEM produces a lateral motion request when th...   
8   52    Omission  CAEM fails to produce a lateral motion request...   
9  196  Commission  CAEM produces a lateral motion request when th...   

                                   Detailed_scenario  \
0  approaching tintersection signalized pedestria...   
1  driving multilane wet road low friction passin...   
2  vehicle entering roundabo

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\malsha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
df['Detailed_scenario'].apply(lambda x: len(x.split(' '))).sum()

2490

In [38]:
X = df.Detailed_scenario
y = df.Severity
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

Naive Bayes Classifier for Multinomial Model

In [45]:
# Naive Bayes Classifier for Multinomial Models

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
import time

my_tags = ['S0','S1','S2','S3']


nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)


# Print the execution time


# %%time
from sklearn.metrics import classification_report
start_time = time.time()
y_pred = nb.predict(X_test)
end_time = time.time()

print("Execution time: %s seconds" % (end_time - start_time))
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

Execution time: 0.001065969467163086 seconds
accuracy 0.6527777777777778
              precision    recall  f1-score   support

          S0       0.00      0.00      0.00         2
          S1       1.00      0.22      0.36         9
          S2       0.64      1.00      0.78        45
          S3       0.00      0.00      0.00        16

    accuracy                           0.65        72
   macro avg       0.41      0.31      0.29        72
weighted avg       0.53      0.65      0.53        72



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Linear Support Vector Machine

In [47]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)


start_time = time.time()
y_pred = sgd.predict(X_test)
end_time = time.time()

print("Execution time: %s seconds" % (end_time - start_time))
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

Execution time: 0.007984638214111328 seconds
accuracy 0.75
              precision    recall  f1-score   support

          S0       0.00      0.00      0.00         2
          S1       0.57      0.44      0.50         9
          S2       0.75      0.93      0.83        45
          S3       0.89      0.50      0.64        16

    accuracy                           0.75        72
   macro avg       0.55      0.47      0.49        72
weighted avg       0.74      0.75      0.72        72



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression

In [48]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

start_time = time.time()
y_pred = logreg.predict(X_test)

end_time = time.time()

print("Execution time: %s seconds" % (end_time - start_time))

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

Execution time: 0.0 seconds
accuracy 0.7222222222222222
              precision    recall  f1-score   support

          S0       0.00      0.00      0.00         2
          S1       0.57      0.44      0.50         9
          S2       0.78      0.84      0.81        45
          S3       0.62      0.62      0.62        16

    accuracy                           0.72        72
   macro avg       0.49      0.48      0.48        72
weighted avg       0.70      0.72      0.71        72



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


word2vec and logistic regression


In [50]:
# from gensim.models import Word2Vec

# wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
# wv.init_sims(replace=True)

  wv.init_sims(replace=True)


In [52]:
# from itertools import islice

# list(islice(wv.index_to_key, 13030, 13050))

['Memorial_Hospital',
 'Seniors',
 'memorandum',
 'elephant',
 'Trump',
 'Census',
 'pilgrims',
 'De',
 'Dogs',
 '###-####_ext',
 'chaotic',
 'forgive',
 'scholar',
 'Lottery',
 'decreasing',
 'Supervisor',
 'fundamentally',
 'Fitness',
 'abundance',
 'Hold']

In [53]:
# def word_averaging(wv, words):
#     all_words, mean = set(), []
    
#     for word in words:
#         if isinstance(word, np.ndarray):
#             mean.append(word)
#         elif word in wv.vocab:
#             mean.append(wv.syn0norm[wv.vocab[word].index])
#             all_words.add(wv.vocab[word].index)

#     if not mean:
#         logging.warning("cannot compute similarity with no input %s", words)
#         # FIXME: remove these examples in pre-processing
#         return np.zeros(wv.vector_size,)

#     mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
#     return mean

# def  word_averaging_list(wv, text_list):
#     return np.vstack([word_averaging(wv, post) for post in text_list ])

In [54]:
# def w2v_tokenize_text(text):
#     tokens = []
#     for sent in nltk.sent_tokenize(text, language='english'):
#         for word in nltk.word_tokenize(sent, language='english'):
#             if len(word) < 2:
#                 continue
#             tokens.append(word)
#     return tokens
    
# train, test = train_test_split(df, test_size=0.3, random_state = 42)

# test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['Detailed_scenario']), axis=1).values
# train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['Detailed_scenario']), axis=1).values

# X_train_word_average = word_averaging_list(wv,train_tokenized)
# X_test_word_average = word_averaging_list(wv,test_tokenized)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\malsha/nltk_data'
    - 'c:\\Users\\malsha\\AppData\\Local\\Programs\\Python\\Python39\\nltk_data'
    - 'c:\\Users\\malsha\\AppData\\Local\\Programs\\Python\\Python39\\share\\nltk_data'
    - 'c:\\Users\\malsha\\AppData\\Local\\Programs\\Python\\Python39\\lib\\nltk_data'
    - 'C:\\Users\\malsha\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************
