In [2]:
# Library to interact with the OS
import os

# Libraries for reading and manipulating data
import numpy as np
import pandas as pd

# Libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.4f' % x)


In [4]:
df_train=pd.read_csv('augmented_train.csv')

In [6]:
df_train.head(2)

Unnamed: 0,Potential Accident Level,Gender,Country_Country_01,Country_Country_02,Country_Country_03,Local_Local_01,Local_Local_02,Local_Local_03,Local_Local_04,Local_Local_05,...,Critical Risk_Projection/Burning,Critical Risk_Projection/Choco,Critical Risk_Projection/Manual Tools,Critical Risk_Suspended Loads,Critical Risk_Traffic,Critical Risk_Vehicles and Mobile Equipment,Critical Risk_Venomous Animals,Critical Risk_remains of choco,Description,Accident Level
0,4,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,By manually moving a steel cabinet for disposa...,3
1,4,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Once the mooring of the faneles in the detonat...,1


In [8]:
df_test=pd.read_csv('augmented_test.csv')

In [10]:
df_test.head(2)

Unnamed: 0,Potential Accident Level,Gender,Description,Country_Country_01,Country_Country_02,Country_Country_03,Local_Local_01,Local_Local_02,Local_Local_03,Local_Local_04,...,Critical Risk_Projection of fragments,Critical Risk_Projection/Burning,Critical Risk_Projection/Choco,Critical Risk_Projection/Manual Tools,Critical Risk_Suspended Loads,Critical Risk_Traffic,Critical Risk_Vehicles and Mobile Equipment,Critical Risk_Venomous Animals,Critical Risk_remains of choco,Accident Level
0,1,0,On 02/03/17 during the soil sampling in the re...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,4,0,During execution of drilling on the target - B...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [16]:
# to use regular expressions for manipulating text data
import re

# to load the natural language toolkit
import nltk
nltk.download('stopwords')    # loading the stopwords
nltk.download('wordnet')  

# to remove common stop words
from nltk.corpus import stopwords

# to perform stemming
from nltk.stem.porter import PorterStemmer

# to create Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nehag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nehag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Removing non-alphanumeric chars
df_train['Description_T'] = df_train['Description'].apply(lambda x: ''.join(re.sub('[^A-Za-z0-9]+', ' ', str(x))))
df_test['Description_T'] = df_test['Description'].apply(lambda x: ''.join(re.sub('[^A-Za-z0-9]+', ' ', str(x))))

In [20]:
#removing digits
df_train['Description_T'] = df_train['Description_T'].apply(lambda x: ''.join(re.sub(r'\d+',' ', str(x))))
df_test['Description_T'] = df_test['Description_T'].apply(lambda x: ''.join(re.sub(r'\d+',' ', str(x))))

In [22]:
# To lowercase
df_train['Description_T'] = df_train['Description_T'].apply(lambda x: str(x).lower())
df_test['Description_T'] = df_test['Description_T'].apply(lambda x: str(x).lower())

In [24]:
df_train['Description_T'] = df_train['Description_T'].str.strip()
df_test['Description_T'] = df_test['Description_T'].str.strip()

In [26]:
# Stopword removal
df_train['Description_T'] = df_train['Description_T'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

# Stopword removal
df_test['Description_T'] = df_test['Description_T'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

In [28]:
df_train.loc[0:10, ['Description', 'Description_T']]

Unnamed: 0,Description,Description_T
0,By manually moving a steel cabinet for disposa...,manually moving steel cabinet disposal help an...
1,Once the mooring of the faneles in the detonat...,mooring faneles detonating cord completed inju...
2,When performing cleaning activity of the area ...,performing cleaning activity area near tc grin...
3,The technician was doing the magnetometric sur...,technician magnetometric survey stepped thorn ...
4,The operator cleaned with spatula spear throug...,operator cleaned spatula spear one windows boi...
5,"On 02/25/2017 at 13:05 p.m., when assisting wi...",p assisting gps magnetometric collaborator gil...
6,"On 05/02/2017, at 10:40 p.m., when performing ...",p performing geological mapping activity geolo...
7,"In phase III of the concentrator plant, the ma...",phase iii concentrator plant maintenance perso...
8,"While preparing to mount polypropylene tubing,...",preparing mount polypropylene tubing employee ...
9,After the welder completed the welding work to...,welder completed welding work reinforce forms ...


In [30]:
df_test.loc[0:10, ['Description', 'Description_T']]

Unnamed: 0,Description,Description_T
0,On 02/03/17 during the soil sampling in the re...,soil sampling region sta employees rafael dani...
1,During execution of drilling on the target - B...,execution drilling target bolt brjcldd made co...
2,"The operator was in the center (Demag IV), per...",operator center demag iv performing maintenanc...
3,The employee was transiting toward the cadmium...,employee transiting toward cadmium factory nea...
4,"Being 12:20 hours Approximately, in circumstan...",hours approximately circumstances administrati...
5,"At Rp 050 of level 1620, in circumstances wher...",rp level circumstances workers company perform...
6,When observing the pulp overflow of the overfl...,observing pulp overflow overflow reception dra...
7,During the execution of the soil sampling task...,execution soil sampling task potions area arou...
8,In moments that the truck of transport of pers...,moments truck transport personnel company mcei...
9,Employee moved toward the structure of post 10...,employee moved toward structure post came step...


In [32]:
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df_train['Description_WL'] = df_train.apply(lambda row: nltk.word_tokenize(row['Description_T']), axis=1)
def lemmatize_list(words):
    new_words = []
    for word in words:
      new_words.append(lemmatizer.lemmatize(word, pos='v'))
    return ' '.join(new_words)
df_train['Description_WL'] = df_train.apply(lambda x: lemmatize_list(x['Description_WL']), axis=1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nehag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
df_train.loc[0:10, ['Description', 'Description_WL']]

Unnamed: 0,Description,Description_WL
0,By manually moving a steel cabinet for disposa...,manually move steel cabinet disposal help anot...
1,Once the mooring of the faneles in the detonat...,moor faneles detonate cord complete injure per...
2,When performing cleaning activity of the area ...,perform clean activity area near tc grind empl...
3,The technician was doing the magnetometric sur...,technician magnetometric survey step thorn rea...
4,The operator cleaned with spatula spear throug...,operator clean spatula spear one windows boile...
5,"On 02/25/2017 at 13:05 p.m., when assisting wi...",p assist gps magnetometric collaborator gilv n...
6,"On 05/02/2017, at 10:40 p.m., when performing ...",p perform geological map activity geologist ma...
7,"In phase III of the concentrator plant, the ma...",phase iii concentrator plant maintenance perso...
8,"While preparing to mount polypropylene tubing,...",prepare mount polypropylene tube employee pull...
9,After the welder completed the welding work to...,welder complete weld work reinforce form deepe...


In [36]:
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df_test['Description_WL'] = df_test.apply(lambda row: nltk.word_tokenize(row['Description_T']), axis=1)
def lemmatize_list(words):
    new_words = []
    for word in words:
      new_words.append(lemmatizer.lemmatize(word, pos='v'))
    return ' '.join(new_words)
df_test['Description_WL'] = df_test.apply(lambda x: lemmatize_list(x['Description_WL']), axis=1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nehag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
df_test.loc[0:10, ['Description', 'Description_WL']]

Unnamed: 0,Description,Description_WL
0,On 02/03/17 during the soil sampling in the re...,soil sample region sta employees rafael danill...
1,During execution of drilling on the target - B...,execution drill target bolt brjcldd make compa...
2,"The operator was in the center (Demag IV), per...",operator center demag iv perform maintenance t...
3,The employee was transiting toward the cadmium...,employee transit toward cadmium factory near t...
4,"Being 12:20 hours Approximately, in circumstan...",hours approximately circumstances administrati...
5,"At Rp 050 of level 1620, in circumstances wher...",rp level circumstances workers company perform...
6,When observing the pulp overflow of the overfl...,observe pulp overflow overflow reception drawe...
7,During the execution of the soil sampling task...,execution soil sample task potions area around...
8,In moments that the truck of transport of pers...,moments truck transport personnel company mcei...
9,Employee moved toward the structure of post 10...,employee move toward structure post come step ...


In [40]:
df_train1=df_train.copy()
df_test1=df_test.copy()

In [42]:
from nltk.tokenize import word_tokenize, RegexpTokenizer

# RegexpTokenizer
regexp = RegexpTokenizer("[\w']+")

In [44]:
# Text normalization for Word2Vec
for df in [df_train1,df_test1]:
    df['tokens'] = (df["Description_WL"].apply(regexp.tokenize))
df_train1[['tokens', 'Accident Level']]

Unnamed: 0,tokens,Accident Level
0,"[manually, move, steel, cabinet, disposal, hel...",3
1,"[moor, faneles, detonate, cord, complete, inju...",1
2,"[perform, clean, activity, area, near, tc, gri...",3
3,"[technician, magnetometric, survey, step, thor...",1
4,"[operator, clean, spatula, spear, one, windows...",1
...,...,...
1150,"[approximately, hours, routine, maintenance, o...",5
1151,"[hours, routine, maintenance, operation, secto...",5
1152,"[installation, activity, meter, diameter, vent...",5
1153,"[maintenance, activity, conveyor, belt, feed, ...",5


In [150]:
#word2vec

In [46]:
import gensim
from gensim.models import Word2Vec

In [48]:
# Loading the pre-trained Word2Vec model
word2vec_path = 'GoogleNews-vectors-negative300.bin'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [50]:
# Some useful functions for Word2Vec
def get_average_word2vec(tokens_list, vector, generate_missing = False, k = 300):
    if len(tokens_list) < 1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis = 0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, tokens, generate_missing = False):
    embeddings = tokens.apply(lambda x: get_average_word2vec(x, vectors, generate_missing = generate_missing))
    return list(embeddings)

In [52]:
# Word2Vec embedding
df_train_embed = get_word2vec_embeddings(word2vec, df_train1['tokens'])
df_test_embed = get_word2vec_embeddings(word2vec, df_test1['tokens'])

In [54]:
import scipy
from scipy import sparse
from scipy.sparse import csr_matrix

In [56]:
df_train_w2v = scipy.sparse.csr_matrix(df_train_embed)
df_test_w2v = scipy.sparse.csr_matrix(df_test_embed )

In [58]:
W2Vec_Data_train=pd.DataFrame(df_train_w2v.todense(), columns=['Word2vec_'+str(i) for i in range(300)])
W2Vec_Data_test=pd.DataFrame(df_test_w2v.todense(), columns=['Word2vec_'+str(i) for i in range(300)])

In [60]:
W2Vec_Data_train.head()

Unnamed: 0,Word2vec_0,Word2vec_1,Word2vec_2,Word2vec_3,Word2vec_4,Word2vec_5,Word2vec_6,Word2vec_7,Word2vec_8,Word2vec_9,...,Word2vec_290,Word2vec_291,Word2vec_292,Word2vec_293,Word2vec_294,Word2vec_295,Word2vec_296,Word2vec_297,Word2vec_298,Word2vec_299
0,-0.0389,0.0461,-0.0325,0.0007,-0.0464,-0.0107,0.1132,-0.1115,0.1225,0.0791,...,-0.0428,0.0826,-0.0809,-0.0216,-0.0228,0.0115,0.01,-0.024,-0.0811,-0.021
1,-0.0248,0.0418,0.0107,-0.0078,-0.0484,-0.0242,0.0367,-0.0591,0.0857,0.0545,...,-0.0258,0.0576,-0.126,0.0144,-0.0145,-0.0432,-0.1008,-0.0719,0.0223,-0.043
2,-0.0357,0.0737,0.0062,0.013,-0.0658,0.0373,-0.0165,-0.089,0.0177,0.0333,...,-0.0326,0.0782,-0.059,0.0027,0.0,-0.0219,-0.1034,-0.0347,0.0073,-0.001
3,0.0154,-0.0132,-0.0564,-0.0045,-0.1166,-0.01,-0.05,-0.1157,0.074,0.083,...,-0.0828,0.0362,-0.0542,0.0206,-0.0722,-0.042,-0.1186,-0.0327,0.0566,0.0013
4,0.0405,0.0423,0.0511,-0.0112,-0.0855,0.0037,0.12,-0.0999,0.1327,0.1117,...,-0.0617,0.057,-0.0939,0.0174,-0.0134,-0.0347,0.0015,-0.0399,-0.0435,-0.0253


In [62]:
W2Vec_Data_test.head()

Unnamed: 0,Word2vec_0,Word2vec_1,Word2vec_2,Word2vec_3,Word2vec_4,Word2vec_5,Word2vec_6,Word2vec_7,Word2vec_8,Word2vec_9,...,Word2vec_290,Word2vec_291,Word2vec_292,Word2vec_293,Word2vec_294,Word2vec_295,Word2vec_296,Word2vec_297,Word2vec_298,Word2vec_299
0,-0.0224,0.0097,0.0366,0.0703,-0.0811,-0.0011,0.0285,-0.0886,0.0913,0.0646,...,-0.0071,0.0465,-0.0472,0.0495,-0.0082,-0.0025,-0.0031,-0.0278,-0.0715,-0.0265
1,-0.0134,0.0514,-0.0236,0.0503,-0.116,-0.0236,0.0536,-0.1049,0.0687,0.0651,...,-0.0321,0.0561,-0.117,0.0134,-0.0415,-0.0278,0.0023,-0.0106,-0.0183,-0.0489
2,0.029,0.0119,0.006,-0.0184,-0.1194,0.0163,0.0909,-0.1275,0.0398,0.0038,...,0.0057,0.107,-0.0471,0.0509,0.0124,-0.04,-0.0232,0.0038,-0.0473,-0.0219
3,-0.0129,0.0024,0.0409,0.1106,-0.0532,-0.0104,0.0598,-0.1222,0.0934,0.1503,...,-0.0272,-0.061,-0.0059,0.034,-0.0962,0.0453,-0.0034,0.0839,0.1053,-0.0908
4,0.0384,0.0092,0.0276,0.0308,-0.1075,-0.0074,0.0733,-0.1233,0.0524,0.0138,...,-0.0511,0.0617,-0.0995,0.049,-0.0089,-0.0495,0.0448,-0.0103,-0.0655,0.0057


In [64]:
W2Vec_Data_train.shape

(1155, 300)

In [66]:
W2Vec_Data_test.shape

(105, 300)

In [68]:
final_dataset_train = df_train1[['Accident Level']].join(W2Vec_Data_train.reset_index(drop=True))

In [70]:
final_dataset_train.head()

Unnamed: 0,Accident Level,Word2vec_0,Word2vec_1,Word2vec_2,Word2vec_3,Word2vec_4,Word2vec_5,Word2vec_6,Word2vec_7,Word2vec_8,...,Word2vec_290,Word2vec_291,Word2vec_292,Word2vec_293,Word2vec_294,Word2vec_295,Word2vec_296,Word2vec_297,Word2vec_298,Word2vec_299
0,3,-0.0389,0.0461,-0.0325,0.0007,-0.0464,-0.0107,0.1132,-0.1115,0.1225,...,-0.0428,0.0826,-0.0809,-0.0216,-0.0228,0.0115,0.01,-0.024,-0.0811,-0.021
1,1,-0.0248,0.0418,0.0107,-0.0078,-0.0484,-0.0242,0.0367,-0.0591,0.0857,...,-0.0258,0.0576,-0.126,0.0144,-0.0145,-0.0432,-0.1008,-0.0719,0.0223,-0.043
2,3,-0.0357,0.0737,0.0062,0.013,-0.0658,0.0373,-0.0165,-0.089,0.0177,...,-0.0326,0.0782,-0.059,0.0027,0.0,-0.0219,-0.1034,-0.0347,0.0073,-0.001
3,1,0.0154,-0.0132,-0.0564,-0.0045,-0.1166,-0.01,-0.05,-0.1157,0.074,...,-0.0828,0.0362,-0.0542,0.0206,-0.0722,-0.042,-0.1186,-0.0327,0.0566,0.0013
4,1,0.0405,0.0423,0.0511,-0.0112,-0.0855,0.0037,0.12,-0.0999,0.1327,...,-0.0617,0.057,-0.0939,0.0174,-0.0134,-0.0347,0.0015,-0.0399,-0.0435,-0.0253


In [74]:
final_dataset_test = df_test1[['Accident Level']].join(W2Vec_Data_test.reset_index(drop=True))

In [76]:
final_dataset_test.head()

Unnamed: 0,Accident Level,Word2vec_0,Word2vec_1,Word2vec_2,Word2vec_3,Word2vec_4,Word2vec_5,Word2vec_6,Word2vec_7,Word2vec_8,...,Word2vec_290,Word2vec_291,Word2vec_292,Word2vec_293,Word2vec_294,Word2vec_295,Word2vec_296,Word2vec_297,Word2vec_298,Word2vec_299
0,1,-0.0224,0.0097,0.0366,0.0703,-0.0811,-0.0011,0.0285,-0.0886,0.0913,...,-0.0071,0.0465,-0.0472,0.0495,-0.0082,-0.0025,-0.0031,-0.0278,-0.0715,-0.0265
1,3,-0.0134,0.0514,-0.0236,0.0503,-0.116,-0.0236,0.0536,-0.1049,0.0687,...,-0.0321,0.0561,-0.117,0.0134,-0.0415,-0.0278,0.0023,-0.0106,-0.0183,-0.0489
2,1,0.029,0.0119,0.006,-0.0184,-0.1194,0.0163,0.0909,-0.1275,0.0398,...,0.0057,0.107,-0.0471,0.0509,0.0124,-0.04,-0.0232,0.0038,-0.0473,-0.0219
3,1,-0.0129,0.0024,0.0409,0.1106,-0.0532,-0.0104,0.0598,-0.1222,0.0934,...,-0.0272,-0.061,-0.0059,0.034,-0.0962,0.0453,-0.0034,0.0839,0.1053,-0.0908
4,2,0.0384,0.0092,0.0276,0.0308,-0.1075,-0.0074,0.0733,-0.1233,0.0524,...,-0.0511,0.0617,-0.0995,0.049,-0.0089,-0.0495,0.0448,-0.0103,-0.0655,0.0057


In [78]:
final_dataset_test.shape

(105, 301)

In [80]:
final_dataset_train.to_csv("nlp_chatbot_word2vec_train.csv", 
                           index=False,header=True)

In [82]:
final_dataset_test.to_csv("nlp_chatbot_word2vec_test.csv", 
                          index=False,header=True)

In [84]:
#TF-IDF

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_df_train = pd.DataFrame()
for i in [1,2]:
    tfidf = TfidfVectorizer(max_features=50, stop_words='english',use_idf=True, ngram_range=(i,i))
    X = tfidf.fit_transform(df_train1['Description_WL']).toarray()
    tfs = pd.DataFrame(X, columns=["TFIDF_" + n for n in tfidf.get_feature_names_out()])
    tfidf_df_train = pd.concat([tfidf_df_train.reset_index(drop=True), tfs.reset_index(drop=True)], axis=1)

tfidf_df_train.head(5)

Unnamed: 0,TFIDF_accident,TFIDF_approximately,TFIDF_area,TFIDF_attempt,TFIDF_attention,TFIDF_belt,TFIDF_bruise,TFIDF_cause,TFIDF_conveyor,TFIDF_employee,...,TFIDF_routine inspection,TFIDF_routine maintenance,TFIDF_severe burn,TFIDF_severe injuries,TFIDF_severe lacerations,TFIDF_strike leave,TFIDF_sudden release,TFIDF_suffer severe,TFIDF_th routine,TFIDF_time accident
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2922,0.0,0.453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.3141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.3543,0.0,0.0,0.0,0.0,0.0,0.0,0.3319,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1847,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
final_dataset_tfidf_train= df_train1[['Accident Level']].join(tfidf_df_train.reset_index(drop=True))

In [90]:
final_dataset_tfidf_train.to_csv("nlp_chatbot_tfidf_train.csv", 
                                 index=False, header=True)

In [92]:
final_dataset_tfidf_train.shape

(1155, 101)

In [94]:
tfidf_df_test = pd.DataFrame()
X = tfidf.transform(df_test1['Description_WL']).toarray()
tfs = pd.DataFrame(X, columns=["TFIDF_" + n for n in tfidf.get_feature_names_out()])
tfidf_df_test = pd.concat([tfidf_df_test.reset_index(drop=True), tfs.reset_index(drop=True)], axis=1)
tfidf_df_test.head(5)

Unnamed: 0,TFIDF_approximately hours,TFIDF_approximately meter,TFIDF_attempt clear,TFIDF_attempt loosen,TFIDF_balance fall,TFIDF_belt employee,TFIDF_cause injury,TFIDF_cause lose,TFIDF_cause severe,TFIDF_concrete floor,...,TFIDF_routine inspection,TFIDF_routine maintenance,TFIDF_severe burn,TFIDF_severe injuries,TFIDF_severe lacerations,TFIDF_strike leave,TFIDF_sudden release,TFIDF_suffer severe,TFIDF_th routine,TFIDF_time accident
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.7718,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
tfidf_df_test.shape

(105, 50)

In [98]:
final_dataset_tfidf_test= df_test1[['Accident Level']].join(tfidf_df_test.reset_index(drop=True))

In [102]:
final_dataset_tfidf_test.to_csv("nlp_chatbot_tfidf_test.csv", 
                                index=False, header=True)

In [104]:
final_dataset_tfidf_test.shape

(105, 51)

In [106]:
final_dataset_tfidf_test.head()

Unnamed: 0,Accident Level,TFIDF_approximately hours,TFIDF_approximately meter,TFIDF_attempt clear,TFIDF_attempt loosen,TFIDF_balance fall,TFIDF_belt employee,TFIDF_cause injury,TFIDF_cause lose,TFIDF_cause severe,...,TFIDF_routine inspection,TFIDF_routine maintenance,TFIDF_severe burn,TFIDF_severe injuries,TFIDF_severe lacerations,TFIDF_strike leave,TFIDF_sudden release,TFIDF_suffer severe,TFIDF_th routine,TFIDF_time accident
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.7718,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
#Glove

In [108]:
import pandas as pd
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Load the GloVe model
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [112]:
# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
glove_model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [114]:
# Checking the size of the vocabulary
print("Length of the vocabulary is", len(glove_model.index_to_key))

Length of the vocabulary is 400000


In [116]:
# Retrieving the words present in the GloVe model's vocabulary
glove_words = glove_model.index_to_key

# Creating a dictionary of words and their corresponding vectors
glove_word_vector_dict = dict(zip(glove_model.index_to_key,list(glove_model.vectors)))

In [118]:
vec_size = 100

In [120]:
def average_vectorizer_GloVe(doc):
    # Initializing a feature vector for the sentence
    feature_vector = np.zeros((vec_size,), dtype="float64")

    # Creating a list of words in the sentence that are present in the model vocabulary
    words_in_vocab = [word for word in doc.split() if word in glove_words]

    # adding the vector representations of the words
    for word in words_in_vocab:
        feature_vector += np.array(glove_word_vector_dict[word])

    # Dividing by the number of words to get the average vector
    if len(words_in_vocab) != 0:
        feature_vector /= len(words_in_vocab)

    return feature_vector

In [122]:
# creating a dataframe of the vectorized documents
train_df = pd.DataFrame(df_train1['Description_WL'].apply(average_vectorizer_GloVe).tolist(), 
                        columns=['glove_'+str(i) for i in range(vec_size)])

test_df = pd.DataFrame(df_test1['Description_WL'].apply(average_vectorizer_GloVe).tolist(), 
                        columns=['glove_'+str(i) for i in range(vec_size)])

In [126]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155 entries, 0 to 1154
Data columns (total 100 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   glove_0   1155 non-null   float64
 1   glove_1   1155 non-null   float64
 2   glove_2   1155 non-null   float64
 3   glove_3   1155 non-null   float64
 4   glove_4   1155 non-null   float64
 5   glove_5   1155 non-null   float64
 6   glove_6   1155 non-null   float64
 7   glove_7   1155 non-null   float64
 8   glove_8   1155 non-null   float64
 9   glove_9   1155 non-null   float64
 10  glove_10  1155 non-null   float64
 11  glove_11  1155 non-null   float64
 12  glove_12  1155 non-null   float64
 13  glove_13  1155 non-null   float64
 14  glove_14  1155 non-null   float64
 15  glove_15  1155 non-null   float64
 16  glove_16  1155 non-null   float64
 17  glove_17  1155 non-null   float64
 18  glove_18  1155 non-null   float64
 19  glove_19  1155 non-null   float64
 20  glove_20  1155 non-null   flo

In [128]:
train_df.head()

Unnamed: 0,glove_0,glove_1,glove_2,glove_3,glove_4,glove_5,glove_6,glove_7,glove_8,glove_9,...,glove_90,glove_91,glove_92,glove_93,glove_94,glove_95,glove_96,glove_97,glove_98,glove_99
0,-0.2623,-0.0841,0.1282,-0.1997,-0.063,-0.0496,-0.2303,0.1156,0.1853,0.1863,...,0.1082,-0.111,-0.0686,0.1333,-0.0839,0.1923,0.3424,-0.2759,0.5752,-0.1874
1,-0.2027,0.1678,0.035,-0.0604,-0.1545,0.0998,-0.0327,0.1934,-0.1007,0.2192,...,-0.0453,0.2017,0.0147,0.0042,-0.0336,0.0225,-0.0656,-0.1288,0.4219,-0.1941
2,-0.4038,0.3504,-0.0543,0.0028,-0.2617,0.1594,-0.1096,0.1454,-0.1383,0.1892,...,-0.1371,0.2488,0.0567,0.0614,-0.0335,0.0448,-0.1588,-0.1819,0.5797,-0.1138
3,-0.146,0.0384,0.118,0.02,-0.3475,-0.2282,-0.0424,0.1417,-0.079,0.0452,...,-0.0822,-0.0289,-0.0698,0.1573,-0.1465,0.0872,0.2163,-0.1254,0.203,0.1634
4,-0.231,0.161,0.0229,-0.2509,-0.119,0.1436,-0.0991,0.1991,0.0684,0.2025,...,-0.0414,-0.0326,0.0487,-0.0031,0.0291,0.0168,0.0638,-0.1638,0.4409,-0.1961


In [130]:
test_df.head()

Unnamed: 0,glove_0,glove_1,glove_2,glove_3,glove_4,glove_5,glove_6,glove_7,glove_8,glove_9,...,glove_90,glove_91,glove_92,glove_93,glove_94,glove_95,glove_96,glove_97,glove_98,glove_99
0,-0.2038,0.0308,0.1292,-0.2621,-0.2052,0.1716,-0.0224,0.2435,-0.054,0.1468,...,0.0275,-0.0341,0.0555,0.0106,-0.2028,0.1686,0.0049,-0.1679,0.3551,-0.0071
1,-0.1566,0.0534,-0.0504,-0.1828,-0.0559,-0.1275,-0.1065,0.1858,-0.0243,0.1024,...,-0.1016,0.017,-0.1687,0.0072,-0.0843,-0.1431,-0.0242,-0.2812,0.4429,-0.0124
2,-0.3406,0.0817,0.0107,-0.055,-0.0234,-0.0894,-0.149,0.1673,0.0645,0.2124,...,0.0446,-0.0131,0.0044,-0.1081,-0.0302,0.0266,0.0457,-0.2696,0.5144,-0.0667
3,-0.0848,0.1955,0.0782,0.0013,0.0383,-0.3407,-0.0659,0.0959,0.0179,0.2631,...,0.0383,-0.2204,-0.3644,0.142,-0.212,0.0125,-0.054,-0.0064,0.5232,-0.0805
4,-0.2,0.0687,0.1034,-0.0566,-0.0721,0.279,-0.1755,0.145,0.0065,-0.0484,...,-0.2636,-0.0324,-0.0332,0.0336,-0.3325,-0.134,0.1489,-0.1896,0.6406,-0.0262


In [132]:
final_dataset_train_glove= df_train1[['Accident Level']].join(train_df.reset_index(drop=True))

In [136]:
final_dataset_train_glove.head()

Unnamed: 0,Accident Level,glove_0,glove_1,glove_2,glove_3,glove_4,glove_5,glove_6,glove_7,glove_8,...,glove_90,glove_91,glove_92,glove_93,glove_94,glove_95,glove_96,glove_97,glove_98,glove_99
0,3,-0.2623,-0.0841,0.1282,-0.1997,-0.063,-0.0496,-0.2303,0.1156,0.1853,...,0.1082,-0.111,-0.0686,0.1333,-0.0839,0.1923,0.3424,-0.2759,0.5752,-0.1874
1,1,-0.2027,0.1678,0.035,-0.0604,-0.1545,0.0998,-0.0327,0.1934,-0.1007,...,-0.0453,0.2017,0.0147,0.0042,-0.0336,0.0225,-0.0656,-0.1288,0.4219,-0.1941
2,3,-0.4038,0.3504,-0.0543,0.0028,-0.2617,0.1594,-0.1096,0.1454,-0.1383,...,-0.1371,0.2488,0.0567,0.0614,-0.0335,0.0448,-0.1588,-0.1819,0.5797,-0.1138
3,1,-0.146,0.0384,0.118,0.02,-0.3475,-0.2282,-0.0424,0.1417,-0.079,...,-0.0822,-0.0289,-0.0698,0.1573,-0.1465,0.0872,0.2163,-0.1254,0.203,0.1634
4,1,-0.231,0.161,0.0229,-0.2509,-0.119,0.1436,-0.0991,0.1991,0.0684,...,-0.0414,-0.0326,0.0487,-0.0031,0.0291,0.0168,0.0638,-0.1638,0.4409,-0.1961


In [144]:
final_dataset_train_glove.shape

(1155, 101)

In [138]:
final_dataset_test_glove= df_test1[['Accident Level']].join(test_df.reset_index(drop=True))

In [140]:
final_dataset_test_glove.head()

Unnamed: 0,Accident Level,glove_0,glove_1,glove_2,glove_3,glove_4,glove_5,glove_6,glove_7,glove_8,...,glove_90,glove_91,glove_92,glove_93,glove_94,glove_95,glove_96,glove_97,glove_98,glove_99
0,1,-0.2038,0.0308,0.1292,-0.2621,-0.2052,0.1716,-0.0224,0.2435,-0.054,...,0.0275,-0.0341,0.0555,0.0106,-0.2028,0.1686,0.0049,-0.1679,0.3551,-0.0071
1,3,-0.1566,0.0534,-0.0504,-0.1828,-0.0559,-0.1275,-0.1065,0.1858,-0.0243,...,-0.1016,0.017,-0.1687,0.0072,-0.0843,-0.1431,-0.0242,-0.2812,0.4429,-0.0124
2,1,-0.3406,0.0817,0.0107,-0.055,-0.0234,-0.0894,-0.149,0.1673,0.0645,...,0.0446,-0.0131,0.0044,-0.1081,-0.0302,0.0266,0.0457,-0.2696,0.5144,-0.0667
3,1,-0.0848,0.1955,0.0782,0.0013,0.0383,-0.3407,-0.0659,0.0959,0.0179,...,0.0383,-0.2204,-0.3644,0.142,-0.212,0.0125,-0.054,-0.0064,0.5232,-0.0805
4,2,-0.2,0.0687,0.1034,-0.0566,-0.0721,0.279,-0.1755,0.145,0.0065,...,-0.2636,-0.0324,-0.0332,0.0336,-0.3325,-0.134,0.1489,-0.1896,0.6406,-0.0262


In [142]:
final_dataset_test_glove.shape

(105, 101)

In [146]:
final_dataset_train_glove.to_csv("nlp_chatbot_glove_train.csv", index=False,header=True)

In [148]:
final_dataset_test_glove.to_csv("nlp_chatbot_glove_test.csv", index=False,header=True)