In [9]:
# Library to interact with the OS
import os

# Libraries for reading and manipulating data
import numpy as np
import pandas as pd

# Libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.4f' % x)

PROJECT_DIR = os.path.join(os.path.dirname('preprocessing.ipynb'), os.pardir)

In [10]:
is_df = pd.read_excel('Data+Set+-+industrial_safety_and_health_database_with_accidents_description.xlsx')
is_df.head()

Unnamed: 0.1,Unnamed: 0,Data,Countries,Local,Industry Sector,Accident Level,Potential Accident Level,Genre,Employee or Third Party,Critical Risk,Description
0,0,2016-01-01,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed,While removing the drill rod of the Jumbo 08 f...
1,1,2016-01-02,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems,During the activation of a sodium sulphide pum...
2,2,2016-01-06,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools,In the sub-station MILPO located at level +170...
3,3,2016-01-08,Country_01,Local_04,Mining,I,I,Male,Third Party,Others,Being 9:45 am. approximately in the Nv. 1880 C...
4,4,2016-01-10,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others,Approximately at 11:45 a.m. in circumstances t...


# Overview

In [11]:
is_df.columns

Index(['Unnamed: 0', 'Data', 'Countries', 'Local', 'Industry Sector',
       'Accident Level', 'Potential Accident Level', 'Genre',
       'Employee or Third Party', 'Critical Risk', 'Description'],
      dtype='object')

In [12]:
# Dropping the index column
is_df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [13]:
# Duplicates
is_df.duplicated().sum()

7

In [14]:
# Dropping duplicates
is_df.drop_duplicates(inplace=True, ignore_index=True)

In [15]:
# Rename 'Data', 'Countries', 'Genre' columns in Data frame
is_df.rename(columns={'Data':'Date', 'Countries':'Country', 'Genre':'Gender'}, inplace=True)

# Treating Attribute data

In [16]:
# Label encoding
is_df['Gender'] = is_df['Gender'].apply(lambda x: {'Male': 0, 'Female': 1}[x])
is_df['Accident Level'] = is_df['Accident Level'].apply(lambda x: {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5}[x])
is_df['Potential Accident Level'] = is_df['Potential Accident Level'].apply(lambda x: {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6}[x])

is_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Date                      418 non-null    datetime64[ns]
 1   Country                   418 non-null    object        
 2   Local                     418 non-null    object        
 3   Industry Sector           418 non-null    object        
 4   Accident Level            418 non-null    int64         
 5   Potential Accident Level  418 non-null    int64         
 6   Gender                    418 non-null    int64         
 7   Employee or Third Party   418 non-null    object        
 8   Critical Risk             418 non-null    object        
 9   Description               418 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 32.8+ KB


In [17]:
# Dropping datetime info
is_df.drop(['Date'], axis=1, inplace=True)

In [18]:
# One-hot encoding
is_df = pd.get_dummies(is_df, columns=['Country', 'Local', 'Industry Sector', 'Employee or Third Party', 'Critical Risk'], dtype=np.int64)

In [19]:
is_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 58 columns):
 #   Column                                                   Non-Null Count  Dtype 
---  ------                                                   --------------  ----- 
 0   Accident Level                                           418 non-null    int64 
 1   Potential Accident Level                                 418 non-null    int64 
 2   Gender                                                   418 non-null    int64 
 3   Description                                              418 non-null    object
 4   Country_Country_01                                       418 non-null    int64 
 5   Country_Country_02                                       418 non-null    int64 
 6   Country_Country_03                                       418 non-null    int64 
 7   Local_Local_01                                           418 non-null    int64 
 8   Local_Local_02                          

In [20]:
is_df.head(2)

Unnamed: 0,Accident Level,Potential Accident Level,Gender,Description,Country_Country_01,Country_Country_02,Country_Country_03,Local_Local_01,Local_Local_02,Local_Local_03,...,Critical Risk_Projection,Critical Risk_Projection of fragments,Critical Risk_Projection/Burning,Critical Risk_Projection/Choco,Critical Risk_Projection/Manual Tools,Critical Risk_Suspended Loads,Critical Risk_Traffic,Critical Risk_Vehicles and Mobile Equipment,Critical Risk_Venomous Animals,Critical Risk_remains of choco
0,1,4,0,While removing the drill rod of the Jumbo 08 f...,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,4,0,During the activation of a sodium sulphide pum...,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Description

In [21]:
# to use regular expressions for manipulating text data
import re

# to load the natural language toolkit
import nltk
nltk.download('stopwords')    # loading the stopwords
nltk.download('wordnet')  

# to remove common stop words
from nltk.corpus import stopwords

# to perform stemming
from nltk.stem.porter import PorterStemmer

# to create Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prasana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Prasana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# To lowercase
is_df['Description_T'] = is_df['Description'].apply(lambda x: x.lower())

In [23]:
# Removing non-alphanumeric chars
is_df['Description_T'] = is_df['Description_T'].apply(lambda x: ''.join(re.sub('[^A-Za-z0-9]+', ' ', x)))

In [24]:
# Removing extra white spaces
is_df['Description_T'] = is_df['Description_T'].str.strip()

In [25]:
# Stopword removal
is_df['Description_T'] = is_df['Description_T'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

In [26]:
is_df.loc[0:10, ['Description', 'Description_T']]

Unnamed: 0,Description,Description_T
0,While removing the drill rod of the Jumbo 08 f...,removing drill rod jumbo 08 maintenance superv...
1,During the activation of a sodium sulphide pum...,activation sodium sulphide pump piping uncoupl...
2,In the sub-station MILPO located at level +170...,sub station milpo located level 170 collaborat...
3,Being 9:45 am. approximately in the Nv. 1880 C...,9 45 approximately nv 1880 cx 695 ob7 personne...
4,Approximately at 11:45 a.m. in circumstances t...,approximately 11 45 circumstances mechanics an...
5,During the unloading operation of the ustulado...,unloading operation ustulado bag need unclog d...
6,The collaborator reports that he was on street...,collaborator reports street 09 holding left ha...
7,"At approximately 04:50 p.m., when the mechanic...",approximately 04 50 p mechanic technician jos ...
8,Employee was sitting in the resting area at le...,employee sitting resting area level 326 raise ...
9,At the moment the forklift operator went to ma...,moment forklift operator went manipulate big b...


In [27]:
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
is_df['Description_WL'] = is_df.apply(lambda row: nltk.word_tokenize(row['Description_T']), axis=1)
def lemmatize_list(words):
    new_words = []
    for word in words:
      new_words.append(lemmatizer.lemmatize(word, pos='v'))
    return ' '.join(new_words)
is_df['Description_WL'] = is_df.apply(lambda x: lemmatize_list(x['Description_WL']), axis=1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Prasana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
is_df.loc[0:10, ['Description', 'Description_WL']]

Unnamed: 0,Description,Description_WL
0,While removing the drill rod of the Jumbo 08 f...,remove drill rod jumbo 08 maintenance supervis...
1,During the activation of a sodium sulphide pum...,activation sodium sulphide pump pip uncouple s...
2,In the sub-station MILPO located at level +170...,sub station milpo locate level 170 collaborato...
3,Being 9:45 am. approximately in the Nv. 1880 C...,9 45 approximately nv 1880 cx 695 ob7 personne...
4,Approximately at 11:45 a.m. in circumstances t...,approximately 11 45 circumstances mechanics an...
5,During the unloading operation of the ustulado...,unload operation ustulado bag need unclog disc...
6,The collaborator reports that he was on street...,collaborator report street 09 hold leave hand ...
7,"At approximately 04:50 p.m., when the mechanic...",approximately 04 50 p mechanic technician jos ...
8,Employee was sitting in the resting area at le...,employee sit rest area level 326 raise bore su...
9,At the moment the forklift operator went to ma...,moment forklift operator go manipulate big bag...


In [29]:
df=is_df.copy()

In [30]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

vec_king = wv['king']

In [39]:
# Count vectorization of text
from sklearn.feature_extraction.text import CountVectorizer
 
# Ticket Data
corpus = df['Description_WL'].values
 
# Creating the vectorizer
vectorizer = CountVectorizer(stop_words='english')
 
# Converting the text to numeric data
X = vectorizer.fit_transform(corpus)
 
#print(vectorizer.get_feature_names())
 
# Preparing Data frame For machine learning
# Priority column acts as a target variable and other columns as predictors
CountVectorizedData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(CountVectorizedData.shape)
CountVectorizedData.head()

(418, 2626)


Unnamed: 0,00,001,007,01,018,0183,02,020,02bp0166,02bp0167,...,young,z014,zaf,zamac,zaro,zero,zinc,zinco,zn,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [40]:
CountVectorizedData.shape

(418, 2626)

In [41]:
WordsVocab=CountVectorizedData.columns[:]

In [42]:
WordsVocab

Index(['00', '001', '007', '01', '018', '0183', '02', '020', '02bp0166',
       '02bp0167',
       ...
       'young', 'z014', 'zaf', 'zamac', 'zaro', 'zero', 'zinc', 'zinco', 'zn',
       'zone'],
      dtype='object', length=2626)

In [61]:
def FunctionText2Vec(inpTextData):
    # Converting the text to numeric data
    X = vectorizer.transform(inpTextData)
    CountVecData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    
    # Creating empty dataframe to hold sentences
    W2Vec_Data=pd.DataFrame()
    
    # Looping through each row for the data
    for i in range(CountVecData.shape[0]):
 
        # initiating a sentence with all zeros
        Sentence = np.zeros(300)
 
        # Looping thru each word in the sentence and if its present in 
        # the Word2Vec model then storing its vector
        for word in WordsVocab[CountVecData.iloc[i , :]>=1]:
            #print(word)
            if word in wv.key_to_index.keys():    
                Sentence=Sentence+wv[word]
        # Appending the sentence to the dataframe
    W2Vec_Data = pd.concat([W2Vec_Data, pd.DataFrame([Sentence], columns=['Word2vec_'+str(i) for i in range(300)])])
    return(W2Vec_Data)

In [62]:

W2Vec_Data = FunctionText2Vec(df['Description_WL'])
# Checking the new representation for sentences
W2Vec_Data.shape

(1, 300)

In [63]:
W2Vec_Data.head()

Unnamed: 0,Word2vec_0,Word2vec_1,Word2vec_2,Word2vec_3,Word2vec_4,Word2vec_5,Word2vec_6,Word2vec_7,Word2vec_8,Word2vec_9,...,Word2vec_290,Word2vec_291,Word2vec_292,Word2vec_293,Word2vec_294,Word2vec_295,Word2vec_296,Word2vec_297,Word2vec_298,Word2vec_299
0,-0.1698,0.5333,-0.8289,0.0984,-1.2468,0.4736,0.6614,-1.1595,1.1874,1.4158,...,-0.4964,1.0148,-0.4473,-1.1477,-0.9365,-1.0762,-0.9687,-0.9205,0.9557,0.1874


In [64]:
final_dataset= df[['Accident Level']].join(W2Vec_Data.reset_index(drop=True))

In [65]:
final_dataset.head()

Unnamed: 0,Accident Level,Word2vec_0,Word2vec_1,Word2vec_2,Word2vec_3,Word2vec_4,Word2vec_5,Word2vec_6,Word2vec_7,Word2vec_8,...,Word2vec_290,Word2vec_291,Word2vec_292,Word2vec_293,Word2vec_294,Word2vec_295,Word2vec_296,Word2vec_297,Word2vec_298,Word2vec_299
0,1,-0.1698,0.5333,-0.8289,0.0984,-1.2468,0.4736,0.6614,-1.1595,1.1874,...,-0.4964,1.0148,-0.4473,-1.1477,-0.9365,-1.0762,-0.9687,-0.9205,0.9557,0.1874
1,1,,,,,,,,,,...,,,,,,,,,,
2,1,,,,,,,,,,...,,,,,,,,,,
3,1,,,,,,,,,,...,,,,,,,,,,
4,4,,,,,,,,,,...,,,,,,,,,,


In [66]:
final_dataset.to_csv("nlp_chatbot_word2vec.csv", index=False,header=True)

In [67]:
#glove

In [68]:
# Converting the Stanford GloVe model vector format to word2vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [69]:
# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
glove_model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [70]:
# Checking the size of the vocabulary
print("Length of the vocabulary is", len(glove_model.index_to_key))

Length of the vocabulary is 400000


In [71]:
# Checking the word embedding of a random word
word = "drill"
glove_model[word]

array([-0.35765  ,  0.81467  , -0.2346   , -0.33073  , -0.55697  ,
       -0.91257  ,  0.13386  ,  0.89648  , -0.68417  ,  0.8944   ,
        0.33198  ,  0.52635  ,  0.54153  ,  0.38754  , -0.38877  ,
       -0.17014  , -0.24252  ,  0.39321  , -0.58885  , -0.44848  ,
        0.50495  ,  0.017128 , -0.040326 , -0.30586  , -0.024022 ,
        0.41875  , -1.1461   ,  0.8086   ,  0.12388  ,  0.21747  ,
       -0.94692  , -0.20167  ,  0.35933  ,  0.15633  , -0.039052 ,
       -0.46565  , -1.2225   , -0.48241  ,  0.65021  , -0.4558   ,
       -0.18787  , -0.05471  , -0.6474   , -0.25599  , -0.16758  ,
       -0.18455  , -0.40177  ,  0.22248  , -0.1685   , -0.60159  ,
       -0.54771  ,  0.37457  , -0.21604  ,  0.77251  , -0.32279  ,
       -0.94642  , -0.081026 , -0.87158  ,  1.3124   ,  1.0501   ,
        0.095171 ,  0.46109  ,  0.91197  ,  0.39826  ,  0.032378 ,
        0.25852  , -0.12024  , -0.29043  , -0.010148 ,  0.22901  ,
        0.26109  , -0.18566  ,  0.030369 ,  0.11884  ,  0.0568

In [72]:
# Retrieving the words present in the GloVe model's vocabulary
glove_words = glove_model.index_to_key

# Creating a dictionary of words and their corresponding vectors
glove_word_vector_dict = dict(zip(glove_model.index_to_key,list(glove_model.vectors)))

In [73]:
vec_size=100

In [74]:
def average_vectorizer_GloVe(doc):
    # Initializing a feature vector for the sentence
    feature_vector = np.zeros((vec_size,), dtype="float64")

    # Creating a list of words in the sentence that are present in the model vocabulary
    words_in_vocab = [word for word in doc.split() if word in glove_words]

    # adding the vector representations of the words
    for word in words_in_vocab:
        feature_vector += np.array(glove_word_vector_dict[word])

    # Dividing by the number of words to get the average vector
    if len(words_in_vocab) != 0:
        feature_vector /= len(words_in_vocab)

    return feature_vector


In [75]:
# creating a dataframe of the vectorized documents
df_Glove = pd.DataFrame(df['Description_WL'].apply(average_vectorizer_GloVe).tolist(), columns=['Feature '+str(i) for i in range(vec_size)])
df_Glove

Unnamed: 0,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,...,Feature 90,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99
0,-0.1625,0.0487,-0.0458,-0.1402,-0.1130,-0.2562,-0.1873,0.2322,-0.0300,0.2054,...,-0.1727,0.1455,-0.1539,0.0817,0.0238,-0.1169,0.0658,-0.2009,0.4236,0.0298
1,-0.2061,0.2139,0.0195,0.1100,-0.0053,0.0217,0.0134,0.2624,0.0805,0.1114,...,0.0846,0.0435,-0.1789,-0.0530,-0.2451,0.0030,0.1674,-0.1813,0.2512,-0.1116
2,-0.0924,0.1172,0.0111,-0.1880,-0.0294,-0.1723,-0.0027,0.0540,-0.0315,0.1624,...,-0.1076,0.0168,-0.0358,0.0563,-0.1914,0.0943,-0.0778,-0.1446,0.4481,-0.1807
3,-0.1104,0.0107,0.0346,-0.1647,0.0095,0.0411,-0.0815,0.1698,-0.1151,-0.0284,...,-0.1297,0.1507,0.0336,0.0388,-0.1150,-0.0624,0.0056,-0.1186,0.5266,-0.0177
4,-0.0868,0.0885,-0.0409,-0.1400,-0.1002,0.0170,0.0192,0.1989,-0.2522,0.0980,...,-0.0888,0.0979,-0.0422,-0.0041,-0.2148,0.0598,-0.0732,-0.2002,0.4524,-0.2539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,-0.1810,0.0279,0.0579,-0.1069,-0.1108,0.1351,0.0397,0.2164,-0.1588,0.1287,...,-0.1702,0.1627,0.0122,0.0135,-0.0752,-0.1596,0.1071,-0.0660,0.4796,-0.1835
414,0.0401,0.0343,-0.0289,-0.2505,-0.1131,-0.0513,-0.1521,0.0527,0.2271,0.1792,...,-0.0098,-0.0076,0.0587,0.0792,-0.1021,-0.0093,0.0498,-0.2020,0.3392,-0.1465
415,-0.1945,0.0703,0.2066,-0.2320,-0.1879,0.2346,-0.1698,0.2314,-0.0748,-0.0081,...,-0.0301,-0.0369,0.0068,-0.1114,-0.1557,0.1256,0.0006,-0.0119,0.5620,0.0362
416,-0.3636,0.1171,0.0194,-0.1601,0.0881,0.0131,-0.0950,0.2467,-0.0587,0.1210,...,-0.1149,0.1244,-0.0032,-0.1317,-0.2301,-0.0102,0.0731,-0.0958,0.4701,-0.0604


In [76]:
final_dataset_glove= df[['Accident Level']].join(df_Glove.reset_index(drop=True))

In [77]:
final_dataset_glove.to_csv("nlp_chatbot_glove.csv", index=False,header=True)

In [78]:
final_dataset_glove

Unnamed: 0,Accident Level,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,...,Feature 90,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99
0,1,-0.1625,0.0487,-0.0458,-0.1402,-0.1130,-0.2562,-0.1873,0.2322,-0.0300,...,-0.1727,0.1455,-0.1539,0.0817,0.0238,-0.1169,0.0658,-0.2009,0.4236,0.0298
1,1,-0.2061,0.2139,0.0195,0.1100,-0.0053,0.0217,0.0134,0.2624,0.0805,...,0.0846,0.0435,-0.1789,-0.0530,-0.2451,0.0030,0.1674,-0.1813,0.2512,-0.1116
2,1,-0.0924,0.1172,0.0111,-0.1880,-0.0294,-0.1723,-0.0027,0.0540,-0.0315,...,-0.1076,0.0168,-0.0358,0.0563,-0.1914,0.0943,-0.0778,-0.1446,0.4481,-0.1807
3,1,-0.1104,0.0107,0.0346,-0.1647,0.0095,0.0411,-0.0815,0.1698,-0.1151,...,-0.1297,0.1507,0.0336,0.0388,-0.1150,-0.0624,0.0056,-0.1186,0.5266,-0.0177
4,4,-0.0868,0.0885,-0.0409,-0.1400,-0.1002,0.0170,0.0192,0.1989,-0.2522,...,-0.0888,0.0979,-0.0422,-0.0041,-0.2148,0.0598,-0.0732,-0.2002,0.4524,-0.2539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1,-0.1810,0.0279,0.0579,-0.1069,-0.1108,0.1351,0.0397,0.2164,-0.1588,...,-0.1702,0.1627,0.0122,0.0135,-0.0752,-0.1596,0.1071,-0.0660,0.4796,-0.1835
414,1,0.0401,0.0343,-0.0289,-0.2505,-0.1131,-0.0513,-0.1521,0.0527,0.2271,...,-0.0098,-0.0076,0.0587,0.0792,-0.1021,-0.0093,0.0498,-0.2020,0.3392,-0.1465
415,1,-0.1945,0.0703,0.2066,-0.2320,-0.1879,0.2346,-0.1698,0.2314,-0.0748,...,-0.0301,-0.0369,0.0068,-0.1114,-0.1557,0.1256,0.0006,-0.0119,0.5620,0.0362
416,1,-0.3636,0.1171,0.0194,-0.1601,0.0881,0.0131,-0.0950,0.2467,-0.0587,...,-0.1149,0.1244,-0.0032,-0.1317,-0.2301,-0.0102,0.0731,-0.0958,0.4701,-0.0604


In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_df = pd.DataFrame()
for i in [1,2]:
    tfidf = TfidfVectorizer(max_features=50, stop_words='english',use_idf=True, ngram_range=(i,i))
    X = tfidf.fit_transform(is_df['Description_WL']).toarray()
    tfs = pd.DataFrame(X, columns=["TFIDF_" + n for n in tfidf.get_feature_names_out()])
    tfidf_df = pd.concat([tfidf_df.reset_index(drop=True), tfs.reset_index(drop=True)], axis=1)

tfidf_df.head(5)

Unnamed: 0,TFIDF_accident,TFIDF_activity,TFIDF_approximately,TFIDF_area,TFIDF_assistant,TFIDF_carry,TFIDF_cause,TFIDF_clean,TFIDF_collaborator,TFIDF_come,...,TFIDF_safety gloves,TFIDF_split set,TFIDF_support mesh,TFIDF_time accident,TFIDF_time event,TFIDF_transfer medical,TFIDF_use safety,TFIDF_wear safety,TFIDF_work area,TFIDF_worker wear
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.4739,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.2598,0.0,0.0,0.1695,0.0,0.5312,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.3317,0.0,0.3339,0.0,0.1956,0.0,0.0,0.3508,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.2979,0.0,0.0,0.0,0.1757,0.0,0.0,0.3151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
final_dataset= is_df.join(tfidf_df.reset_index(drop=True))

In [81]:
final_dataset.head()

Unnamed: 0,Accident Level,Potential Accident Level,Gender,Description,Country_Country_01,Country_Country_02,Country_Country_03,Local_Local_01,Local_Local_02,Local_Local_03,...,TFIDF_safety gloves,TFIDF_split set,TFIDF_support mesh,TFIDF_time accident,TFIDF_time event,TFIDF_transfer medical,TFIDF_use safety,TFIDF_wear safety,TFIDF_work area,TFIDF_worker wear
0,1,4,0,While removing the drill rod of the Jumbo 08 f...,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,4,0,During the activation of a sodium sulphide pum...,0,1,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,3,0,In the sub-station MILPO located at level +170...,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,0,Being 9:45 am. approximately in the Nv. 1880 C...,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,4,0,Approximately at 11:45 a.m. in circumstances t...,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
final_dataset.drop(['Description','Description_T','Description_WL'],axis=1,inplace=True)

In [83]:
final_dataset.columns

Index(['Accident Level', 'Potential Accident Level', 'Gender',
       'Country_Country_01', 'Country_Country_02', 'Country_Country_03',
       'Local_Local_01', 'Local_Local_02', 'Local_Local_03', 'Local_Local_04',
       ...
       'TFIDF_safety gloves', 'TFIDF_split set', 'TFIDF_support mesh',
       'TFIDF_time accident', 'TFIDF_time event', 'TFIDF_transfer medical',
       'TFIDF_use safety', 'TFIDF_wear safety', 'TFIDF_work area',
       'TFIDF_worker wear'],
      dtype='object', length=157)

In [84]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Columns: 157 entries, Accident Level to TFIDF_worker wear
dtypes: float64(100), int64(57)
memory usage: 512.8 KB


In [85]:
final_dataset.to_csv("nlp_chatbot_TF_IDF.csv", index=False,header=True)

In [86]:
final_dataset.head(2)

Unnamed: 0,Accident Level,Potential Accident Level,Gender,Country_Country_01,Country_Country_02,Country_Country_03,Local_Local_01,Local_Local_02,Local_Local_03,Local_Local_04,...,TFIDF_safety gloves,TFIDF_split set,TFIDF_support mesh,TFIDF_time accident,TFIDF_time event,TFIDF_transfer medical,TFIDF_use safety,TFIDF_wear safety,TFIDF_work area,TFIDF_worker wear
0,1,4,0,1,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,4,0,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
!pip install tensorflow





In [88]:
!pip install keras




In [90]:
import tensorflow
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential


# Define the target variable and features
y = is_df['Accident Level'].astype('category').cat.codes  # Encoding the target variable

In [91]:
from keras.layers import Dense

In [92]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
# Define the neural network model
model = Sequential()
model.add(Dense(128, input_dim=X.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [98]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=15, batch_size=10, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Model Accuracy: {accuracy}')

Epoch 1/15
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.4940 - loss: -106.5474 - val_accuracy: 0.3571 - val_loss: 127.4027
Epoch 2/15
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4360 - loss: -107.8276 - val_accuracy: 0.4881 - val_loss: 131.6848
Epoch 3/15
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5520 - loss: -181.4545 - val_accuracy: 0.5119 - val_loss: 134.7161
Epoch 4/15
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5603 - loss: -118.7300 - val_accuracy: 0.3333 - val_loss: 137.0875
Epoch 5/15
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4591 - loss: -94.9985 - val_accuracy: 0.3095 - val_loss: 140.7711
Epoch 6/15
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4917 - loss: -56.6575 - val_accuracy: 0.3214 - val_loss: 145.9253
Epoch 7/15
