In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Location of the HSE accident CSV. This is an absolute, machine-specific path;
# on Colab the file was read from "/content/hse_data.csv" instead.
hse_csv_path = "C:/Applications/Machine Learning/NLP/CapstoneProjectNLP/data/hse_data.csv"
accident_safety_data = pd.read_csv(filepath_or_buffer=hse_csv_path)

In [None]:
# Preview the first rows of the data as loaded from the CSV.
accident_safety_data.head()

In [None]:
# List the column labels to spot unwanted or misnamed columns.
accident_safety_data.columns

We can see that the column "Unnamed: 0" is unwanted, as it will not help us in our analysis.
Also, the "Data" column should be renamed to "Date". Therefore, let's drop the column "Unnamed: 0" and rename the column "Data" to "Date".

In [None]:
# Create an independent working copy of the dataset before making any changes to it.
# A plain assignment would only bind a second name to the same DataFrame, so every
# later modification of the working frame would also alter the loaded data;
# .copy() keeps the loaded frame intact as a real backup.
accident_safety_data_new = accident_safety_data.copy()
accident_safety_data_new.head()

In [None]:
# Drop the unwanted "Unnamed: 0" column and rename the "Data" column to "Date".
# The result is assigned instead of edited in place, and errors='ignore' lets the
# drop succeed when the column is already gone, so re-running this cell is harmless
# (rename already ignores labels that are not present).
accident_safety_data_new = (
    accident_safety_data_new
    .drop(columns='Unnamed: 0', errors='ignore')
    .rename(columns={'Data': 'Date'})
)

In [None]:
# Check the shape (number of rows, number of columns) of the dataset after the column clean-up.
accident_safety_data_new.shape

We can see that the dataset has 425 rows and 10 columns

In [None]:
# Preview the first rows again to confirm the dropped and renamed columns.
accident_safety_data_new.head()

In [None]:
# Check for missing values: for every column, count how many entries are
# null (True) and how many are present (False).
# Series.value_counts is applied per column because the top-level
# pd.value_counts function is deprecated in recent pandas releases.
accident_safety_data_new.isna().apply(pd.Series.value_counts)

We can see that this dataset has no null values.

In [None]:
# Check the data type pandas assigned to each column of the dataset.
accident_safety_data_new.dtypes

Here, we can see that all the columns of the dataset are of "object" datatype. Coming to the type of data present in each column, we can see that there is a column "Date", which means it holds time series data. All other columns except "Description" are of categorical datatype. 

In [None]:
# Summary statistics of the dataset, transposed so that each dataset column becomes one row.
accident_safety_data_new.describe().T

From the above table, we can infer the below:

1. This dataset contains accident data of 3 countries, out of which Country_01 has the highest number of accidents.

2. The data is collected from 3 types of industry sectors. Among the locals, Local_03 has the highest number of accidents.

3. There are 5 accident levels into which this dataset has been classified. 316 accidents are of Accident Level I, making it the most frequent accident level. This also means that the data is not distributed evenly.

4. The data is a consolidation of accidents faced by employees as well as third-party vendors and others. Third-party employees have faced the highest number of accidents according to this dataset.

5. 403 of the reported accidents involve male workers, which means the distribution of data in this case is also not evenly balanced.

6. 33 different types of critical risks have been identified in the dataset.

We have seen that there are quite a few categorical columns in the dataset which can be encoded to numerical values e.g. 

1. Local

2. Accident Level

3. Potential Accident Level


In [None]:
# Look at the first rows once more before plotting the individual columns.
accident_safety_data_new.head()

UNIVARIATE ANALYSIS

Let us check the distribution of data based on accident levels

In [None]:
# Number of accident records per accident level.
fig = px.histogram(
    data_frame=accident_safety_data_new,
    x="Accident Level",
)
fig.show()

We can see that the distribution of Accident Levels is highly imbalanced in the dataset

1. Let us check the distribution of data based on country.

In [None]:
# Number of accident records per country.
fig = px.histogram(
    data_frame=accident_safety_data_new,
    x="Countries",
)
fig.show()

We can see that "Country_01" has the most number of accident cases.

Let us now see the distribution of accidents with respect to the type of employee (Employee / Third Party / Third Party (Remote)).

In [None]:
# Number of accident records per employment type.
fig = px.histogram(
    data_frame=accident_safety_data_new,
    x="Employee or Third Party",
)
fig.show()

From the graph it is very clear that accidents have happened in almost equal proportions among permanent employees and third-party contractors, with third-party contractors slightly on the higher side.

Let us also check the distribution of accidents as per industry sector.

In [None]:
# Number of accident records per industry sector.
fig = px.histogram(
    data_frame=accident_safety_data_new,
    x="Industry Sector",
)
fig.show()

We can see that majority of accidents have happened in the mining sector, followed by metal industry and other type of industries.

We will now see the distribution of accidents as per Gender

In [None]:
# Number of accident records per value of the "Genre" column
# (the notebook text treats this column as the gender of the person).
fig = px.histogram(
    data_frame=accident_safety_data_new,
    x="Genre",
)
fig.show()

Clearly, the distribution of accidents is imbalanced when checked by "Genre". The count of accidents in males is way higher than that in females.

Lastly, let us check the distribution by Locals.

In [None]:
# Number of accident records per local.
fig = px.histogram(
    data_frame=accident_safety_data_new,
    x="Local",
)
fig.show()

In [None]:
# Accident-level histogram, shown a second time within the univariate analysis.
fig = px.histogram(
    data_frame=accident_safety_data_new,
    x="Accident Level",
)
fig.show()

We can see that most of the recorded accidents are of Accident Level I.

In [None]:
# Number of accident records per critical-risk category.
fig = px.histogram(
    data_frame=accident_safety_data_new,
    x="Critical Risk",
)
fig.show()

We can see from the graph that the critical risk category "Others" has the highest number of accidents. This means that, for a large share of the accidents in this dataset, the exact risk factor is not known.

BIVARIATE ANALYSIS

Let us write a function to see how the accident level varies with the Industry Sector and Countries.

In [None]:
def plothistograms(data,column_name_x,column_name_color,value):
    """Show a plotly histogram of one column, colour-split by a second column.

    Parameters
    ----------
    data
        Data set handed to ``px.histogram`` (the notebook passes a DataFrame).
    column_name_x
        Name of the column plotted along the x axis.
    column_name_color
        Name of the column whose values determine the bar colours.
    value
        Forwarded unchanged as ``barmode`` (the notebook uses "relative",
        "group" and "stack").

    Returns
    -------
    None
        The figure is only displayed through ``show()``.
    """
    histogram_options = {
        "x": column_name_x,
        "color": column_name_color,
        "barmode": value,
    }
    px.histogram(data, **histogram_options).show()

1. We will see the distribution of the different accident levels that occurred per country.

In [None]:
# Accident levels per country, drawn with barmode "relative".
plothistograms(
    data=accident_safety_data_new,
    column_name_x="Countries",
    column_name_color="Accident Level",
    value="relative",
)

Observations from the above graph:
1. Accident Level V accidents have occurred only in Country_01.
2. In all countries, the largest share of accidents is of Accident Level I.
3. Country_01 has had accidents of all accident levels, making it the riskiest place as per the dataset.

In [None]:
# Accident levels per industry sector, drawn as grouped (side-by-side) bars.
plothistograms(
    data=accident_safety_data_new,
    column_name_x="Industry Sector",
    column_name_color="Accident Level",
    value="group",
)

The highest number of accidents has occurred in the Mining industry in Country_01 so far, followed by the Metals industry, also in Country_01.

2. Next, let us see how many accidents have occurred per Local

In [None]:
# Three stacked breakdowns, drawn in this order:
# local by industry sector, country by industry sector, local by accident level.
stacked_breakdowns = (
    ("Local", "Industry Sector"),
    ("Countries", "Industry Sector"),
    ("Local", "Accident Level"),
)
for x_column, color_column in stacked_breakdowns:
    plothistograms(accident_safety_data_new, x_column, color_column, "stack")

1. Local_01, Local_02, Local_03, Local_04 and Local_07 all have plants belonging to the Mining sector, and they have had the highest number of accidents.
2. The other industry sectors have had the fewest accidents.
3. Local_09 and Local_11 seem to be the safest cities, with only 2 accidents, even though they have plants belonging to the Metals sector.

In [None]:
# Number of accident records per critical-risk category.
accident_safety_data_new['Critical Risk'].value_counts()


In [None]:
import plotly.graph_objects as go

# Count the records per critical-risk category once and take BOTH the slice labels
# and the slice sizes from that single result, so that label i always belongs to
# value i. (Using the raw column as labels pairs one label per data row with one
# count per category, which mislabels the slices.)
critical_risk_counts = accident_safety_data_new['Critical Risk'].value_counts()
labels = critical_risk_counts.index
values = critical_risk_counts.values

# Use `hole` to create a donut-like pie chart
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.show()

1. Local_04 and Local_03 have the highest number of unknown risks, i.e. of type "Others".
2. We can see that Local_06 has various kinds of risks present, which means it could have more metal plants there.
3. We can also see that the next most frequent critical risks apart from "Others" are "Power lock", "Fall prevention" and "Vehicles and Mobile Equipment".

Here it is clearly visible that in the mining industry, third party employees have met with the maximum number of accidents as compared to the metal industry where their employees have met with the highest number of accidents.

In [None]:
# Potential accident levels, stacked by industry sector.
plothistograms(
    data=accident_safety_data_new,
    column_name_x="Potential Accident Level",
    column_name_color="Industry Sector",
    value="stack",
)

1. The largest number of accidents falls in the Potential Accident Level III category.
2. Potential Accident Level V is the least frequent level in the mining industry.

DATA AUGMENTATION

In [None]:
#Let us first create a dataset using only the class variable "Accident Level" and Description column.
# The two columns are built into a NEW frame (same row index as the source), so
# accident_safety_data_new itself does not receive extra helper columns.
# astype(str) converts a whole column to strings at once, replacing the slower
# row-by-row apply(..., axis=1).
accident_safety_acc_level = pd.DataFrame({
    'Accident_Level': accident_safety_data_new['Accident Level'].astype(str),
    'Description_DL': accident_safety_data_new['Description'].astype(str),
})
accident_safety_acc_level.head()

In [None]:
# Count how many descriptions fall under each accident level and show the shares as a pie chart.
level_column = accident_safety_acc_level.Accident_Level
labels, frequencies = np.unique(level_column.values, return_counts=True)

fig = px.pie(
    data_frame=accident_safety_acc_level,
    names=labels,
    values=frequencies,
    title='Frequency of Description by Accident Level',
)
fig.show()


We can clearly see that the descriptions are imbalanced across accident levels in the dataset. Most of the descriptions belong to Accident Level I.

We will now check the exact counts of Descriptions per Accident level.

In [None]:
# Print every accident level together with its number of descriptions.
# The counts are stored and printed explicitly: a bare value_counts() expression
# that is not the last statement of a cell is evaluated and then thrown away,
# which left only the level labels visible.
level_counts = accident_safety_acc_level.Accident_Level.value_counts()
for u in accident_safety_acc_level.Accident_Level.unique().tolist():
    print(u, level_counts[u])

Let us first divide our data into train and test samples

We will try different data augmentation techniques so that the data is balanced properly before it is passed into the model.

1. Simple upsampling


In [None]:
# Number of descriptions per accident level before any augmentation.
accident_safety_acc_level.Accident_Level.value_counts()

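Let us perform data augmentation using EDA (here presumably "Easy Data Augmentation", the text-augmentation technique behind `gen_eda`, not exploratory data analysis).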

In [None]:
# Rows labelled Accident Level II, taken from a deep copy so the source frame stays untouched.
options=['II']
df_1 = accident_safety_acc_level.copy(deep=True)
df_1 = df_1[df_1['Accident_Level'].isin(options)]
df_1.head()

In [None]:
# Rows labelled Accident Level III, taken from a deep copy so the source frame stays untouched.
options=['III']
df_2 = accident_safety_acc_level.copy(deep=True)
df_2 = df_2[df_2['Accident_Level'].isin(options)]
df_2.head()

In [None]:
# Rows labelled Accident Level IV, taken from a deep copy so the source frame stays untouched.
options=['IV']
df_3 = accident_safety_acc_level.copy(deep=True)
df_3 = df_3[df_3['Accident_Level'].isin(options)]
df_3.head()

In [None]:
# Rows labelled Accident Level V, taken from a deep copy so the source frame stays untouched.
options=['V']
df_4 = accident_safety_acc_level.copy(deep=True)
df_4 = df_4[df_4['Accident_Level'].isin(options)]
df_4.head()

In [None]:
from data_augmentation import gen_eda

# Augment each minority accident level with gen_eda. Levels II-IV use a last
# argument of 10, level V uses 30; the four float settings are shared by all calls.
# NOTE(review): presumably the floats are the rates of the individual augmentation
# operations and the last integer is the number of generated samples per row --
# confirm against data_augmentation.gen_eda.
df_1_up, df_2_up, df_3_up = (
    gen_eda(level_frame, 0.5, 0.2, 0.2, 0.1, 10)
    for level_frame in (df_1, df_2, df_3)
)
df_4_up = gen_eda(df_4, 0.5, 0.2, 0.2, 0.1, 30)

In [None]:
# Stack the four augmented minority-level frames into one frame (row labels are kept as they are).
augmented_minority_frames = [df_1_up, df_2_up, df_3_up, df_4_up]
accident_safety_acc_level_eda_upsampled = pd.concat(augmented_minority_frames)

In [None]:
# Summary statistics of the augmented minority-level data, one row per column.
accident_safety_acc_level_eda_upsampled.describe().T

In [None]:
# Preview the first rows of the augmented minority-level data.
accident_safety_acc_level_eda_upsampled.head()

In [None]:
# Add the un-augmented majority class (Accident Level I) to the augmented minority classes.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps the existing row labels.
# Running this cell more than once adds the Level I rows again.
level_one_rows = accident_safety_acc_level[accident_safety_acc_level.Accident_Level == 'I']
accident_safety_acc_level_eda_upsampled = pd.concat(
    [accident_safety_acc_level_eda_upsampled, level_one_rows]
)

In [None]:
# Share of descriptions per accident level after augmentation, shown as a pie chart.
upsampled_levels = accident_safety_acc_level_eda_upsampled.Accident_Level
labels, frequencies = np.unique(upsampled_levels.values, return_counts=True)

fig = px.pie(
    data_frame=accident_safety_acc_level_eda_upsampled,
    names=labels,
    values=frequencies,
    title='Frequency of Description by Accident Level',
)
fig.show()

In [None]:
from preprocess_data import clean_data

# Store a cleaned version of every description for the machine-learning models.
raw_descriptions = accident_safety_acc_level_eda_upsampled["Description_DL"]
accident_safety_acc_level_eda_upsampled["Description_ML"] = raw_descriptions.apply(clean_data)

In [None]:
from preprocess_data import clean_DL_data

# Store a cleaned version of every description for the deep-learning models.
raw_descriptions = accident_safety_acc_level_eda_upsampled["Description_DL"]
accident_safety_acc_level_eda_upsampled["Description_DL_clean"] = raw_descriptions.apply(clean_DL_data)

In [None]:
# Raw description of the row with index label 1, for comparison with the cleaned versions.
accident_safety_data_new.Description[1]

In [None]:
# Cleaned (ML) description stored under index label 1.
# NOTE(review): the upsampled frame was assembled without resetting the index, so
# label 1 may match several rows (or a different row than in the raw data) -- confirm.
accident_safety_acc_level_eda_upsampled.Description_ML[1]

In [None]:
# Cleaned (DL) description stored under index label 1.
# NOTE(review): the upsampled frame was assembled without resetting the index, so
# label 1 may match several rows (or a different row than in the raw data) -- confirm.
accident_safety_acc_level_eda_upsampled.Description_DL_clean[1]

Let us see the most frequent words used for each accident level now.

1. Accident Level I

In [None]:
from wordcloud import WordCloud


def show_accident_level_wordcloud(level):
    """Display a word cloud built from the descriptions of one accident level.

    Parameters
    ----------
    level : str
        Accident-level label as stored in the ``Accident_Level`` column,
        e.g. 'I' or 'IV'. It is also inserted into the figure title.

    Returns
    -------
    None
        The figure is drawn with matplotlib and shown immediately.
    """
    # Rows of the upsampled frame that belong to the requested level.
    level_rows = accident_safety_acc_level_eda_upsampled[
        accident_safety_acc_level_eda_upsampled.Accident_Level == level
    ]
    # WordCloud works on one text, so all descriptions are joined with spaces.
    description_text = " ".join(level_rows.Description_DL)
    # random_state is fixed so the layout is the same on every run.
    word_cloud = WordCloud(width=1250, height=625, max_font_size=350,
                           random_state=42).generate(description_text)
    plt.figure(figsize=(20, 10))
    plt.title(f"Most frequent words used to describe Accident Level {level}", size=20, weight="bold")
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.show()


# One word cloud per accident level, in the order I, II, III, IV, V.
for accident_level in ['I', 'II', 'III', 'IV', 'V']:
    show_accident_level_wordcloud(accident_level)