In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the accident records into a DataFrame and preview the first rows.
ACCIDENTS_CSV = '../input/work-accidents-in-china/accidents.csv'
data = pd.read_csv(ACCIDENTS_CSV)
data.head()

# 1. Accident time analysis

In [None]:
# Parse the 'Date' column once, then derive the calendar features used by the
# time analysis below from the parsed values.
accident_dates = pd.to_datetime(data['Date'])
data['Date'] = accident_dates
data['year'] = accident_dates.dt.year
data['mon'] = accident_dates.dt.month
data['dayofweek'] = accident_dates.dt.dayofweek

In [None]:
# import plt lib
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm # import colormap
import seaborn as sns
# Font properties loaded from the bundled simhei.ttf file.
# NOTE(review): `myfont` is not referenced by any cell visible in this notebook.
myfont = matplotlib.font_manager.FontProperties(fname='../input/myfont/simhei.ttf')
# Turn off the 'axes.unicode_minus' rc setting for every later figure.
plt.rcParams.update({'axes.unicode_minus': False})

In [None]:
# Tally how many accident records fall in each month number (the 'mon' column
# derived from 'Date' above) and display the resulting Series.
mon_counts = data['mon'].value_counts()
mon_counts

The accidents in the data occurred only in April, May, July, March, August and June, that is, all in **spring and summer**.

In [None]:
# Build the tick labels from the month numbers stored in mon_counts' own index
# instead of a hand-written list: a fixed list is paired with the counts purely
# by position, so any change in the data (or in the order of tied counts)
# would silently attach the wrong month name to a bar.
MONTH_ABBR = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
mon_labels = [MONTH_ABBR[int(month) - 1] for month in mon_counts.index]
colors = cm.rainbow(np.arange(len(mon_labels), 0, -1) / len(mon_labels))
barplot = plt.bar(x=mon_labels, height=mon_counts, color=colors)
plt.ylabel('Number of accidents')
plt.show()

From the figure, we can see at a glance that April has the most accidents and June has the fewest.

In [None]:
# Tally how many accident records fall on each day-of-week number (the
# 'dayofweek' column derived from 'Date' above) and display the result.
dayofweek_counts = data['dayofweek'].value_counts()
dayofweek_counts

The results show that accidents are most frequent on **Saturday (5): 58** and **Monday (0): 44**. The counts for the other days are similar, at about 35 each.

In [None]:
# Build the tick labels from the day numbers stored in dayofweek_counts' own
# index instead of a hand-written list: a fixed list is paired with the counts
# purely by position, so a change in the data (or in the order of tied counts)
# would silently mislabel the bars.
# pandas' dt.dayofweek numbers the days Monday=0 ... Sunday=6.
DAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday']
dayofweek_labels = [DAY_NAMES[int(day)] for day in dayofweek_counts.index]
colors = cm.rainbow(np.arange(len(dayofweek_labels), 0, -1) / len(dayofweek_labels))
barplot = plt.bar(x=dayofweek_labels, height=dayofweek_counts, width=0.6, color=colors)
plt.tick_params(labelsize=8.5)
plt.ylabel('Number of accidents')
plt.show()

In [None]:
# Average accidents per day, computed separately for day numbers 5 and 6
# (divided by 2) and for all remaining day numbers (divided by 5).
is_weekend = dayofweek_counts.index.isin([5, 6])
weekend = dayofweek_counts[is_weekend].sum() / 2
weekday = dayofweek_counts[~is_weekend].sum() / 5
'weekday average:' + str(weekday) + ', weekend average:' + str(weekend) + '.'

From the calculation results, **we find that the average number of accidents per day is higher on weekends than on weekdays**. This may indicate that working on rest days affects the quality of people's work and increases the probability of accidents.

# 2. Accident location analysis

In [None]:
# Count accidents per (year, province) pair and list the largest groups first.
province_by_year = data.groupby('year')['Province']
select = province_by_year.value_counts().sort_values(ascending=False)
select

The results show that some entries are not labelled consistently: for example, '天津市' should be converted to 'Tianjin'.

In [None]:
# Chinese province names that appear in the data, mapped to the English
# spelling used by the rest of the 'Province' column.
_PROVINCE_ZH_TO_EN = {
    '天津市': 'Tianjin',
    '广东省': 'Guangdong',
    '新疆维吾尔自治区': 'Xinjiang',
    '江苏省': 'Jiangsu',
    '河南省': 'Henan',
    '湖北省': 'Hubei',
}


def replace(x):
    """Return the English name for a known Chinese province name.

    Any value that is not one of the six known Chinese names is returned
    unchanged.
    """
    return _PROVINCE_ZH_TO_EN.get(x, x)
# Normalise every 'Province' value through replace().
data['Province'] = data['Province'].map(replace)

In [None]:
# Recount accidents per (year, province) pair, largest groups first.
province_by_year = data.groupby('year')['Province']
select = province_by_year.value_counts().sort_values(ascending=False)
select

In [None]:
# The accident type of province
# Distinct province names, in the order produced by value_counts().
province_list = data['Province'].value_counts().index.tolist()
# function : return value_counts
def accident_type_counts(province_name,height=0.8,labelsize=12):
    """Plot the number of accidents of each type recorded for one province.

    Selects the rows of the global ``data`` frame whose 'Province' equals
    ``province_name``, counts their 'Type' values and passes the counts to
    ``bar_plot``. Nothing is returned.

    province_name -- value to match in 'Province'; also used as the chart title.
    height        -- bar thickness forwarded to ``bar_plot``.
    labelsize     -- tick-label font size forwarded to ``bar_plot``.
    """
    in_province = data['Province'] == province_name
    type_counts = data.loc[in_province, 'Type'].value_counts(ascending=False)
    bar_plot(type_counts, province_name, height, labelsize)
# function : plot data
def bar_plot(value_counts,province_name,height,labelsize):
    """Draw and show a horizontal bar chart of ``value_counts``.

    value_counts  -- Series whose index supplies the bar labels and whose
                     values supply the bar lengths.
    province_name -- text used as the chart title.
    height        -- thickness of each bar.
    labelsize     -- font size of the tick labels.
    """
    categories = value_counts.index.tolist()
    n_categories = len(categories)
    # One rainbow colour per bar, sampled from 1.0 downwards.
    palette = cm.rainbow(np.arange(n_categories, 0, -1) / n_categories)
    plt.title(province_name, fontsize=15)
    plt.barh(y=categories, width=value_counts, height=height, color=palette)
    plt.tick_params(labelsize=labelsize)
    plt.xticks(rotation=90)
    plt.show()
    
# Draw one accident-type chart for every province in province_list.
for province in province_list:
    accident_type_counts(province, labelsize=15)

In [None]:
# The keywords of accident description of province
import jieba
import jieba.analyse
from wordcloud import WordCloud
from wordcloud import STOPWORDS

# Distinct province names, in the order produced by value_counts().
province_list = data['Province'].value_counts().index.tolist()

# make keywords wordcloud and show
def keywords_extract_and_show(province_name):
    """Show a word cloud built from the accident descriptions of one province.

    Reads the global ``data`` frame, tokenises the 'Description' of every row
    whose 'Province' equals ``province_name`` with ``jieba.lcut`` and renders
    at most 20 words with ``WordCloud``. The province name, the words 'one',
    'two', 'dead' and 'injured', and wordcloud's ``STOPWORDS`` are passed as
    stop words.

    Parameters
    ----------
    province_name : str
        Value to match in the 'Province' column; also used as the figure title.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    # Select the column directly rather than assigning a new column into a
    # filtered slice of a full copy: that assignment can raise
    # SettingWithCopyWarning, and the full copy of `data` is never needed.
    descriptions = data.loc[data['Province'] == province_name, 'Description']
    # Flatten the token lists in a single pass. Concatenating lists with
    # Series.sum() re-copies the growing list for every row and does not
    # yield a list at all when no row matches.
    description_words = [
        word for text in descriptions for word in jieba.lcut(str(text))
    ]
    if not description_words:
        # Nothing to draw for this province; skip instead of failing later.
        print('No descriptions found for ' + str(province_name) + '; skipping word cloud.')
        return
    sentence = ' '.join(description_words)
    stop_words = [province_name,'one','two','dead','injured'] + list(STOPWORDS)
    word_cloud = WordCloud(background_color='white',max_words = 20, stopwords = stop_words)
    word_fig = word_cloud.generate(sentence)
    plt.figure(figsize=(10,5))
    plt.title(province_name,fontsize=20)
    plt.imshow(word_fig)
    plt.axis("off")
    plt.show()
    plt.close()
    
# Render one keyword word cloud for every province in province_list.
for province in province_list:
    keywords_extract_and_show(province)

From the above word frequency analysis results, we can further observe the characteristics of accidents in various provinces. For example, from the words **'site', 'tower' and 'crane'**, we can infer that the most frequent accidents in Zhejiang Province are **construction-site accidents**; from **'coal' and 'mine'** in the results for Shanxi Province and Inner Mongolia, we can infer that the most frequent accidents in these two places are **coal mine accidents**.

In [None]:
# Analysis of cities with frequent accidents
# Count accidents per (year, city) pair and show the ten largest groups.
city_by_year = data.groupby('year')['City']
select = city_by_year.value_counts().sort_values(ascending=False)
select.head(10)

Tianjin, Beijing and Shenzhen are the three cities with the most accidents

# 3. Accident company analysis

In [None]:
# Tally how many accident records fall under each 'Company Ownership' value
# and display the resulting Series.
Company_Ownership_counts = data['Company Ownership'].value_counts()
Company_Ownership_counts

Private enterprise and SOE are the main types of companies with accidents

In [None]:
# Horizontal bar chart of the accident count per company-ownership category.
labels = Company_Ownership_counts.index.tolist()
n_labels = len(labels)
# One rainbow colour per bar, sampled from 1.0 downwards.
colors = cm.rainbow(np.arange(n_labels, 0, -1) / n_labels)
plt.title('Accidental Companies', fontsize=15)
plt.barh(y=labels, width=Company_Ownership_counts, height=0.8, color=colors)
plt.tick_params(labelsize=15)
plt.xticks(rotation=90)
plt.show()

# 4. Accident type and punishment analysis

In [None]:
# Analysis of fatal accident types
# Keep the rows whose 'Number of Deaths' is not the string '0', then count
# each remaining value per accident type, largest groups first.
has_deaths = data['Number of Deaths'] != '0'
data_temp = data[has_deaths]
select = (
    data_temp.groupby('Type')['Number of Deaths']
    .value_counts()
    .sort_values(ascending=False)
)
select

Mechanical accident, Gas or Chemical exposure, Falling, Structural failure and Fire are the main causes of fatal accidents

In [None]:
# Analysis of major fatal accidents
# Keep the rows whose 'Number of Deaths' is neither '0' nor '1-9', then count
# each remaining value per accident type, largest groups first.
minor_or_none = data['Number of Deaths'].isin(['0','1-9'])
data_temp = data[~minor_or_none]
select = (
    data_temp.groupby('Type')['Number of Deaths']
    .value_counts()
    .sort_values(ascending=False)
)
select

Fire, Flood / Structural failure and Drowning are the main causes of major fatal accidents

In [None]:
# Analysis of injury accidents
# Keep the rows whose 'Number of Injuries' is not the string '0', then count
# each remaining value per accident type, largest groups first.
has_injuries = data['Number of Injuries'] != '0'
data_temp = data[has_injuries]
select = (
    data_temp.groupby('Type')['Number of Injuries']
    .value_counts()
    .sort_values(ascending=False)
)
select

Structural failure, Falling / Structural failure, Gas or Chemical Exposure, Fire and Explosion / Fire are the main causes of injury accidents

In [None]:
# Analysis of major injury accidents
# Keep the rows whose 'Number of Injuries' is neither '0' nor '1-9', then
# count each remaining value per accident type, largest groups first.
minor_or_none = data['Number of Injuries'].isin(['0','1-9'])
data_temp = data[~minor_or_none]
select = (
    data_temp.groupby('Type')['Number of Injuries']
    .value_counts()
    .sort_values(ascending=False)
)
select

Fire, Flood, Explosion / Fire, Gas or Chemical Exposure, Structural Failure and Transport accident are the main causes of major injury accidents

In [None]:
# Extraordinarily serious injury accident
# Keep the rows whose 'Number of Injuries' is none of '0', '1-9' or '10-29',
# then count each remaining value per accident type, largest groups first.
below_threshold = data['Number of Injuries'].isin(['0','1-9','10-29'])
data_temp = data[~below_threshold]
select = (
    data_temp.groupby('Type')['Number of Injuries']
    .value_counts()
    .sort_values(ascending=False)
)
select

Explosion / Fire and Structural failure are the causes of extraordinarily serious injury accidents

In [None]:
# Punished person analysis: tally how often each 'Number of Punished' value
# occurs in the data.
data['Number of Punished'].value_counts()

In [None]:
# Financial penalty analysis: tally how often each 'Financial Penalty' value
# occurs in the data.
data['Financial Penalty'].value_counts()

The results above show that both the number of punished persons and the financial penalty are 0 for every record, **most likely because this part of the data has not been collected**