In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
eq20_years = pd.read_csv('1999-2019.csv')
eq1_year = pd.read_csv('earthquake.csv')
eq_sichuan = pd.read_csv('sichuan.csv')

In [3]:
eq20_years.sample(5)

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
46097,2017-04-03T18:18:52.520Z,-11.2493,164.2398,35.0,4.6,mb,,146.0,4.59,0.92,...,2017-06-23T07:26:28.040Z,"183km WSW of Lata, Solomon Islands",earthquake,9.7,2.0,0.132,17.0,reviewed,us,us
73208,2014-01-08T22:29:09.090Z,1.4124,126.276,42.92,4.7,mb,,89.0,1.262,0.81,...,2014-03-15T03:38:40.000Z,"121km E of Bitung, Indonesia",earthquake,,6.8,0.086,45.0,reviewed,us,us
30421,2015-09-17T18:04:41.180Z,-30.8769,-71.9928,10.0,4.5,mb,,82.0,0.368,0.95,...,2015-12-21T23:16:17.040Z,"81km WSW of Ovalle, Chile",earthquake,5.1,1.8,0.136,16.0,reviewed,us,us
68248,2017-08-29T05:00:47.180Z,-38.6772,95.5484,10.0,4.9,mb,,49.0,18.49,0.78,...,2017-11-28T18:33:51.040Z,South Indian Ocean,earthquake,11.0,1.8,0.08,66.0,reviewed,us,us
90734,2006-07-22T18:02:36.020Z,-9.437,107.922,10.0,4.5,mb,17.0,86.6,,0.62,...,2014-11-07T01:29:44.164Z,"south of Java, Indonesia",earthquake,,,,11.0,reviewed,us,us


In [4]:
eq20_years.iloc[6666]

time                  2010-05-10T00:25:29.740Z
latitude                                -2.881
longitude                              101.101
depth                                     56.7
mag                                        4.6
magType                                     mb
nst                                         25
gap                                      180.7
dmin                                       NaN
rms                                       0.85
net                                         us
id                                  usp000hcjh
updated               2014-11-07T01:41:28.481Z
place              southern Sumatra, Indonesia
type                                earthquake
horizontalError                            NaN
depthError                                   8
magError                                   NaN
magNst                                       3
status                                reviewed
locationSource                              us
magSource    

### 1. 数据清洗
- 选取出需要的字段
- 数据类型转换

In [5]:
eq20_years.columns

Index(['time', 'latitude', 'longitude', 'depth', 'mag', 'magType', 'nst',
       'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'place', 'type',
       'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource'],
      dtype='object')

In [6]:
cols = ['time', 'latitude', 'longitude', 'depth', 'mag', 
        'magType', 'id', 'place', 'type', 'status']
eq20_years = eq20_years[cols]
eq1_year = eq1_year[cols]
eq_sichuan = eq_sichuan[cols]

In [7]:
eq20_years['time'] = pd.to_datetime(eq20_years['time'])
eq1_year['time'] = pd.to_datetime(eq1_year['time'])
eq_sichuan['time'] = pd.to_datetime(eq_sichuan['time'])

### 2. 问题
- 引发地震的因素有哪些？
- 全世界地震频发的地区有哪些？
- 有哪些引发全世界舆情关注的大地震？
- 为什么四川省及其周围好像从2008年开始就地震频发？

#### 2.1 引发地震的因素有哪些？

In [24]:
eq1_year.sample(5)

Unnamed: 0,time,latitude,longitude,depth,mag,magType,id,place,type,status
117734,2019-02-04 17:30:55.440000+00:00,40.7335,-124.096,8.65,2.4,md,nc73139521,"5km E of Pine Hills, CA",earthquake,reviewed
96957,2018-07-23 15:54:54.350000+00:00,35.596833,-118.400667,7.66,1.23,ml,ci38228216,"7km ESE of Lake Isabella, CA",earthquake,reviewed
51544,2018-11-28 04:16:46.735000+00:00,60.1154,-150.9278,53.6,1.6,ml,ak20411671,"35km SE of Cohoe, Alaska",earthquake,reviewed
38065,2018-07-25 23:00:45.568000+00:00,55.6281,-161.1272,110.7,1.9,ml,ak20000552,"51km NW of Sand Point, Alaska",earthquake,reviewed
15070,2018-11-03 23:11:01.370000+00:00,38.825333,-122.855003,2.01,0.55,md,nc73106211,"10km WNW of The Geysers, CA",earthquake,automatic


In [25]:
eq_type = eq1_year['type'].value_counts()
eq_type

earthquake            158194
quarry blast            1173
explosion               1023
ice quake                776
mining explosion         214
other event              136
volcanic eruption         28
chemical explosion        15
rock burst                 2
mine collapse              2
sonic boom                 2
Name: type, dtype: int64

In [26]:
np.round(eq_type/eq_type.sum()*100, 2)

earthquake            97.91
quarry blast           0.73
explosion              0.63
ice quake              0.48
mining explosion       0.13
other event            0.08
volcanic eruption      0.02
chemical explosion     0.01
rock burst             0.00
mine collapse          0.00
sonic boom             0.00
Name: type, dtype: float64

In [27]:
eq1_year['time'].describe()

  """Entry point for launching an IPython kernel.


count                               161565
unique                              161552
top       2018-08-03 21:16:00.990000+00:00
freq                                     2
first     2018-06-25 00:01:27.580000+00:00
last      2019-06-24 06:52:27.530000+00:00
Name: time, dtype: object

In [28]:
eq_type

earthquake            158194
quarry blast            1173
explosion               1023
ice quake                776
mining explosion         214
other event              136
volcanic eruption         28
chemical explosion        15
rock burst                 2
mine collapse              2
sonic boom                 2
Name: type, dtype: int64

In [29]:
from pyecharts import Sankey

nodes = [
    {'name': '构造地震'}, {'name': '天然地震'}, {'name': '火山喷发'},
    {'name': '火山地震'}, {'name': '冰震'}, {'name': '塌陷地震'},
    {'name': '矿井塌陷'}, {'name': '岩爆'}, {'name': '采石场爆破'},
    {'name': '爆炸'}, {'name': '煤矿爆炸'}, {'name': '化学爆炸'},
    {'name': '人工地震'}, {'name': '水库地震'}, {'name': '油田注水'},
    {'name': '诱发地震'}
]

links = [
    {'source': '构造地震', 'target': '天然地震', 'value': 5000},  #为了图形显示，这里重置了构造地震真实值
    {'source': '火山喷发', 'target': '火山地震', 'value': 28},
    {'source': '火山地震', 'target': '天然地震', 'value': 28},
    {'source': '冰震', 'target': '塌陷地震', 'value': 776},
    {'source': '矿井塌陷', 'target': '塌陷地震', 'value': 2},
    {'source': '岩爆', 'target': '塌陷地震', 'value': 2},
    {'source': '塌陷地震', 'target': '天然地震', 'value': 780},
    {'source': '采石场爆破', 'target': '人工地震', 'value': 1173},
    {'source': '爆炸', 'target': '人工地震', 'value': 1023},
    {'source': '煤矿爆炸', 'target': '人工地震', 'value': 214},
    {'source': '化学爆炸', 'target': '人工地震', 'value': 15},
    {'source': '水库地震', 'target': '诱发地震', 'value': 50},
    {'source': '油田注水', 'target': '诱发地震', 'value': 50},
]
sankey = Sankey("近一年全球所有地震的地震类型", "时间: 2018年6月25日至2019年6月24日", width=800, height=900)
sankey.add(
    "",
    nodes,
    links,
    line_opacity=0.2,
    line_curve=0.5,
    line_color="source",
    is_label_show=True,
    label_pos="right",
)
sankey

In [30]:
eq1_year.shape

(161565, 10)

#### 2.2 全世界地震频发的地区有哪些？

In [31]:
eq20_years.shape

(134063, 11)

In [32]:
eq20_years.columns

Index(['time', 'latitude', 'longitude', 'depth', 'mag', 'magType', 'id',
       'place', 'type', 'status', 'year'],
      dtype='object')

In [33]:
eq_place = eq20_years.sample(30000)[['latitude', 'longitude', 'mag']]
eq_place['latitude'] = eq_place['latitude'].astype(str)
eq_place['longitude'] = eq_place['longitude'].astype(str)
eq_place['latlon'] = eq_place['longitude'].str.cat(eq_place['latitude'], sep=',')

In [34]:
eq_place.to_csv('eq_place.csv', index=False)

In [35]:
eq20_years[eq20_years['mag'] > 6].shape

(2410, 11)

In [36]:
eq_place_6 = eq20_years[eq20_years['mag'] > 6]
eq_place_6['latitude'] = eq_place_6['latitude'].astype(str)
eq_place_6['longitude'] = eq_place_6['longitude'].astype(str)
eq_place_6['latlon'] = eq_place_6['longitude'].str.cat(eq_place_6['latitude'], sep=',')

eq_place_6.to_csv('eq_place_6.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [37]:
region = []
for l in eq20_years['place'].str.split(',').tolist():
    try:
        region.append(l[-1].strip())
    except TypeError:
        region.append(np.nan)

In [38]:
eq20_years['region'] = region

In [39]:
region_top10 = eq20_years['region'].value_counts().nlargest(11)
region_top10

Indonesia           15856
Japan                9701
Papua New Guinea     7969
Chile                5309
Philippines          4461
Tonga                4230
Vanuatu              3562
New Zealand          2999
Solomon Islands      2970
Alaska               2745
Japan region         2577
Name: region, dtype: int64

In [40]:
from pyecharts import Bar


r = ['印尼', '日本', '新几内亚', '智利', '菲律宾', '汤加', '瓦努阿图', '新西兰', '所罗门群岛', '阿拉斯加州']
num = [15661, 9646+2522, 7827, 5249, 4379, 4167, 3489, 2982, 2958, 2700]
bar = Bar("近20年全球4.5级以上地震次数最多的国家/地区", "时间: 2000年1月1日-2019年6月24日, 单位:次", width=900)
bar.add("", r, num, 
        is_stack=True,  bar_category_gap='40%', label_text_size=18, 
        xaxis_label_textsize=11,yaxis_label_textsize=20, xaxis_rotate=0)

bar

#### 2.3 近20年有哪些引发全世界舆情关注的大地震？

In [41]:
eq_gt_6 = eq20_years[eq20_years['mag'] > 6]

In [42]:
eq_gt_6[eq_gt_6['mag']<7].shape[0]

2098

In [43]:
eq_gt_6[(eq_gt_6['mag']>=7)&(eq_gt_6['mag']<8)].shape[0]

288

In [44]:
eq_gt_6[(eq_gt_6['mag']>=8)&(eq_gt_6['mag']<9)].shape[0]

22

In [45]:
eq_gt_6[eq_gt_6['mag']>=9].shape[0]

2

In [46]:
eq_gt_7 = eq20_years[eq20_years['mag'] > 7]

In [47]:
eq_gt_7['influence'] = eq_gt_7['mag'] + np.round(10/(eq_gt_7['depth']+5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [48]:
eq_gt_7.loc[eq_gt_7.sort_values(by=['mag', 'depth'], ascending=[False, True]).index].head(30)

Unnamed: 0_level_0,time,latitude,longitude,depth,mag,magType,id,place,type,status,year,region,influence
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-03-11 05:46:24.120000+00:00,2011-03-11 05:46:24.120000+00:00,38.297,142.373,29.0,9.1,mww,official20110311054624120_30,"2011 Great Tohoku Earthquake, Japan",earthquake,reviewed,2011,Japan,9.1
2004-12-26 00:58:53.450000+00:00,2004-12-26 00:58:53.450000+00:00,3.295,95.982,30.0,9.1,mw,official20041226005853450_30,2004 Sumatra - Andaman Islands Earthquake,earthquake,reviewed,2004,2004 Sumatra - Andaman Islands Earthquake,9.1
2010-02-27 06:34:11.530000+00:00,2010-02-27 06:34:11.530000+00:00,-36.122,-72.898,22.9,8.8,mww,official20100227063411530_30,"offshore Bio-Bio, Chile",earthquake,reviewed,2010,Chile,8.8
2012-04-11 08:38:36.720000+00:00,2012-04-11 08:38:36.720000+00:00,2.327,93.063,20.0,8.6,mw,official20120411083836720_20,off the west coast of northern Sumatra,earthquake,reviewed,2012,off the west coast of northern Sumatra,8.6
2005-03-28 16:09:36.530000+00:00,2005-03-28 16:09:36.530000+00:00,2.085,97.108,30.0,8.6,mww,official20050328160936530_30,"northern Sumatra, Indonesia",earthquake,reviewed,2005,Indonesia,8.6
2001-06-23 20:33:14.130000+00:00,2001-06-23 20:33:14.130000+00:00,-16.265,-73.641,33.0,8.4,mww,official20010623203314130_33,near the coast of southern Peru,earthquake,reviewed,2001,near the coast of southern Peru,8.4
2007-09-12 11:10:26.830000+00:00,2007-09-12 11:10:26.830000+00:00,-4.438,101.367,34.0,8.4,mww,official20070912111026830_34,"southern Sumatra, Indonesia",earthquake,reviewed,2007,Indonesia,8.4
2006-11-15 11:14:13.570000+00:00,2006-11-15 11:14:13.570000+00:00,46.592,153.266,10.0,8.3,mwc,usp000exfn,Kuril Islands,earthquake,reviewed,2006,Kuril Islands,9.3
2015-09-16 22:54:32.860000+00:00,2015-09-16 22:54:32.860000+00:00,-31.5729,-71.6744,22.44,8.3,mww,us20003k7a,"48km W of Illapel, Chile",earthquake,reviewed,2015,Chile,8.3
2003-09-25 19:50:06.360000+00:00,2003-09-25 19:50:06.360000+00:00,41.815,143.91,27.0,8.3,mwc,usp000c8kv,"Hokkaido, Japan region",earthquake,reviewed,2003,Japan region,8.3


In [49]:
eq_gt_7.to_csv('eq_gt_7.csv', index=False)

#### 2.4 为什么四川省及其周围好像从2008年开始就地震频发？

In [50]:
eq_sichuan.sample(5)

Unnamed: 0,time,latitude,longitude,depth,mag,magType,id,place,type,status
170,2013-08-31 23:18:00.040000+00:00,29.997,99.7169,10.0,4.6,mb,usb000jein,"118km ENE of Gartog, China",earthquake,reviewed
325,2009-02-08 15:17:14.320000+00:00,31.378,103.945,36.7,4.7,mb,usp000gtjd,"eastern Sichuan, China",earthquake,reviewed
335,2008-12-07 03:02:21.690000+00:00,31.619,104.369,10.0,4.6,mb,usp000gq32,"eastern Sichuan, China",earthquake,reviewed
286,2010-04-14 03:15:46.180000+00:00,33.071,96.617,10.0,5.0,mb,usp000hbc6,"southern Qinghai, China",earthquake,reviewed
946,1989-05-03 17:28:21.070000+00:00,30.073,99.432,10.0,5.3,mb,usp0003uut,"western Sichuan, China",earthquake,reviewed


In [51]:
eq_sichuan[['latitude', 'longitude', 'mag']].describe()

Unnamed: 0,latitude,longitude,mag
count,1073.0,1073.0,1073.0
mean,30.260878,102.387508,4.803914
std,2.324542,2.668046,0.352342
min,25.82,96.504,4.5
25%,28.2649,100.099,4.6
50%,30.883,103.4737,4.7
75%,31.829,104.315,4.9
max,35.624,108.8076,7.9


In [52]:
eq_sichuan['time'].describe()

  """Entry point for launching an IPython kernel.


count                                 1073
unique                                1073
top       2019-02-23 21:38:10.120000+00:00
freq                                     1
first     1979-08-05 05:18:22.500000+00:00
last      2019-06-24 01:23:16.859000+00:00
Name: time, dtype: object

In [53]:
eq_sichuan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1073 entries, 0 to 1072
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   time       1073 non-null   datetime64[ns, UTC]
 1   latitude   1073 non-null   float64            
 2   longitude  1073 non-null   float64            
 3   depth      1073 non-null   float64            
 4   mag        1073 non-null   float64            
 5   magType    1073 non-null   object             
 6   id         1073 non-null   object             
 7   place      1073 non-null   object             
 8   type       1073 non-null   object             
 9   status     1073 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(4), object(5)
memory usage: 84.0+ KB


In [54]:
eq_sichuan['year'] = eq_sichuan['time'].dt.year

In [55]:
eq_sichuan_years = eq_sichuan.groupby('year')['id'].count()

In [56]:
from pyecharts import Line


line = Line("近40年四川省及其周围4.5级以上地震次数", width=800, height=400)

line.add('', eq_sichuan_years.index, eq_sichuan_years.values, is_smooth=True,
             legend_text_size=15,xaxis_label_textsize=16,
             yaxis_label_textsize=18)
line

In [57]:
eq_sichuan_years.loc[2009:2018].mean()

30.9

In [58]:
eq_sichuan_years.loc[1979:2007].mean()

13.89655172413793

In [59]:
eq_sichuan_years.values

array([  3,   7,  11,   9,  10,   8,  10,  16,   8,  17,  54,  15,  12,
        11,  15,   7,  18,  26,  19,  10,   9,  18,  16,  11,  18,  16,
        10,  13,   6, 340,  31,  27,  21,  17,  71,  43,  20,  24,  31,
        24,  21], dtype=int64)

In [60]:
eq_sichuan[eq_sichuan['mag']>6]

Unnamed: 0,time,latitude,longitude,depth,mag,magType,id,place,type,status,year
58,2017-08-08 13:19:49.540000+00:00,33.1926,103.8552,9.0,6.5,mww,us2000a5x1,"36km WSW of Yongle, China",earthquake,reviewed,2017
146,2014-08-03 08:30:13.570000+00:00,27.1891,103.4086,12.0,6.2,mww,usb000rzmg,"11km W of Wenping, China",earthquake,reviewed,2014
223,2013-04-20 00:02:47.540000+00:00,30.308,102.888,14.0,6.6,mww,usb000gcdd,"56km WSW of Linqiong, China",earthquake,reviewed,2013
291,2010-04-13 23:49:38.330000+00:00,33.165,96.548,17.0,6.9,mwc,usp000hbbt,"southern Qinghai, China",earthquake,reviewed,2010
445,2008-05-25 08:21:49.990000+00:00,32.56,105.423,18.0,6.1,mwc,usp000g7r4,"Sichuan-Gansu border region, China",earthquake,reviewed,2008
592,2008-05-12 11:11:02.480000+00:00,31.214,103.618,10.0,6.1,mwc,usp000g68b,"eastern Sichuan, China",earthquake,reviewed,2008
663,2008-05-12 06:28:01.570000+00:00,31.002,103.322,19.0,7.9,mwc,usp000g650,"eastern Sichuan, China",earthquake,reviewed,2008
771,2000-09-12 00:27:58.620000+00:00,35.389,99.343,10.0,6.1,mwc,usp000a03h,"southern Qinghai, China",earthquake,reviewed,2000
774,2000-06-07 21:46:55.900000+00:00,26.856,97.238,33.0,6.3,mwc,usp0009u6j,Myanmar,earthquake,reviewed,2000
841,1996-02-03 11:14:20.120000+00:00,27.291,100.276,11.1,6.6,mw,usp0007c7g,"Sichuan-Yunnan border region, China",earthquake,reviewed,1996


In [61]:
eq_sichuan['latitude'] = eq_sichuan['latitude'].astype(str)
eq_sichuan['longitude'] = eq_sichuan['longitude'].astype(str)
eq_sichuan['latlon'] = eq_sichuan['longitude'].str.cat(eq_sichuan['latitude'], sep=',')
eq_sichuan[['latitude', 'longitude', 'mag', 'latlon']].to_csv('sichuan_graph.csv', index=False)