In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [4]:
# read excel file into a pandas dataframe

starting_year = 1995

raw_df = pd.read_excel('../data/sarpol.xlsx')
raw_df.sort_values(['year', 'month', 'day', 'hour', 'min', 'sec'], inplace=True)
raw_df = raw_df[raw_df['year'] >= 1995]
raw_df

In [5]:
MAX_MAG = raw_df['magnitude'].max()
MIN_MAG = raw_df['magnitude'].min()

print("Maximum magnitute: {}".format(MAX_MAG))
print("Minimum magnitute: {}".format(MIN_MAG))


Maximum magnitute: 7.3
Minimum magnitute: 2.8


In [6]:
# raw_df['date'] = pd.to_datetime(raw_df[['year', 'month', 'day']])
raw_df.set_index(['year', 'month'], inplace=True)
raw_df

Unnamed: 0_level_0,Unnamed: 1_level_0,day,hour,min,sec,latitude,longitude,depth,magnitude
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995,1,7,15,33,24.0,35.760,44.180,33.0,4.67
1995,1,7,15,33,25.0,35.760,44.200,33.0,4.50
1995,1,13,17,19,0.0,39.290,47.830,33.0,4.03
1995,1,15,7,13,0.0,38.920,43.330,1.8,4.10
1995,1,16,9,55,0.0,38.700,43.580,33.0,4.17
...,...,...,...,...,...,...,...,...,...
2022,1,9,19,19,51.1,39.186,46.588,10.0,3.17
2022,1,10,18,15,8.0,35.637,44.984,10.0,3.75
2022,1,10,18,17,40.7,36.985,49.675,18.0,3.00
2022,1,10,18,29,49.2,35.619,44.949,10.0,4.67


In [7]:
raw_df.isna().sum()

day          0
hour         0
min          0
sec          0
latitude     0
longitude    0
depth        0
magnitude    0
dtype: int64

In [8]:
# some hyperparameters for creating features

TIME_STEP = 100

## Easy method paper parameters

### Feature #1: T = Tn - T1

In [9]:
raw_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,day,hour,min,sec,latitude,longitude,depth,magnitude
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995,1,7,15,33,24.0,35.76,44.18,33.0,4.67
1995,1,7,15,33,25.0,35.76,44.2,33.0,4.5
1995,1,13,17,19,0.0,39.29,47.83,33.0,4.03
1995,1,15,7,13,0.0,38.92,43.33,1.8,4.1
1995,1,16,9,55,0.0,38.7,43.58,33.0,4.17
1995,1,22,3,59,0.0,38.42,44.83,215.0,4.3
1995,1,24,4,22,21.0,31.86,47.35,33.8,4.41
1995,1,24,4,53,0.0,38.6,45.1,199.1,4.5
1995,2,13,22,51,0.0,39.0,43.35,54.8,4.23
1995,2,14,11,13,20.0,37.74,42.71,33.0,4.85


In [10]:
idx_list = sorted(set(raw_df.index.to_list()))
raw_df.reset_index(inplace=True)
raw_df['idx'] = raw_df.index
raw_df.set_index(['year', 'month'], inplace=True)

# dataframe of features
df = pd.DataFrame()
# T = T_n - T_1
T_list = []
for i in idx_list:
    events = raw_df.loc[i] # the dataframe of events in a month
    events.set_index('idx', inplace=True)
    idx_max_mag = events[['magnitude']].idxmax()
    
    if idx_max_mag[0] > TIME_STEP:
        events = raw_df.iloc[idx_max_mag[0] - TIME_STEP + 1:idx_max_mag[0]+1, :]
    else:
        continue
        
#     print(i, events)
    events.reset_index(inplace=True)
    events['datetime'] = pd.to_datetime(events[['year', 'month', 'day', 'hour']])
    events['time_diff'] = events['datetime'].diff(periods=TIME_STEP - 1)
    time_diff = events['time_diff'].iloc[-1].days
#     print(time_diff.days)
    T_list.append(time_diff)

    
df['T'] = T_list
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events['datetime'] = pd.to_datetime(events[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events['time_diff'] = events['datetime'].diff(periods=TIME_STEP - 1)


Unnamed: 0,T
0,181
1,207
2,225
3,264
4,284
...,...
313,43
314,47
315,48
316,51


### Feature #2: M_mean

In [11]:
M_mean_list = []
for i in idx_list:
    events = raw_df.loc[i] # the dataframe of events in a month
    events.set_index('idx', inplace=True)
    idx_max_mag = events[['magnitude']].idxmax()
    
    if idx_max_mag[0] > TIME_STEP:
        events = raw_df.iloc[idx_max_mag[0] - TIME_STEP + 1:idx_max_mag[0]+1, :]
    else:
        continue
#     print(i, events)
    M_mean_list.append(np.mean(events['magnitude']))

    
df['M_mean'] = M_mean_list
df

Unnamed: 0,T,M_mean
0,181,4.5867
1,207,4.5828
2,225,4.5895
3,264,4.6001
4,284,4.5934
...,...,...
313,43,3.3777
314,47,3.3869
315,48,3.3972
316,51,3.4106


### Feature #3: dE

In [12]:
dE_list = []
for i in idx_list:
    events = raw_df.loc[i] # the dataframe of events in a month
    events.set_index('idx', inplace=True)
    idx_max_mag = events[['magnitude']].idxmax()
    
    if idx_max_mag[0] > TIME_STEP:
        events = raw_df.iloc[idx_max_mag[0] - TIME_STEP + 1:idx_max_mag[0]+1, :]
    else:
        continue
    
    dE_list.append(np.sum(events['magnitude'][-TIME_STEP:].apply(lambda x: np.sqrt(10 ** (7.1 + 2.06*x)))))
    
df['dE'] = dE_list / df['T']
df

Unnamed: 0,T,M_mean,dE
0,181,4.5867,1.231688e+08
1,207,4.5828,1.067747e+08
2,225,4.5895,1.008165e+08
3,264,4.6001,8.864885e+07
4,284,4.5934,8.168575e+07
...,...,...,...
313,43,3.3777,4.292558e+07
314,47,3.3869,9.800679e+07
315,48,3.3972,4.304746e+07
316,51,3.4106,4.524388e+07


### Feature #4&5: M_expected & eta & delta_M

In [13]:
M_expected_list = []
eta_list = []
b_list = []
delta_M_list = []

for i in idx_list:
    events = raw_df.loc[i] # the dataframe of events in a month
    events.set_index('idx', inplace=True)
    idx_max_mag = events[['magnitude']].idxmax()
    
    if idx_max_mag[0] > TIME_STEP:
        events = raw_df.iloc[idx_max_mag[0] - TIME_STEP + 1:idx_max_mag[0]+1, :]
    else:
        continue
    
    mag = events['magnitude']
    max_M = np.max(mag)
    

    hist_values = plt.hist(mag, bins=10, range=[MIN_MAG, MAX_MAG], cumulative=False, histtype='bar', log=True)
    
    plt.close()

    y = hist_values[0]
    X = []
    for idx in range(len(hist_values[1][:-1])):
        X.append(np.round((hist_values[1][idx] + hist_values[1][idx+1])/2, 2))
        
#     print(X)
#     print(y)
#     print()
    if 0 in y:
        idx_start = np.where(y != 0)[0][0]
        y = y[idx_start:]
        X = X[idx_start:]
        
    if 0 in y:
        idx_end = np.where(y == 0)[0][0]
        y = y[:idx_end]
        X = X[:idx_end]
    y = np.log(y) # log10
    X = np.array(X)
#     print(X)
#     print(y)
#     print()

    # apply linear regression
    lr = LinearRegression()

    lr.fit(X.reshape(-1, 1), y.reshape(-1, 1))
    a = lr.intercept_
    b = -1 * lr.coef_
    M_expected = a / (b + 0.000001)
    M_expected_list.append(M_expected[0][0])
    b_list.append(b[0][0])
    
    eta = 0
    eta = np.linalg.norm(y - lr.predict(X.reshape(-1, 1)))
    eta /= len(X)
    eta_list.append(eta)
    delta_M_list.append(np.abs(max_M - M_expected[0][0]))
    
    
df['M_expected'] = M_expected_list
df['eta'] = eta_list
df['b'] = b_list
df['delta_M'] = delta_M_list
df

Unnamed: 0,T,M_mean,dE,M_expected,eta,b,delta_M
0,181,4.5867,1.231688e+08,-0.519387,1.795153,-0.412321,5.819387
1,207,4.5828,1.067747e+08,-0.630656,1.794082,-0.403303,5.930656
2,225,4.5895,1.008165e+08,-6.671809,1.491627,-0.210577,11.971809
3,264,4.6001,8.864885e+07,-5.777555,1.493104,-0.228961,11.077555
4,284,4.5934,8.168575e+07,-5.777555,1.493104,-0.228961,11.077555
...,...,...,...,...,...,...,...
313,43,3.3777,4.292558e+07,4.974961,1.980275,2.149257,0.304961
314,47,3.3869,9.800679e+07,4.929870,1.995011,2.152564,0.810130
315,48,3.3972,4.304746e+07,5.061241,1.902438,2.059937,0.391241
316,51,3.4106,4.524388e+07,5.343731,1.595128,1.736622,0.593731


# Mu & C

In [14]:
mu_list = []
c_list = []

for i in idx_list:
    events = raw_df.loc[i] # the dataframe of events in a month
    events.set_index('idx', inplace=True)
    idx_max_mag = events[['magnitude']].idxmax()
    
    if idx_max_mag[0] > TIME_STEP:
        events = raw_df.iloc[idx_max_mag[0] - TIME_STEP + 1:idx_max_mag[0]+1, :]
    else:
        continue
    
    mag = events['magnitude']
    

    hist_values = plt.hist(mag, bins=10, range=[MIN_MAG, MAX_MAG], cumulative=False, histtype='bar', log=True)
    plt.close()

    y = hist_values[0]
    X = hist_values[1][:-1]
    
    if 0 in y:
        idx_start = np.where(y != 0)[0][0]
        y = y[idx_start:]
        X = X[idx_start:]
        
    if 0 in y:
        idx_end = np.where(y == 0)[0][0]
        y = y[:idx_end]
        X = X[:idx_end]
    X = np.array(X)
    
    
    if y[-1] == 1:
        mu_list.append(0)
        c_list.append(0)
        continue
    
    characteristics = events[events['magnitude'] >= X[-1]]
    
    characteristics.reset_index(inplace=True)
    characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
    characteristics['time_diff'] = characteristics['datetime'].diff()
    mu = np.mean(characteristics['time_diff']).days
    if mu == 0:
        c = 0
    else:
        c = np.std(characteristics['time_diff']).days / mu
    
    mu_list.append(mu)
    c_list.append(c)

df['mu'] = mu_list
df['c'] = c_list
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetime(characteristics[['year', 'month', 'day', 'hour']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['time_diff'] = characteristics['datetime'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characteristics['datetime'] = pd.to_datetim

Unnamed: 0,T,M_mean,dE,M_expected,eta,b,delta_M,mu,c
0,181,4.5867,1.231688e+08,-0.519387,1.795153,-0.412321,5.819387,46,0.000000
1,207,4.5828,1.067747e+08,-0.630656,1.794082,-0.403303,5.930656,46,0.000000
2,225,4.5895,1.008165e+08,-6.671809,1.491627,-0.210577,11.971809,88,0.465909
3,264,4.6001,8.864885e+07,-5.777555,1.493104,-0.228961,11.077555,88,0.465909
4,284,4.5934,8.168575e+07,-5.777555,1.493104,-0.228961,11.077555,88,0.465909
...,...,...,...,...,...,...,...,...,...
313,43,3.3777,4.292558e+07,4.974961,1.980275,2.149257,0.304961,0,0.000000
314,47,3.3869,9.800679e+07,4.929870,1.995011,2.152564,0.810130,0,0.000000
315,48,3.3972,4.304746e+07,5.061241,1.902438,2.059937,0.391241,0,0.000000
316,51,3.4106,4.524388e+07,5.343731,1.595128,1.736622,0.593731,5,0.000000


In [64]:
len(df[df['mu'] == 0])

157

In [15]:
df.head(60)

Unnamed: 0,T,M_mean,dE,M_expected,eta,b,delta_M,mu,c
0,181,4.5867,123168800.0,-0.519387,1.795153,-0.412321,5.819387,46,0.0
1,207,4.5828,106774700.0,-0.630656,1.794082,-0.403303,5.930656,46,0.0
2,225,4.5895,100816500.0,-6.671809,1.491627,-0.210577,11.971809,88,0.465909
3,264,4.6001,88648850.0,-5.777555,1.493104,-0.228961,11.077555,88,0.465909
4,284,4.5934,81685750.0,-5.777555,1.493104,-0.228961,11.077555,88,0.465909
5,260,4.5842,90567800.0,60.01287,1.220875,0.047001,54.61287,85,0.388235
6,285,4.5419,74566980.0,10.256106,1.268383,0.455031,4.856106,104,0.230769
7,282,4.5384,74373360.0,10.256106,1.268383,0.455031,4.856106,104,0.230769
8,300,4.4682,62568100.0,7.019309,1.23002,1.136939,1.619309,95,0.157895
9,304,4.4523,60878720.0,6.842223,1.242323,1.236505,1.442223,95,0.157895


## Label

In [20]:
label_list = []
for i in idx_list:
    events = raw_df.loc[i] # the dataframe of events in a month
    events.set_index('idx', inplace=True)
    idx_max_mag = events[['magnitude']].idxmax()
    
    if idx_max_mag[0] > TIME_STEP:
        events = raw_df.iloc[idx_max_mag[0] - TIME_STEP + 1:idx_max_mag[0]+1, :]
    else:
        continue

    
    
    mag = events['magnitude']
    flag = False
    for m in mag:
        if m >= 5.5:
            flag = True
    label_list.append(1 if flag else 0)
    
df['label'] = label_list
df

Unnamed: 0,T,M_mean,dE,M_expected,eta,b,delta_M,mu,c,label
0,181,4.5867,1.231688e+08,-0.519387,1.795153,-0.412321,5.819387,46,0.000000,0
1,207,4.5828,1.067747e+08,-0.630656,1.794082,-0.403303,5.930656,46,0.000000,0
2,225,4.5895,1.008165e+08,-6.671809,1.491627,-0.210577,11.971809,88,0.465909,0
3,264,4.6001,8.864885e+07,-5.777555,1.493104,-0.228961,11.077555,88,0.465909,0
4,284,4.5934,8.168575e+07,-5.777555,1.493104,-0.228961,11.077555,88,0.465909,0
...,...,...,...,...,...,...,...,...,...,...
313,43,3.3777,4.292558e+07,4.974961,1.980275,2.149257,0.304961,0,0.000000,0
314,47,3.3869,9.800679e+07,4.929870,1.995011,2.152564,0.810130,0,0.000000,1
315,48,3.3972,4.304746e+07,5.061241,1.902438,2.059937,0.391241,0,0.000000,0
316,51,3.4106,4.524388e+07,5.343731,1.595128,1.736622,0.593731,5,0.000000,0


In [21]:
len(df[df['label'] == 1])

64

In [22]:
X = df[['T', 'M_mean', 'dE', 'M_expected', 'eta', 'b', 'delta_M', 'mu', 'c']]
y = df['label']

In [23]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

acc_score = []
k = 5
kf = KFold(n_splits=k)
for train_index , test_index in kf.split(X):
    X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
    model = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)
     
    acc = accuracy_score(pred_values , y_test)
    acc_score.append(acc)
    
avg_acc_score = sum(acc_score)/k
print(avg_acc_score * 100)

85.86805555555556


In [73]:
df.isna().sum()

T             0
M_mean        0
dE            0
M_expected    0
eta           0
b             0
delta_M       0
mu            0
c             0
label         0
dtype: int64