In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime

### Read data and show some statistics

In [2]:
# Load the flare list (first CSV column becomes the index), then discard the
# active-region column and the five flag columns, which are not used below.
solar_df = pd.read_csv(
    "hessi.solar.flare.UP_To_2018.csv", sep=',', index_col=0
).drop(
    columns=["active.region.ar", "flag.1", "flag.2", "flag.3", "flag.4", "flag.5"]
)

In [3]:
# Number of flare records loaded
solar_df.shape[0]

116143

In [4]:
# Preview the raw frame (pandas elides the middle rows of a long frame)
solar_df

Unnamed: 0_level_0,start.date,start.time,peak,end,duration.s,peak.c/s,total.counts,energy.kev,x.pos.asec,y.pos.asec,radial
flare,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021213,2002-02-12,21:29:56,21:33:38,21:41:48,712,136,167304.0,12-25,592,-358,692
2021228,2002-02-12,21:44:08,21:45:06,21:48:56,288,7,9504.0,6-12,604,-341,694
2021332,2002-02-13,00:53:24,00:54:54,00:57:00,216,15,11448.0,6-12,-310,375,487
2021308,2002-02-13,04:22:52,04:23:50,04:26:56,244,20,17400.0,12-25,-277,378,469
2021310,2002-02-13,07:03:52,07:05:14,07:07:48,236,336,313392.0,25-50,-272,390,476
...,...,...,...,...,...,...,...,...,...,...,...
18020903,2018-02-09,16:41:28,16:42:54,16:43:32,124,18,2888.0,6-12,-345,-38,347
18020904,2018-02-09,18:15:56,18:17:26,18:17:40,104,16,1656.0,6-12,-268,-38,271
18021001,2018-02-10,13:04:36,13:06:46,13:07:04,148,15,2224.0,6-12,-115,-38,121
18022601,2018-02-26,15:49:56,15:51:18,15:53:52,236,16,3312.0,6-12,115,192,223


In [5]:
# Count missing values per column
solar_df.isna().sum()

start.date      0
start.time      0
peak            0
end             0
duration.s      0
peak.c/s        0
total.counts    0
energy.kev      0
x.pos.asec      0
y.pos.asec      0
radial          0
dtype: int64

In [6]:
# Summary statistics of the numeric columns.
# NOTE(review): the position minima near -10000 and the very large radial maximum
# look like sentinel/invalid values rather than real positions — confirm and filter.
solar_df.describe()

Unnamed: 0,duration.s,peak.c/s,total.counts,x.pos.asec,y.pos.asec,radial
count,116143.0,116143.0,116143.0,116143.0,116143.0,116143.0
mean,493.643009,215.086617,376884.3,-7.681625,-42.185495,687.896989
std,434.131763,839.382841,3048797.0,755.773503,401.904509,511.364382
min,8.0,0.0,8.0,-10012.0,-10005.0,0.0
25%,212.0,28.0,22840.0,-701.0,-247.0,467.0
50%,364.0,56.0,58560.0,0.0,-71.0,759.0
75%,628.0,144.0,179808.0,708.0,198.0,946.0
max,4444.0,113156.0,435550100.0,1190.0,1223.0,14154.0


### Parse data

In [7]:
# Merge a date string and a clock-time string into one timestamp
def parse_date(sdatex, stimex):
    """Return a ``datetime`` built from a date string and a time-of-day string.

    sdatex -- calendar date formatted as ``YYYY-MM-DD``
    stimex -- time of day formatted as ``HH:MM:SS``

    ``datetime.strptime`` raises ``ValueError`` when either string does not
    match its expected format.
    """
    day_part = datetime.strptime(sdatex, '%Y-%m-%d').date()
    clock_part = datetime.strptime(stimex, '%H:%M:%S').time()
    return datetime.combine(day_part, clock_part)

In [8]:
# Build full start / peak / end timestamps, drop the raw string columns, and
# derive year / month / day features from the start timestamp.
#
# Parsing is vectorised with pd.to_datetime instead of a row-wise apply: it is
# much faster on ~100k rows and avoids positional x[0] / x[1] access on a
# label-indexed Series, which newer pandas versions deprecate.
for stamp_col, time_col in [('dt.start', 'start.time'), ('dt.peak', 'peak'), ('dt.end', 'end')]:
    solar_df[stamp_col] = pd.to_datetime(
        solar_df['start.date'] + ' ' + solar_df[time_col],
        format='%Y-%m-%d %H:%M:%S',
    )

# The file stores only the START date. A peak/end clock time that is earlier
# than the start time therefore belongs to the following calendar day (the
# flare crossed midnight); without this shift such flares would "end" almost a
# full day before they begin.
for stamp_col in ['dt.peak', 'dt.end']:
    crossed_midnight = solar_df[stamp_col] < solar_df['dt.start']
    solar_df[stamp_col] = solar_df[stamp_col].mask(
        crossed_midnight, solar_df[stamp_col] + pd.Timedelta(days=1)
    )

solar_df = solar_df.drop(columns=['start.date', 'start.time', 'peak', 'end'])

# add new columns
solar_df['year'] = solar_df['dt.start'].dt.year
solar_df['month'] = solar_df['dt.start'].dt.month
solar_df['day'] = solar_df['dt.start'].dt.day

In [9]:
# Check the column set after replacing the raw date/time strings
solar_df.columns

Index(['duration.s', 'peak.c/s', 'total.counts', 'energy.kev', 'x.pos.asec',
       'y.pos.asec', 'radial', 'dt.start', 'dt.peak', 'dt.end', 'year',
       'month', 'day'],
      dtype='object')

In [10]:
# Replace dotted / slashed column names with snake_case identifiers
column_aliases = {
    'duration.s': 'duration',
    'peak.c/s': 'peak_c_s',
    'total.counts': 'total_counts',
    'energy.kev': 'energy_kev',
    'x.pos.asec': 'x_pos',
    'y.pos.asec': 'y_pos',
    'dt.start': 'date_start',
    'dt.peak': 'date_peak',
    'dt.end': 'date_end',
}
solar_df = solar_df.rename(columns=column_aliases)

In [11]:
# Confirm the renamed columns
solar_df.columns

Index(['duration', 'peak_c_s', 'total_counts', 'energy_kev', 'x_pos', 'y_pos',
       'radial', 'date_start', 'date_peak', 'date_end', 'year', 'month',
       'day'],
      dtype='object')

In [12]:
# Preview the frame with parsed timestamps and calendar columns
solar_df

Unnamed: 0_level_0,duration,peak_c_s,total_counts,energy_kev,x_pos,y_pos,radial,date_start,date_peak,date_end,year,month,day
flare,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021213,712,136,167304.0,12-25,592,-358,692,2002-02-12 21:29:56,2002-02-12 21:33:38,2002-02-12 21:41:48,2002,2,12
2021228,288,7,9504.0,6-12,604,-341,694,2002-02-12 21:44:08,2002-02-12 21:45:06,2002-02-12 21:48:56,2002,2,12
2021332,216,15,11448.0,6-12,-310,375,487,2002-02-13 00:53:24,2002-02-13 00:54:54,2002-02-13 00:57:00,2002,2,13
2021308,244,20,17400.0,12-25,-277,378,469,2002-02-13 04:22:52,2002-02-13 04:23:50,2002-02-13 04:26:56,2002,2,13
2021310,236,336,313392.0,25-50,-272,390,476,2002-02-13 07:03:52,2002-02-13 07:05:14,2002-02-13 07:07:48,2002,2,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18020903,124,18,2888.0,6-12,-345,-38,347,2018-02-09 16:41:28,2018-02-09 16:42:54,2018-02-09 16:43:32,2018,2,9
18020904,104,16,1656.0,6-12,-268,-38,271,2018-02-09 18:15:56,2018-02-09 18:17:26,2018-02-09 18:17:40,2018,2,9
18021001,148,15,2224.0,6-12,-115,-38,121,2018-02-10 13:04:36,2018-02-10 13:06:46,2018-02-10 13:07:04,2018,2,10
18022601,236,16,3312.0,6-12,115,192,223,2018-02-26 15:49:56,2018-02-26 15:51:18,2018-02-26 15:53:52,2018,2,26


In [13]:
# Turn the energy-band strings into a pandas categorical and display the
# integer code -> band label mapping. Categories are ordered as strings, so the
# codes do not follow increasing energy.
dt = solar_df['energy_kev'].astype('category')
{code: band for code, band in enumerate(dt.cat.categories)}

{0: '100-300',
 1: '12-25',
 2: '25-50',
 3: '3-6',
 4: '300-800',
 5: '50-100',
 6: '6-12',
 7: '7000-20000',
 8: '800-7000'}

In [75]:
# Store the integer category codes in the energy column (replaces the strings)
solar_df = solar_df.assign(energy_kev=dt.cat.codes)

In [76]:
# First rows after encoding the energy band (head() shows 5 rows by default)
solar_df.head()

Unnamed: 0_level_0,duration,peak_c_s,total_counts,energy_kev,x_pos,y_pos,radial,date_start,date_peak,date_end,year,month,day
flare,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021213,712,136,167304.0,1,592,-358,692,2002-02-12 21:29:56,2002-02-12 21:33:38,2002-02-12 21:41:48,2002,2,12
2021228,288,7,9504.0,6,604,-341,694,2002-02-12 21:44:08,2002-02-12 21:45:06,2002-02-12 21:48:56,2002,2,12
2021332,216,15,11448.0,6,-310,375,487,2002-02-13 00:53:24,2002-02-13 00:54:54,2002-02-13 00:57:00,2002,2,13
2021308,244,20,17400.0,1,-277,378,469,2002-02-13 04:22:52,2002-02-13 04:23:50,2002-02-13 04:26:56,2002,2,13
2021310,236,336,313392.0,2,-272,390,476,2002-02-13 07:03:52,2002-02-13 07:05:14,2002-02-13 07:07:48,2002,2,13


In [77]:
# Column dtypes after parsing and encoding
solar_df.dtypes

duration                 int64
peak_c_s                 int64
total_counts           float64
energy_kev                int8
x_pos                    int64
y_pos                    int64
radial                   int64
date_start      datetime64[ns]
date_peak       datetime64[ns]
date_end        datetime64[ns]
year                     int64
month                    int64
day                      int64
dtype: object

### Create Train/Test Set

In [24]:
# Independent copy for modelling, with total_counts converted to integers
pd_df = solar_df.copy(deep=True).astype({'total_counts': int})

In [25]:
# Column dtypes of the modelling copy.
# NOTE(review): the saved output lists energy_kev as object, while the encoding
# cell stores integer codes — this cell was presumably executed before the
# encoding step. Restart the kernel and run all cells in order.
pd_df.dtypes

duration                 int64
peak_c_s                 int64
total_counts             int64
energy_kev              object
x_pos                    int64
y_pos                    int64
radial                   int64
date_start      datetime64[ns]
date_peak       datetime64[ns]
date_end        datetime64[ns]
year                     int64
month                    int64
day                      int64
dtype: object

In [26]:
# Timestamps are not used as model features; year / month / day remain
pd_df = pd_df.drop(columns=['date_start', 'date_peak', 'date_end'])

In [27]:
# Structure of the modelling frame: columns, non-null counts, dtypes
pd_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116143 entries, 2021213 to 18030301
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   duration      116143 non-null  int64 
 1   peak_c_s      116143 non-null  int64 
 2   total_counts  116143 non-null  int64 
 3   energy_kev    116143 non-null  object
 4   x_pos         116143 non-null  int64 
 5   y_pos         116143 non-null  int64 
 6   radial        116143 non-null  int64 
 7   year          116143 non-null  int64 
 8   month         116143 non-null  int64 
 9   day           116143 non-null  int64 
dtypes: int64(9), object(1)
memory usage: 9.7+ MB


In [28]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the split (and every score computed from it) is
# reproducible after a kernel restart. The target column is still part of both
# frames at this point.
X_train, X_test = train_test_split(pd_df, test_size=0.25, random_state=42)

In [29]:
# (rows, columns) of the training and test partitions
X_train.shape, X_test.shape

((87107, 10), (29036, 10))

In [30]:
# Pull the target (energy band) out of each partition, leaving only features
y_train, y_test = X_train['energy_kev'], X_test['energy_kev']
X_train = X_train.drop(columns='energy_kev')
X_test = X_test.drop(columns='energy_kev')

### Prediction

In [31]:
# Linear Regression
# NOTE(review): the target is the energy-band label, so this fits a regression
# on class labels as if they were a continuous quantity — confirm that this
# baseline is intended alongside the classifiers below.
# NOTE(review): the saved ValueError mentioning '12-25' suggests the cell was
# run while energy_kev still held strings; re-run the notebook top to bottom.
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression().fit(X_train, y_train)
linear_regression_predictions = linear_regression.predict(X_test)
# For regressors, .score() is the coefficient of determination R^2
linear_regression_score = linear_regression.score(X_test, y_test)
linear_regression_score_train = linear_regression.score(X_train, y_train)
# Display (test, train)
linear_regression_score, linear_regression_score_train

ValueError: could not convert string to float: '12-25'

In [99]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression().fit(X_train, y_train)
logistic_regression_predictions = logistic_regression.predict(X_test)
logistic_regression_score = logistic_regression.score(X_test, y_test)
logistic_regression_score_train = logistic_regression.score(X_train, y_train)
logistic_regression_score, logistic_regression_score_train

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.7762777242044359, 0.7749893808763934)

In [100]:
from sklearn import metrics

# Per-class precision / recall / F1 for logistic regression on the test split.
# zero_division=0 reports the same 0.00 for classes that are never predicted,
# but without one UndefinedMetricWarning per affected metric.
print('\nReport on data set:')
print(metrics.classification_report(y_test, logistic_regression_predictions, zero_division=0))
# Micro-averaged (precision, recall, F1, support)
print(metrics.precision_recall_fscore_support(
    y_test, logistic_regression_predictions, average='micro', zero_division=0))


Report on data set:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        54
           1       0.60      0.23      0.33      4844
           2       0.32      0.02      0.03       518
           3       0.00      0.00      0.00      1717
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00       123
           6       0.79      0.98      0.88     21774
           7       0.00      0.00      0.00         1

    accuracy                           0.78     29036
   macro avg       0.21      0.15      0.15     29036
weighted avg       0.70      0.78      0.71     29036

(0.7762777242044359, 0.7762777242044359, 0.7762777242044359, None)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
decision_tree_classifier = DecisionTreeClassifier().fit(X_train, y_train)
decision_tree_predictions = decision_tree_classifier.predict(X_test)
decision_tree_score = decision_tree_classifier.score(X_test, y_test)
decision_tree_score_train = decision_tree_classifier.score(X_train, y_train)
decision_tree_score, decision_tree_score_train

(0.8135418101666896, 1.0)

In [102]:
# Per-class precision / recall / F1 for the decision tree on the test split.
# zero_division=0 keeps undefined metrics at 0.00 without emitting warnings.
print('\nReport on data set:')
print(metrics.classification_report(y_test, decision_tree_predictions, zero_division=0))
# Micro-averaged (precision, recall, F1, support)
print(metrics.precision_recall_fscore_support(
    y_test, decision_tree_predictions, average='micro', zero_division=0))


Report on data set:
              precision    recall  f1-score   support

           0       0.12      0.11      0.12        54
           1       0.50      0.51      0.51      4844
           2       0.11      0.12      0.12       518
           3       1.00      1.00      1.00      1717
           4       0.00      0.00      0.00         5
           5       0.13      0.15      0.14       123
           6       0.90      0.89      0.89     21774
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0

    accuracy                           0.81     29036
   macro avg       0.31      0.31      0.31     29036
weighted avg       0.82      0.81      0.82     29036

(0.8135418101666896, 0.8135418101666896, 0.8135418101666896, None)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [104]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)
random_forest_predictions = random_forest_classifier.predict(X_test)
random_forest_score = random_forest_classifier.score(X_test, y_test)
random_forest_score_train = random_forest_classifier.score(X_train, y_train)
random_forest_score, random_forest_score_train

(0.8719864995178399, 1.0)

In [105]:
# Per-class precision / recall / F1 for the random forest on the test split.
# zero_division=0 keeps undefined metrics at 0.00 without emitting warnings.
print('\nReport on data set:')
print(metrics.classification_report(y_test, random_forest_predictions, zero_division=0))
# Micro-averaged (precision, recall, F1, support)
print(metrics.precision_recall_fscore_support(
    y_test, random_forest_predictions, average='micro', zero_division=0))


Report on data set:
              precision    recall  f1-score   support

           0       0.45      0.09      0.15        54
           1       0.67      0.56      0.61      4844
           2       0.38      0.04      0.07       518
           3       1.00      1.00      1.00      1717
           4       0.00      0.00      0.00         5
           5       0.37      0.11      0.16       123
           6       0.90      0.96      0.93     21774
           7       0.00      0.00      0.00         1

    accuracy                           0.87     29036
   macro avg       0.47      0.34      0.37     29036
weighted avg       0.85      0.87      0.86     29036

(0.8719864995178399, 0.8719864995178399, 0.8719864995178399, None)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [106]:
# K-nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
kneighbors_classifier = KNeighborsClassifier().fit(X_train, y_train)
kneighbors_predictions = kneighbors_classifier.predict(X_test)
kneighbors_score = kneighbors_classifier.score(X_test, y_test)
kneighbors_score_train = kneighbors_classifier.score(X_train, y_train)
kneighbors_score, kneighbors_score_train

(0.8051040088166415, 0.848565557303087)

In [107]:
# Per-class precision / recall / F1 for k-nearest neighbours on the test split.
# zero_division=0 keeps undefined metrics at 0.00 without emitting warnings.
print('\nReport on data set:')
print(metrics.classification_report(y_test, kneighbors_predictions, zero_division=0))
# Micro-averaged (precision, recall, F1, support)
print(metrics.precision_recall_fscore_support(
    y_test, kneighbors_predictions, average='micro', zero_division=0))


Report on data set:
              precision    recall  f1-score   support

           0       0.11      0.02      0.03        54
           1       0.51      0.42      0.46      4844
           2       0.12      0.02      0.04       518
           3       0.73      0.75      0.74      1717
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00       123
           6       0.86      0.92      0.89     21774
           7       0.00      0.00      0.00         1

    accuracy                           0.81     29036
   macro avg       0.29      0.27      0.27     29036
weighted avg       0.78      0.81      0.79     29036

(0.8051040088166415, 0.8051040088166415, 0.8051040088166415, None)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [108]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gradient_classifier = GradientBoostingClassifier().fit(X_train, y_train)
gradient_classifier_predictions = gradient_classifier.predict(X_test)
gradient_classifier_score = gradient_classifier.score(X_test, y_test)
gradient_classifier_score_train = gradient_classifier.score(X_train, y_train)
gradient_classifier_score, gradient_classifier_score_train

(0.6896266703402673, 0.690702239774071)

In [109]:
# Per-class precision / recall / F1 for gradient boosting on the test split.
# zero_division=0 keeps undefined metrics at 0.00 without emitting warnings.
print('\nReport on data set:')
print(metrics.classification_report(y_test, gradient_classifier_predictions, zero_division=0))
# Micro-averaged (precision, recall, F1, support)
print(metrics.precision_recall_fscore_support(
    y_test, gradient_classifier_predictions, average='micro', zero_division=0))


Report on data set:
              precision    recall  f1-score   support

           0       0.13      0.06      0.08        54
           1       0.67      0.41      0.50      4844
           2       0.18      0.01      0.03       518
           3       1.00      0.13      0.23      1717
           4       0.00      0.00      0.00         5
           5       0.40      0.10      0.16       123
           6       0.89      0.82      0.85     21774
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0

    accuracy                           0.69     29036
   macro avg       0.36      0.17      0.20     29036
weighted avg       0.84      0.69      0.74     29036

(0.6896266703402673, 0.6896266703402673, 0.6896266703402673, None)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [110]:
# Using Neural Network (scikit-learn)
from sklearn.neural_network import MLPClassifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,10)).fit(X_train, y_train)
mlp_predictions = mlp_classifier.predict(X_test)
mlp_score = mlp_classifier.score(X_test, y_test)
mlp_score_train = mlp_classifier.score(X_train, y_train)
mlp_score, mlp_score_train

(0.7498966799834688, 0.7483325105904233)

In [111]:
# Per-class precision / recall / F1 for the MLP on the test split.
# zero_division=0 keeps undefined metrics at 0.00 without emitting warnings.
print('\nReport on data set:')
print(metrics.classification_report(y_test, mlp_predictions, zero_division=0))
# Micro-averaged (precision, recall, F1, support)
print(metrics.precision_recall_fscore_support(
    y_test, mlp_predictions, average='micro', zero_division=0))


Report on data set:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        54
           1       0.00      0.00      0.00      4844
           2       0.00      0.00      0.00       518
           3       0.00      0.00      0.00      1717
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00       123
           6       0.75      1.00      0.86     21774
           7       0.00      0.00      0.00         1

    accuracy                           0.75     29036
   macro avg       0.09      0.12      0.11     29036
weighted avg       0.56      0.75      0.64     29036

(0.7498966799834688, 0.7498966799834688, 0.7498966799834689, None)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
