In [1]:
import os
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from components.dataToolkit import dataToolkit

In [2]:
list_filename="gspy-db-20180813_O1_filtered_t1126400691-1205493119_snr7.5_tr_gspy.csv"

In [3]:
data_dir = os.path.join(os.path.dirname(os.getcwd()),"data")
data_dir

'/notebooks/data'

In [4]:
gl_df = pd.read_csv(os.path.join(data_dir,list_filename))
gl_df.describe()

Unnamed: 0,GPStime,peakFreq,snr,centralFreq,duration,bandwidth
count,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0
mean,1131870000.0,204.124335,192.144943,1529.353582,1.779605,2941.086955
std,3165652.0,374.45467,1589.944713,1320.377343,2.672461,2664.10321
min,1126403000.0,10.072,7.501,9.78,0.007,1.25918
25%,1128905000.0,34.175,10.337,255.412,0.227,423.261902
50%,1132168000.0,111.128,15.407,1228.915,0.766,2320.672363
75%,1134571000.0,183.495,36.9975,2630.882,2.1445,5228.188477
max,1137250000.0,2047.106,81178.727,4615.132,42.156,7946.48291


In [5]:
gl_df.shape

(6667, 9)

In [6]:
#X = gl_df.get(['GPStime','peakFreq','snr','amplitude','centralFreq','duration','bandwidth','chisq','chisqDof','confidence'])
X = gl_df.get(['GPStime','peakFreq', 'snr', 'centralFreq', 'duration', 'bandwidth'])
Y = gl_df.get('label')

dtk = dataToolkit()
cAmean, cD3mean, cD2mean, cD1mean = dtk.getExtraFeatures()
dtk.close()

X.loc[:,'cAmean'] = pd.Series(cAmean, index=X.index)
X.loc[:,'cD3mean'] = pd.Series(cD3mean, index=X.index)

#X.fillna(X.mean(), inplace=True)

print('X : ')
print(X.count())
print()
print('Y: ')
print(Y.count())

Y_num = pd.factorize(Y)
Y_num_labels = Y_num[0]
Y_count_labels = Y_num[1].size

print()
print('Distinct classes :', Y_count_labels)
print()
print('Y_num_labels : ', Y_num_labels.size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


X : 
GPStime        6667
peakFreq       6667
snr            6667
centralFreq    6667
duration       6667
bandwidth      6667
cAmean         6667
cD3mean        6667
dtype: int64

Y: 
6667

Distinct classes : 22

Y_num_labels :  6667


In [7]:
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_num_labels, test_size=test_size, random_state=seed, stratify=Y)

In [8]:
print('X_train : ')
print(X_train.count())
print()
print('Y_train: ')
print(Y_train.size)
print()
print('X_test: ')
print(X_test.count())
print()
print('Y_test: ')
print(Y_test.size)

X_train : 
GPStime        4466
peakFreq       4466
snr            4466
centralFreq    4466
duration       4466
bandwidth      4466
cAmean         4466
cD3mean        4466
dtype: int64

Y_train: 
4466

X_test: 
GPStime        2201
peakFreq       2201
snr            2201
centralFreq    2201
duration       2201
bandwidth      2201
cAmean         2201
cD3mean        2201
dtype: int64

Y_test: 
2201


In [9]:
rf = RandomForestClassifier(max_depth=None, n_estimators=100, max_features=2, verbose=1)
rf.fit(X_train, Y_train)
rf_preds = rf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [10]:
error_rate_rf = np.sum(rf_preds != Y_test) / Y_test.size
error_rate_rf

0.12221717401181281

In [11]:
rf.feature_importances_

array([0.12497417, 0.26506372, 0.18639142, 0.08875672,
       0.17747314, 0.08771392, 0.03525202, 0.0343749 ])

In [12]:
print(classification_report(Y_test, rf_preds))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       141
           1       0.80      0.13      0.23        30
           2       1.00      1.00      1.00        45
           3       0.99      0.99      0.99       149
           4       0.88      0.96      0.92        48
           5       0.91      0.90      0.90        89
           6       0.99      0.96      0.97        89
           7       0.74      0.66      0.70       132
           8       0.83      0.71      0.77         7
           9       0.77      0.85      0.81       174
          10       0.90      0.93      0.92       234
          11       0.86      0.80      0.83       163
          12       0.90      0.96      0.93       582
          13       1.00      1.00      1.00        27
          14       0.88      0.75      0.81        20
          15       0.89      0.89      0.89       148
          16       0.67      0.44      0.53        50
          17       1.00    

  'precision', 'predicted', average, warn_for)


0       9.121110e-23
1       6.817579e-24
2       1.343891e-23
3       4.636141e-23
4      -7.044793e-23
5      -1.280729e-22
6       1.473982e-23
7       2.479176e-23
8       4.038468e-23
9       5.700009e-23
10      1.078155e-22
11     -8.559324e-23
12     -9.359010e-23
13      6.021650e-23
14     -9.581370e-23
15     -8.027903e-23
16      1.745142e-23
17      2.654983e-23
18      9.001840e-23
19      4.671199e-23
20      1.307739e-22
21      1.761694e-23
22     -5.494450e-23
23      1.495215e-23
24      1.121051e-23
25      5.227749e-23
26      8.213173e-23
27     -8.816400e-23
28     -2.627676e-23
29      5.343343e-23
            ...     
6637    4.940120e-20
6638   -3.252053e-23
6639   -1.695959e-20
6640   -4.255542e-20
6641    2.525572e-21
6642   -5.051950e-24
6643   -5.035929e-22
6644   -2.674454e-21
6645   -2.933268e-20
6646    9.080055e-20
6647   -3.353437e-23
6648    5.925801e-24
6649   -5.845469e-19
6650   -1.735833e-21
6651   -6.900887e-24
6652    5.776668e-24
6653   -3.295