In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import sklearn as sk
from sklearn import preprocessing
from sklearn import model_selection
import os
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score,recall_score,precision_score
import itertools as it

In [None]:
# Manually picked true positives: hand-selected identifiers of flights that
# are considered genuine anomalies.
# NOTE(review): the values look like 'ifpsid' keys (the merge key used for the
# feature tables) - confirm. The list is not referenced by any other cell
# visible in this part of the notebook.
positives = [
	'AA16064612',
	'AA15898033',
	'AA16859504',
	'AA15555547',
	'AA16253838',
	'AA15690400',
	'AA16482851',
	'AA16650075',
	'AA17123584',
	'AA16916241',
	'AA17152178',
	'AA16589401',
	'AA17060787',
	'AA17130951',
	'AA15363462',
	'AA16857365',
	'AA16315097',
	'AA16658861',
	'AA15200234',
	'AA15653581',
	'AA16192547',
	'AA17026936',
	'AA15159907',
	'AA17054885',
	'AA15923342',
	'AA16073135',
	'AA16489819',
	'AA17051095',
	'AA15121656',
	'AA17205192',
	'AA17080146',
	'AA15964197',
	'AA16196411',
	'AA16576061',
	'AA16435085',
	'AA16176520',
	'AA16004931',
	'AA15793620',
	'AA15551849',
	'AA16879602',
	'AA15617742',
	'AA16524282',
	'AA16552065',
	'AA15310198',
	'AA15625977',
	'AA16907969',
	'AA16678130',
	'AA16095467',
	'AA16385441',
	'AA16171152',
	'AA16824984',
	'AA16921682',
	'AA16636556',
	'AA16744631',
	'AA16999415',
	'AA15499446',
	'AA16344473',
	'AA16961156',
	'AA16189242',
	'AA17158789',
	'AA15749776',
	'AA16895405',
	'AA15198624',
	'AA15853554',
	'AA15204685',
	'AA16274578',
	'AA15949178',
	'AA15886116',
	'AA15190364',
	'AA15433376',
	'AA17237470',
	'AA15433050',
	'AA16134022',
	'AA16620792',
	'AA16977468',
	'AA15515743',
	'AA17091147',
	'AA15618542',
	'AA15588480',
	'AA16584718',
	'AA16433513',
	'AA16505331',
	'AA15259877',
	'AA15313819',
	'AA16820543',
	'AA15775132',
	'AA16298995',
	'AA15929560',
	'AA16488663',
	'AA15694903',
]

In [None]:
## base splits (df_train and df_test are derived from df_ifpsid_suitable via train-test-split)
# training split: all suitable flights
df_train = pd.read_pickle("../pickles/ifpsid_train.pickle")
# test split: heathrow data excluded, manually labelled true positives included
# (a label of -1 marks a true positive)
df_test = pd.read_pickle("../pickles/df_test.pickle")

## feature tables covering the full data set
# FFT features, linear-interpolated, 16 frequency portions
df_fft_lip = pd.read_pickle("../pickles/df_fft_lip.pickle")
# histogram features, l2-normalized, 12 bins
df_hist_l2_b12 = pd.read_pickle("../pickles/df_hist_l2_12bins.pickle")

## attach each feature table to both splits via an inner join on the flight id
train_fft, test_fft = (
    split.merge(df_fft_lip, how='inner', on=['ifpsid'])
    for split in (df_train, df_test)
)
train_hist, test_hist = (
    split.merge(df_hist_l2_b12, how='inner', on=['ifpsid'])
    for split in (df_train, df_test)
)

In [None]:
## custom grid search for each method (for more control)
def search_IF_FFT(params, freq_count, train_sample='full', verbose=False, random_state=1337, n_jobs=-1):
    """Grid-search IsolationForest hyperparameters on the FFT feature vectors.

    For every combination of the supplied hyperparameter values an
    IsolationForest is fitted on the training features and used to predict the
    test features. Recall, precision and F1 are computed with label -1 as the
    positive class.

    Parameters
    ----------
    params : dict
        Maps 'contamination', 'max_features' and 'n_estimators' to lists of
        candidate values; the full Cartesian product is evaluated.
    freq_count : int
        Number of FFT columns to use ('f0' .. 'f{freq_count-1}').
    train_sample : int or 'full'
        'full' fits on every row of ``train_fft``; an int fits on the first
        ``train_sample`` rows only.
    verbose : bool
        Print one progress line per combination.
    random_state, n_jobs
        Forwarded to ``IsolationForest``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns 'contamination',
        'max_features', 'n_estimators', 'freq_count', 'sample_count',
        'recall', 'precision' and 'f1'.
    """
    hyper_keys = ['contamination', 'max_features', 'n_estimators']
    columns = hyper_keys + ['freq_count', 'sample_count', 'recall', 'precision', 'f1']

    # every combination of the candidate values, as one dict per combination
    grid = [dict(zip(params.keys(), combination)) for combination in it.product(*params.values())]

    # the feature columns are identical for train and test, so build them once
    feature_cols = ['f{}'.format(i) for i in range(freq_count)]
    train = train_fft[feature_cols]
    if train_sample != 'full':
        train = train.iloc[:train_sample]
    test = test_fft[feature_cols]

    # Labels come from the same merged frame as the test features: the inner
    # merge may drop or duplicate rows relative to df_test, which would
    # misalign labels and predictions.
    # NOTE(review): assumes the feature table has no 'true_positive' column of
    # its own (the merge would suffix it otherwise) - confirm.
    y_true = test_fft['true_positive']

    records = []
    for n, gp in enumerate(grid):
        if verbose:
            print('{}/{}: {}'.format(n+1, len(grid), gp))
        clf = IsolationForest(contamination=gp['contamination'],
                              max_features=gp['max_features'],
                              n_estimators=gp['n_estimators'],
                              random_state=random_state, n_jobs=n_jobs)
        clf.fit(train)
        pred = clf.predict(test)

        scoring = dict(y_true=y_true, y_pred=pred, pos_label=-1, average='binary')
        row = {key: gp[key] for key in hyper_keys}
        row.update(freq_count=freq_count,
                   sample_count=train_sample,
                   recall=recall_score(**scoring),
                   precision=precision_score(**scoring),
                   f1=f1_score(**scoring))
        records.append(row)

    # passing columns= keeps the column layout even when the grid is empty
    return pd.DataFrame(records, columns=columns)

def search_IF_hist(params, train_sample='full', verbose=False, random_state=1337, n_jobs=-1):
    """Grid-search IsolationForest hyperparameters on the histogram feature vectors.

    For every combination of the supplied hyperparameter values an
    IsolationForest is fitted on the training features (columns 'h0' .. 'h11')
    and used to predict the test features. Recall, precision and F1 are
    computed with label -1 as the positive class.

    Parameters
    ----------
    params : dict
        Maps 'contamination', 'max_features' and 'n_estimators' to lists of
        candidate values; the full Cartesian product is evaluated.
    train_sample : int or 'full'
        'full' fits on every row of ``train_hist``; an int fits on the first
        ``train_sample`` rows only.
    verbose : bool
        Print one progress line per combination.
    random_state, n_jobs
        Forwarded to ``IsolationForest``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns 'contamination',
        'max_features', 'n_estimators', 'sample_count', 'recall', 'precision'
        and 'f1'.
    """
    hyper_keys = ['contamination', 'max_features', 'n_estimators']
    columns = hyper_keys + ['sample_count', 'recall', 'precision', 'f1']

    # every combination of the candidate values, as one dict per combination
    grid = [dict(zip(params.keys(), combination)) for combination in it.product(*params.values())]

    # the feature columns are identical for train and test, so build them once
    feature_cols = ['h{}'.format(i) for i in range(12)]
    train = train_hist[feature_cols]
    if train_sample != 'full':
        train = train.iloc[:train_sample]
    test = test_hist[feature_cols]

    # Labels come from the same merged frame as the test features: the inner
    # merge may drop or duplicate rows relative to df_test, which would
    # misalign labels and predictions.
    # NOTE(review): assumes the feature table has no 'true_positive' column of
    # its own (the merge would suffix it otherwise) - confirm.
    y_true = test_hist['true_positive']

    records = []
    for n, gp in enumerate(grid):
        if verbose:
            print('{}/{}: {}'.format(n+1, len(grid), gp))
        clf = IsolationForest(contamination=gp['contamination'],
                              max_features=gp['max_features'],
                              n_estimators=gp['n_estimators'],
                              random_state=random_state, n_jobs=n_jobs)
        clf.fit(train)
        pred = clf.predict(test)

        scoring = dict(y_true=y_true, y_pred=pred, pos_label=-1, average='binary')
        row = {key: gp[key] for key in hyper_keys}
        row.update(sample_count=train_sample,
                   recall=recall_score(**scoring),
                   precision=precision_score(**scoring),
                   f1=f1_score(**scoring))
        records.append(row)

    # passing columns= keeps the column layout even when the grid is empty
    return pd.DataFrame(records, columns=columns)

def search_LOF_FFT(params, freq_count, train_sample='full', verbose=False, n_jobs=-1):
    """Grid-search LocalOutlierFactor hyperparameters on the FFT feature vectors.

    For every combination of the supplied hyperparameter values a
    LocalOutlierFactor (constructed with ``novelty=True``) is fitted on the
    training features and used to predict the test features. Recall, precision
    and F1 are computed with label -1 as the positive class.

    Parameters
    ----------
    params : dict
        Maps 'n_neighbors', 'algorithm', 'leaf_size', 'metric', 'p' and
        'contamination' to lists of candidate values; the full Cartesian
        product is evaluated.
    freq_count : int
        Number of FFT columns to use ('f0' .. 'f{freq_count-1}').
    train_sample : int or 'full'
        'full' fits on every row of ``train_fft``; an int fits on the first
        ``train_sample`` rows only.
    verbose : bool
        Print one progress line per combination.
    n_jobs
        Forwarded to ``LocalOutlierFactor``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns 'n_neighbors', 'algorithm',
        'leaf_size', 'metric', 'p', 'contamination', 'freq_count',
        'sample_count', 'recall', 'precision' and 'f1'.
    """
    hyper_keys = ['n_neighbors', 'algorithm', 'leaf_size', 'metric', 'p', 'contamination']
    columns = hyper_keys + ['freq_count', 'sample_count', 'recall', 'precision', 'f1']

    # every combination of the candidate values, as one dict per combination
    grid = [dict(zip(params.keys(), combination)) for combination in it.product(*params.values())]

    # the feature columns are identical for train and test, so build them once
    feature_cols = ['f{}'.format(i) for i in range(freq_count)]
    train = train_fft[feature_cols]
    if train_sample != 'full':
        train = train.iloc[:train_sample]
    test = test_fft[feature_cols]

    # Labels come from the same merged frame as the test features: the inner
    # merge may drop or duplicate rows relative to df_test, which would
    # misalign labels and predictions.
    # NOTE(review): assumes the feature table has no 'true_positive' column of
    # its own (the merge would suffix it otherwise) - confirm.
    y_true = test_fft['true_positive']

    records = []
    for n, gp in enumerate(grid):
        if verbose:
            print('{}/{}: {}'.format(n+1, len(grid), gp))
        clf = LocalOutlierFactor(n_neighbors=gp['n_neighbors'],
                                 algorithm=gp['algorithm'],
                                 leaf_size=gp['leaf_size'],
                                 metric=gp['metric'],
                                 p=gp['p'],
                                 contamination=gp['contamination'],
                                 novelty=True, n_jobs=n_jobs)
        clf.fit(train)
        pred = clf.predict(test)

        scoring = dict(y_true=y_true, y_pred=pred, pos_label=-1, average='binary')
        row = {key: gp[key] for key in hyper_keys}
        row.update(freq_count=freq_count,
                   sample_count=train_sample,
                   recall=recall_score(**scoring),
                   precision=precision_score(**scoring),
                   f1=f1_score(**scoring))
        records.append(row)

    # passing columns= keeps the column layout even when the grid is empty
    return pd.DataFrame(records, columns=columns)

def search_LOF_hist(params, train_sample='full', verbose=False, n_jobs=-1):
    """Grid-search LocalOutlierFactor hyperparameters on the histogram feature vectors.

    For every combination of the supplied hyperparameter values a
    LocalOutlierFactor (constructed with ``novelty=True``) is fitted on the
    training features (columns 'h0' .. 'h11') and used to predict the test
    features. Recall, precision and F1 are computed with label -1 as the
    positive class.

    Parameters
    ----------
    params : dict
        Maps 'n_neighbors', 'algorithm', 'leaf_size', 'metric', 'p' and
        'contamination' to lists of candidate values; the full Cartesian
        product is evaluated.
    train_sample : int or 'full'
        'full' fits on every row of ``train_hist``; an int fits on the first
        ``train_sample`` rows only.
    verbose : bool
        Print one progress line per combination.
    n_jobs
        Forwarded to ``LocalOutlierFactor``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns 'n_neighbors', 'algorithm',
        'leaf_size', 'metric', 'p', 'contamination', 'sample_count', 'recall',
        'precision' and 'f1'.
    """
    hyper_keys = ['n_neighbors', 'algorithm', 'leaf_size', 'metric', 'p', 'contamination']
    columns = hyper_keys + ['sample_count', 'recall', 'precision', 'f1']

    # every combination of the candidate values, as one dict per combination
    grid = [dict(zip(params.keys(), combination)) for combination in it.product(*params.values())]

    # the feature columns are identical for train and test, so build them once
    feature_cols = ['h{}'.format(i) for i in range(12)]
    train = train_hist[feature_cols]
    if train_sample != 'full':
        train = train.iloc[:train_sample]
    test = test_hist[feature_cols]

    # Labels come from the same merged frame as the test features: the inner
    # merge may drop or duplicate rows relative to df_test, which would
    # misalign labels and predictions.
    # NOTE(review): assumes the feature table has no 'true_positive' column of
    # its own (the merge would suffix it otherwise) - confirm.
    y_true = test_hist['true_positive']

    records = []
    for n, gp in enumerate(grid):
        if verbose:
            print('{}/{}: {}'.format(n+1, len(grid), gp))
        clf = LocalOutlierFactor(n_neighbors=gp['n_neighbors'],
                                 algorithm=gp['algorithm'],
                                 leaf_size=gp['leaf_size'],
                                 metric=gp['metric'],
                                 p=gp['p'],
                                 contamination=gp['contamination'],
                                 novelty=True, n_jobs=n_jobs)
        clf.fit(train)
        pred = clf.predict(test)

        scoring = dict(y_true=y_true, y_pred=pred, pos_label=-1, average='binary')
        row = {key: gp[key] for key in hyper_keys}
        row.update(sample_count=train_sample,
                   recall=recall_score(**scoring),
                   precision=precision_score(**scoring),
                   f1=f1_score(**scoring))
        records.append(row)

    # passing columns= keeps the column layout even when the grid is empty
    return pd.DataFrame(records, columns=columns)

def search_OCSVM_FFT(params, freq_count, train_sample='full', verbose=False):
    """Grid-search OneClassSVM hyperparameters on the FFT feature vectors.

    For every combination of the supplied hyperparameter values a OneClassSVM
    is fitted on the training features and used to predict the test features.
    Recall, precision and F1 are computed with label -1 as the positive class.

    Parameters
    ----------
    params : dict
        Maps 'kernel', 'degree', 'gamma', 'coef0', 'tol', 'nu' and 'shrinking'
        to lists of candidate values; the full Cartesian product is evaluated.
    freq_count : int
        Number of FFT columns to use ('f0' .. 'f{freq_count-1}').
    train_sample : int or 'full'
        'full' fits on every row of ``train_fft``; an int fits on the first
        ``train_sample`` rows only.
    verbose : bool
        Print one progress line per combination.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns 'kernel', 'degree', 'gamma',
        'coef0', 'tol', 'nu', 'shrinking', 'freq_count', 'sample_count',
        'recall', 'precision' and 'f1'.
    """
    hyper_keys = ['kernel', 'degree', 'gamma', 'coef0', 'tol', 'nu', 'shrinking']
    columns = hyper_keys + ['freq_count', 'sample_count', 'recall', 'precision', 'f1']

    # every combination of the candidate values, as one dict per combination
    grid = [dict(zip(params.keys(), combination)) for combination in it.product(*params.values())]

    # the feature columns are identical for train and test, so build them once
    feature_cols = ['f{}'.format(i) for i in range(freq_count)]
    train = train_fft[feature_cols]
    if train_sample != 'full':
        train = train.iloc[:train_sample]
    test = test_fft[feature_cols]

    # Labels come from the same merged frame as the test features: the inner
    # merge may drop or duplicate rows relative to df_test, which would
    # misalign labels and predictions.
    # NOTE(review): assumes the feature table has no 'true_positive' column of
    # its own (the merge would suffix it otherwise) - confirm.
    y_true = test_fft['true_positive']

    records = []
    for n, gp in enumerate(grid):
        if verbose:
            print('{}/{}: {}'.format(n+1, len(grid), gp))
        clf = OneClassSVM(kernel=gp['kernel'],
                          degree=gp['degree'],
                          gamma=gp['gamma'],
                          coef0=gp['coef0'],
                          tol=gp['tol'],
                          nu=gp['nu'],
                          shrinking=gp['shrinking'])
        clf.fit(train)
        pred = clf.predict(test)

        scoring = dict(y_true=y_true, y_pred=pred, pos_label=-1, average='binary')
        row = {key: gp[key] for key in hyper_keys}
        row.update(freq_count=freq_count,
                   sample_count=train_sample,
                   recall=recall_score(**scoring),
                   precision=precision_score(**scoring),
                   f1=f1_score(**scoring))
        records.append(row)

    # passing columns= keeps the column layout even when the grid is empty
    return pd.DataFrame(records, columns=columns)

def search_OCSVM_hist(params, train_sample='full', verbose=False):
    """Grid-search OneClassSVM hyperparameters on the histogram feature vectors.

    For every combination of the supplied hyperparameter values a OneClassSVM
    is fitted on the training features (columns 'h0' .. 'h11') and used to
    predict the test features. Recall, precision and F1 are computed with
    label -1 as the positive class.

    Parameters
    ----------
    params : dict
        Maps 'kernel', 'degree', 'gamma', 'coef0', 'tol', 'nu' and 'shrinking'
        to lists of candidate values; the full Cartesian product is evaluated.
    train_sample : int or 'full'
        'full' fits on every row of ``train_hist``; an int fits on the first
        ``train_sample`` rows only.
    verbose : bool
        Print one progress line per combination.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns 'kernel', 'degree', 'gamma',
        'coef0', 'tol', 'nu', 'shrinking', 'sample_count', 'recall',
        'precision' and 'f1'.
    """
    hyper_keys = ['kernel', 'degree', 'gamma', 'coef0', 'tol', 'nu', 'shrinking']
    columns = hyper_keys + ['sample_count', 'recall', 'precision', 'f1']

    # every combination of the candidate values, as one dict per combination
    grid = [dict(zip(params.keys(), combination)) for combination in it.product(*params.values())]

    # the feature columns are identical for train and test, so build them once
    feature_cols = ['h{}'.format(i) for i in range(12)]
    train = train_hist[feature_cols]
    if train_sample != 'full':
        train = train.iloc[:train_sample]
    test = test_hist[feature_cols]

    # Labels come from the same merged frame as the test features: the inner
    # merge may drop or duplicate rows relative to df_test, which would
    # misalign labels and predictions.
    # NOTE(review): assumes the feature table has no 'true_positive' column of
    # its own (the merge would suffix it otherwise) - confirm.
    y_true = test_hist['true_positive']

    records = []
    for n, gp in enumerate(grid):
        if verbose:
            print('{}/{}: {}'.format(n+1, len(grid), gp))
        clf = OneClassSVM(kernel=gp['kernel'],
                          degree=gp['degree'],
                          gamma=gp['gamma'],
                          coef0=gp['coef0'],
                          tol=gp['tol'],
                          nu=gp['nu'],
                          shrinking=gp['shrinking'])
        clf.fit(train)
        pred = clf.predict(test)

        scoring = dict(y_true=y_true, y_pred=pred, pos_label=-1, average='binary')
        row = {key: gp[key] for key in hyper_keys}
        row.update(sample_count=train_sample,
                   recall=recall_score(**scoring),
                   precision=precision_score(**scoring),
                   f1=f1_score(**scoring))
        records.append(row)

    # passing columns= keeps the column layout even when the grid is empty
    return pd.DataFrame(records, columns=columns)

In [None]:
### examples of testing

In [None]:
%%time
## testing if_fft
params = {'contamination': [0.0053],
        'max_features': [4],
        'n_estimators': [100,200,300,400,500]}
df_if_fft = search_IF_FFT(params=params,freq_count=4,train_sample=100,verbose=True)
for f in [2,4,8]:
    params['max_features'] = [f]
    for s in [1000,10000,20000,40000,80000,'full']:
        #print("Working on f={}, s={}".format(f,s))
        df_if_fft = pd.concat([df_if_fft,search_IF_FFT(params=params,freq_count=f,train_sample=s,verbose=True)],ignore_index=True)

In [None]:
# IsolationForest/FFT runs with recall of at least 0.95, highest precision first
df_if_fft.loc[df_if_fft['recall'] >= 0.95].sort_values(by='precision',ascending=False)

In [None]:
%%time
## testing if_hist
params = {'contamination': ['auto',0.06,0.07,0.08,0.09,0.1],
        'max_features': [12],
        'n_estimators': [100,200,300,400,500]}
df_if_hist = search_IF_hist(params=params,train_sample=100)
for s in [1000,10000,20000,40000,80000,'full']:
    #print("Working on {}".format(s))
    df_if_hist = pd.concat([df_if_hist,search_IF_hist(params=params,train_sample=s)],ignore_index=True)

In [None]:
# IsolationForest/histogram runs with recall of at least 0.95, highest precision first
df_if_hist.loc[df_if_hist['recall'] >= 0.95].sort_values(by='precision',ascending=False)

In [None]:
%%time
## testing lof_fft
params = {'n_neighbors': [1000,1500],
          'algorithm': ['auto'],
          'leaf_size': [30],
          'metric': ['minkowski'],
          'p': [2,3,4],
          'contamination': ['auto',0.07]}
df_lof_fft = search_LOF_FFT(params=params,freq_count=1,train_sample=100)
for f in [2,4,8]:
    for s in [10000,20000,40000]:
        #print("Working on f={}, s={}".format(f,s))
        df_lof_fft = pd.concat([df_lof_fft,search_LOF_FFT(params=params,freq_count=f,train_sample=s,verbose=True)],ignore_index=True)

In [None]:
# LocalOutlierFactor/FFT runs with recall of at least 0.95, highest precision first
df_lof_fft.loc[df_lof_fft['recall'] >= 0.95].sort_values(by='precision',ascending=False)

In [None]:
%%time
## testing lof_hist
params = {'n_neighbors': [20,50,75],
          'algorithm': ['auto'],
          'leaf_size': [30],
          'metric': ['minkowski'],
          'p': [4,5,6,7,8],
          'contamination': ['auto',0.07,0.08]}
df_lof_hist = search_LOF_hist(params=params,train_sample=100)
for s in [100]:
    #print("Working on s={}".format(s))
    df_lof_hist = pd.concat([df_lof_hist,search_LOF_hist(params=params,train_sample=s,verbose=True)],ignore_index=True)

In [None]:
# LocalOutlierFactor/histogram runs with recall of at least 0.87, highest precision first
df_lof_hist.loc[df_lof_hist['recall'] >= 0.87].sort_values(by='precision',ascending=False)

In [None]:
%%time
## testing ocsvm_fft
params = {'kernel': ['rbf','poly'],
          'degree': [3],
          'gamma': ['scale'],
          'coef0': [0.0,0.5],
          'tol': [1e-3],
          'nu': [0.02,0.04,0.06,0.08,0.1,0.12,0.14,0.16,0.18,0.2,0.22,0.24,0.26,0.28,0.3,0.4,0.5,0.7,0.9],
          'shrinking': [True]}
df_ocsvm_fft = search_OCSVM_FFT(params=params,freq_count=1,train_sample=1000)
for f in [2,4,8,12]:
    for s in [5000,10000,20000,40000]:
        #print("Working on f={}, s={}".format(f,s))
        df_ocsvm_fft = pd.concat([df_ocsvm_fft,search_OCSVM_FFT(params=params,freq_count=f,train_sample=s,verbose=True)],ignore_index=True)

In [None]:
# OneClassSVM/FFT runs with recall of at least 0.95, highest precision first
df_ocsvm_fft.loc[df_ocsvm_fft['recall'] >= 0.95].sort_values(by='precision',ascending=False)

In [None]:
%%time
## testing ocsvm_hist
params = {'kernel': ['poly'],
          'degree': [2,4,5,6,7],
          'gamma': ['scale'],
          'coef0': [0.0,0.5,1.0],
          'tol': [1e-3],
          'nu': [0.16,0.18,0.2,0.22],
          'shrinking': [True]}

df_ocsvm_hist = search_OCSVM_hist(params=params,train_sample=1000)
for s in [20000]:
    print("Working on s={}".format(s))
    df_ocsvm_hist = pd.concat([df_ocsvm_hist,search_OCSVM_hist(params=params,train_sample=s,verbose=True)],ignore_index=True)

In [None]:
# OneClassSVM/histogram runs with recall of at least 0.95, highest precision first
df_ocsvm_hist.loc[df_ocsvm_hist['recall'] >= 0.95].sort_values(by='precision',ascending=False)