# The Data Analysis Bureau Exercise

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.stattools import adfuller, kpss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, r2_score

## Data import and inspection

In [8]:
# Open JSON file and assign to dict
f = open('C:/Users/samue/Documents/DS 2021 Exercise/data_dict.json')
data_dict = json.load(f)
f.close()

In [9]:
# Print line-by-line to fit everything
for key in data_dict.keys():
    print(data_dict[key])

{'name': 'CurrentSpeed', 'units': 'knots', 'range': None, 'description': ' '}
{'name': 'CurrentDir', 'units': 'degrees', 'range': None, 'description': ' '}
{'name': 'TWS', 'units': 'knots', 'range': None, 'description': 'True Wind Speed'}
{'name': 'TWA', 'units': 'degrees', 'range': None, 'description': 'True Wind Angle'}
{'name': 'AWS', 'units': 'knots', 'range': None, 'description': 'Apparent Wind Speed'}
{'name': 'AWA', 'units': 'degrees', 'range': None, 'description': 'Apparent Wind Angle'}
{'name': 'Roll', 'units': 'degrees', 'range': None, 'description': 'Roll, also equals to -Heel'}
{'name': 'Pitch', 'units': 'degrees', 'range': None, 'description': 'Pitch angle'}
{'name': 'HeadingMag', 'units': 'degrees', 'range': None, 'description': 'magnetic heading'}
{'name': 'HoG', 'units': 'degrees', 'range': None, 'description': 'heading over ground'}
{'name': 'HeadingTrue', 'units': 'degrees', 'range': None, 'description': 'true heading. True heading - heading over ground = Yaw'}
{'name

In [2]:
# Import data and inspect top
data = pd.read_csv('C:/Users/samue/Documents/test_data.csv')
data.head()

Unnamed: 0,CurrentSpeed,CurrentDir,TWS,TWA,AWS,AWA,Roll,Pitch,HeadingMag,HoG,...,VMG,RudderAng,Leeway,TWD,WSoG,VoltageDrawn,ModePilote,DateTime,Yaw,Tacking
0,0.0756,123.0,10.8,48.0,10.4,48.0,-3.54,9.08,24.0,308.0,...,0.0594,4.666667,0.0,356.0,10.5,11.8,5.0,2019-04-14 00:00:00.000,-299.0,1.0
1,0.0756,123.0,10.8,48.0,10.4,48.0,-3.54,9.08,24.0,308.0,...,0.0594,4.666667,0.0,356.0,10.5,11.8,5.0,2019-04-14 00:00:01.000,-299.0,1.0
2,0.0756,123.0,10.8,48.0,10.4,48.0,-3.52,9.099999,24.0,308.0,...,0.0594,4.666667,0.0,356.0,9.9,11.8,5.0,2019-04-14 00:00:02.000,-299.0,1.0
3,0.0756,123.0,10.8,48.0,10.4,48.0,-3.52,9.099999,24.0,308.0,...,0.0594,4.666667,0.0,356.0,9.9,11.8,5.0,2019-04-14 00:00:03.000,-299.0,1.0
4,0.0756,123.0,10.8,48.0,10.4,48.0,-3.5,9.099999,24.0,308.0,...,0.0594,4.666667,0.0,356.0,10.3,11.8,5.0,2019-04-14 00:00:04.000,-299.0,1.0


In [3]:
# Convert date to DT
data['DateTime'] = pd.to_datetime(data['DateTime'])

In [11]:
# Check column types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220000 entries, 0 to 219999
Data columns (total 27 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   CurrentSpeed  219833 non-null  float64       
 1   CurrentDir    219832 non-null  float64       
 2   TWS           219837 non-null  float64       
 3   TWA           219833 non-null  float64       
 4   AWS           219834 non-null  float64       
 5   AWA           219838 non-null  float64       
 6   Roll          219833 non-null  float64       
 7   Pitch         219836 non-null  float64       
 8   HeadingMag    219835 non-null  float64       
 9   HoG           219838 non-null  float64       
 10  HeadingTrue   219837 non-null  float64       
 11  AirTemp       219840 non-null  float64       
 12  Longitude     219836 non-null  float64       
 13  Latitude      219840 non-null  float64       
 14  SoG           219842 non-null  float64       
 15  SoS           219

In [20]:
# Describe numerical data
data.iloc[:,12:].describe()

Unnamed: 0,Longitude,Latitude,SoG,SoS,AvgSoS,VMG,RudderAng,Leeway,TWD,WSoG,VoltageDrawn,ModePilote,Yaw,Tacking
count,219836.0,219840.0,219842.0,219840.0,219838.0,219837.0,219838.0,219839.0,219838.0,219836.0,219839.0,219839.0,219834.0,219995.0
mean,-60.675999,16.805625,7.658572,7.607856,6.382535,4.109238,2.025093,-1.226548,83.323893,14.746686,12.417475,2.422614,5.595763,0.209273
std,0.982475,3.929849,3.075285,3.084592,2.903147,2.066419,4.963518,0.793367,53.810225,4.29134,0.570748,1.043669,140.5214,0.406791
min,-61.816873,11.971172,0.0054,0.0,0.0702,0.0,-37.333336,-10.0,0.0,0.0,11.1,2.0,-359.0,0.0
25%,-61.639917,12.913855,7.6842,7.5978,5.6376,3.1482,-0.666667,-2.0,61.0,11.8,12.1,2.0,-13.0,0.0
50%,-61.199546,15.232683,8.7264,8.6994,7.5816,4.5792,2.333333,-1.0,70.0,14.8,12.3,2.0,-6.0,0.0
75%,-59.73539,20.902214,9.369,9.2988,8.461801,5.5836,4.666667,-1.0,82.0,17.6,12.5,2.0,-1.0,0.0
max,-59.279375,22.209945,12.598201,12.7008,8.532001,9.8604,47.0,9.0,359.0,35.700001,14.2,5.0,359.0,1.0


In [22]:
for col in data.columns:
    print(f'{col}: {data[col].isna().sum()}')

CurrentSpeed: 0
CurrentDir: 0
TWS: 0
TWA: 0
AWS: 0
AWA: 0
Roll: 0
Pitch: 0
HeadingMag: 0
HoG: 0
HeadingTrue: 0
AirTemp: 0
Longitude: 0
Latitude: 0
SoG: 0
SoS: 0
AvgSoS: 0
VMG: 0
RudderAng: 0
Leeway: 0
TWD: 0
WSoG: 0
VoltageDrawn: 0
ModePilote: 0
DateTime: 0
Yaw: 0
Tacking: 0


In [4]:
# Check which rows do not have a timestep of 1
# All are NAs except for 200,000
data.index[data['DateTime'] - data['DateTime'].shift(1) != pd.to_timedelta(1, unit='s')]

Int64Index([     0,  39959,  39960,  81738,  81739,  82751,  82752,  91468,
             91469, 121548, 121549, 200000],
           dtype='int64')

In [None]:
# Seems like 200,000 is repeat as time goes "back"
# Previous same time is 180,000
# First we check how many rows are duplicates
# We get 20,000 (suspiciously this is 2x20,000)
data[data.duplicated(keep=False)].shape

In [55]:
# Index starts at 180,000 and ends at the end of the DF
# It seems very likely that the last 20k rows are duplicates
# We can also double check by printing some rows and checking to see if they'er the same
data[data.duplicated(keep=False)].index

Int64Index([180000, 180001, 180002, 180003, 180004, 180005, 180006, 180007,
            180008, 180009,
            ...
            219990, 219991, 219992, 219993, 219994, 219995, 219996, 219997,
            219998, 219999],
           dtype='int64', length=40000)

In [5]:
# We remove the last 20k rows
data = data.iloc[:200000,:]

In [75]:
# Get some information about the averages for ModePilote to understand the variable
data.groupby('ModePilote').mean()

Unnamed: 0_level_0,CurrentSpeed,CurrentDir,TWS,TWA,AWS,AWA,Roll,Pitch,HeadingMag,HoG,...,SoS,AvgSoS,VMG,RudderAng,Leeway,TWD,WSoG,VoltageDrawn,Yaw,Tacking
ModePilote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.0,1.204505,123.759791,15.615349,56.236411,21.418909,35.568044,-16.897729,5.491981,31.696284,52.00476,...,8.775041,7.163739,4.798522,1.478423,-1.427548,69.554729,15.619036,12.472722,27.994191,0.102289
5.0,0.290463,201.066534,11.879056,62.804915,12.421379,61.262327,-3.964755,8.615092,42.166673,168.22135,...,0.899513,0.740219,0.486938,4.416704,-0.17411,161.285544,11.840941,12.173335,-134.876118,0.928348


In [6]:
# Change ModePilote to 0 and 1
data.loc[data['ModePilote'] == 5, 'ModePilote'] = 1
data.loc[data['ModePilote'] == 2, 'ModePilote'] = 0

## Cleaning data

Interpolate NaN in DT as we know it samples once per second. So if DT is NaN at index 1, we can insert the value as DT at index 0 plus one second.

In [7]:
# Get index of missing in DT
inds = pd.isnull(data['DateTime']).to_numpy().nonzero()[0]
inds

array([ 39959,  81738,  82751,  91468, 121548], dtype=int64)

In [8]:
# For each missing value, we add the previous timestamp plus 1
# Note that we run chronologically down DF
# Thus, if there are multiple NaN in a row (not the case), this would be fine
for item in inds:
    data.loc[item, 'DateTime'] = data.loc[item-1, 'DateTime'] + pd.to_timedelta(1, unit='s')

Other missing values could be interpolated, as we might assume that things such as direction and speed might not change drastically from one time point to another. This could be done by taking an average of the n previous and following values and setting it in place of a missing value. This would exclude the Tacking and ModePilote columns as they seem categorical.

In [9]:
# Interpolating for all other numerical variables
# Input mean of previous 5 and following 5 values
for col in [item for item in data.columns if item not in ['ModePilote', 'Tacking']]:
    inds = pd.isnull(data[col]).to_numpy().nonzero()[0]
    for i in inds:
           data.loc[i, col] = data.loc[(i-5):(i+5), col].mean()

# For categorical variables, we take only the previous value
# Trade-off between labels being correct and complete dataset
for col in ['ModePilote', 'Tacking']:
    inds = pd.isnull(data[col]).to_numpy().nonzero()[0]
    for i in inds:
           data.loc[i, col] = data.loc[i-1, col].mean()

In [10]:
# Create time-to-tack variable
# Get indices for starting and ending tacking
# make new variable with NaNs (all times when we tack should be nan)
start_tack = data.index[(data['Tacking'] != data['Tacking'].shift(1)) & (data['Tacking'] == 1)]
end_tack = data.index[(data['Tacking'] != data['Tacking'].shift(1)) & (data['Tacking'] == 0)]
data['time_to_tack'] = np.nan

# Add countdown in seconds until time of tacking for all non-tacking periods
for i, item in enumerate(end_tack[:-1]):
    data.loc[item:(start_tack[i+1]-1), 'time_to_tack'] = range(start_tack[i+1]-item,0,-1)

In [11]:
# Scale numeric features
numeric_cols = [item for item in data.columns if item not in ['ModePilote', 'Tacking', 'time_to_tack', 'DateTime']]
X_numeric = data[numeric_cols]

In [17]:
# Check if data is stationary

# Augmented Dickey-Fuller Test (ADF Test)/unit root test
def adf_test(ts, signif=0.05):
    dftest = adfuller(ts, autolag='AIC')
    adf = pd.Series(dftest[0:4], index=['Test Statistic','p-value','# Lags','# Observations'])
    for key,value in dftest[4].items():
       adf['Critical Value (%s)'%key] = value
    
    p = adf['p-value']
    if p > signif:
        print(f'Series is Non-Stationary')

# Look at all numerical features
for col in X_numeric.columns:
    print(col)
    adf_test(X_numeric[col])

CurrentSpeed
CurrentDir
TWS
TWA
AWS
AWA
Roll
Pitch
HeadingMag
HoG
HeadingTrue
AirTemp
Longitude
Series is Non-Stationary
Latitude
Series is Non-Stationary
SoG
SoS
AvgSoS
VMG
RudderAng
Leeway
TWD
WSoG
VoltageDrawn
Yaw


In [18]:
# KPSS
def kpss_test(ts):
    kpsstest = kpss(ts, regression='c', lags='auto')
    kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic','p-value','Lags Used'])
    if kpss_output['p-value'] > .05:
        print('Stationary')

# Look at all numerical features
for col in X_numeric.columns:
    print(col)
    kpss_test(X_numeric[col])

CurrentSpeed
CurrentDir
TWS
TWA
AWS
AWA
Roll
Pitch
HeadingMag
HoG
HeadingTrue
AirTemp
Longitude
Latitude
SoG
SoS
AvgSoS
VMG
RudderAng
Leeway
TWD
WSoG
VoltageDrawn
Yaw


  kpsstest = kpss(ts, regression='c', lags='auto')
look-up table. The actual p-value is smaller than the p-value returned.

  Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
look-up table. The actual p-value is smaller than the p-value returned.

  Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
look-up table. The actual p-value is smaller than the p-value returned.

  Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
look-up table. The actual p-value is smaller than the p-value returned.

  Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
look-up table. The actual p-value is smaller than the p-value returned.

  Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
look-up table. The actual p-value is smaller than the p-value returned.

  Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
look-up table. The actual p-value is smaller than the p-value retur

#### RESULTS stationarity
- ADF non-stationary: longitude, latitude
- KPSS non-stationary: All variables
- Longitude & latitude non-stationary
- All others difference stationary

In [12]:
# Difference all numeric variables
for col in X_numeric.columns:
    X_numeric[col] = X_numeric[col] - X_numeric[col].shift(1)

X_numeric = X_numeric.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_numeric[col] = X_numeric[col] - X_numeric[col].shift(1)


In [43]:
# Scale numeric features
scaled_features = StandardScaler().fit_transform(X_numeric.values)
scaled_df = pd.DataFrame(scaled_features, columns=X_numeric.columns)

In [None]:
# Get correlation matrix
corrMatrix = scaled_df.corr()

In [None]:
# Make nice and colorful plot of matrix
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [44]:
# drop certain columns due to multicolinearity

scaled_df = scaled_df.drop(['HoG', 'HeadingTrue', 'AvgSoS'], axis=1)

In [45]:
# Tested 3 set-ups before PCA
# With and without HoG, HT and AvgSoS with 95% var explained
# Without above and 90% var

# Dimensionality reduction
array_x_pca = np.array(scaled_df)

# Create the PCA instance
pca = PCA(n_components = 0.9)

# Fit on data
pca.fit(array_x_pca)

# Access values and vectors
print(pca.explained_variance_)

# Transform data
smol_scaled = pca.transform(array_x_pca)

[3.65199545 2.68989748 2.0187239  1.69324012 1.26033787 1.06824996
 1.00331091 0.99785961 0.98544124 0.9828819  0.97069282 0.9218892
 0.74233353]


In [46]:
smol_scaled2 = pd.DataFrame(smol_scaled)

In [47]:
# Combined numeric and ModePilote
cat_df = data.loc[1:, ['ModePilote', 'Tacking', 'DateTime']]
cat_df.index = range(199999)
full_df = pd.concat([smol_scaled2, cat_df], axis=1, ignore_index=True)
full_df.set_index(15, inplace=True)
full_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
15,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-04-14 00:00:01,0.000206,9.7e-05,-0.008418,-0.000361,2e-05,0.000132,2.6e-05,0.000116,0.000435,0.000281,-7.2e-05,-4.2e-05,5.6e-05,1.0,1.0
2019-04-14 00:00:02,0.043406,-0.022752,-0.007706,-0.023952,0.073131,-0.042774,-0.011494,-0.016449,-0.015278,0.01466,0.025648,0.024849,0.015241,1.0,1.0
2019-04-14 00:00:03,0.000206,9.7e-05,-0.008418,-0.000361,2e-05,0.000132,2.6e-05,0.000116,0.000435,0.000281,-7.2e-05,-4.2e-05,5.6e-05,1.0,1.0
2019-04-14 00:00:04,-0.031286,0.007498,-0.009271,0.019884,-0.047706,0.027697,0.008281,0.011519,0.010601,-0.009124,-0.017419,-0.01757,-0.007667,1.0,1.0
2019-04-14 00:00:05,-0.003896,-0.011994,-0.010079,0.015523,-0.031099,0.00578,-0.220125,-0.162762,-0.073092,0.075696,0.047694,-0.183269,-0.00682,1.0,1.0


In [48]:
# Try with simple resampling to get more coarse data
# Make Tacking binary again (slightly problematic)
data_down = full_df.copy()
data_down = data_down.resample('1T').mean()
data_down[14] = [1 if item > 0.5 else 0 for item in data_down[14]]

In [49]:
# Pick out data, rewritten function from https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

def series_to_supervised(data, n_in=1, dropnan=True):
	"""
	Frame a time series as a supervised learning dataset.
	Arguments:
		data: Sequence of observations as a list or NumPy array.
		n_in: Number of lag observations as input (X).
		n_out: Number of observations as output (y).
		dropnan: Boolean whether or not to drop rows with NaN values.
	Returns:
		Pandas DataFrame of series framed for supervised learning.
	"""

	n_vars = data.shape[1]
	df = pd.DataFrame(data)
	target = df.iloc[:,-1]
	# df = df.iloc[:,:-1]
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, -1, -1):
		cols.append(df.shift(i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]

	# forecast sequence
	cols.append(target.shift(-30))
	names.append('Tacking')
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

In [50]:
supervised_df = series_to_supervised(data_down.values, n_in=10)
supervised_df.shape

(3294, 166)

In [55]:
# Remaking the DF without PCA to port into R for visualization

# Scale numeric features
scaled = StandardScaler().fit_transform(X_numeric.values)
scaled = pd.DataFrame(scaled, columns=X_numeric.columns)

scaled = pd.DataFrame(scaled)

# Combined numeric and ModePilote
cat_df = data.loc[1:, ['ModePilote', 'Tacking', 'DateTime']]
cat_df.index = range(199999)
full_df2 = pd.concat([scaled, cat_df], axis=1, ignore_index=True)
full_df2.set_index(-1, inplace=True)
full_df2.head()

full_df2.to_csv('C:/Users/samue/Documents/to_r_tdab_full.csv')

data_down2 = full_df2.copy()
data_down2 = data_down2.resample('1T').mean()
data_down2[-1] = [1 if item > 0.5 else 0 for item in data_down2[-1]]

supervised_df2 = series_to_supervised(data_down2.values, n_in=10)
supervised_df2.shape

supervised_df2.to_csv('C:/Users/samue/Documents/to_r_tdab_super.csv')

KeyError: 'None of [-1] are in the columns'

In [51]:
# Split into train and test sets
values = supervised_df.values
n_train = int(round(values.shape[0]*0.7, 0))
train = values[:n_train, :]
test = values[n_train:, :]

# Split into input and outputs
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

In [25]:
# Make CV function
from itertools import product

def custom_CV(X_train, y_train, X_test, y_test, parameters):

    param_list = list(product(*parameters.values()))

    best_roc = 0

    for i, row in enumerate(param_list):
        xgb_mod = xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss',
        eta = row[0], min_child_weight = row[1], gamma = row[2], subsample = row[3],
        colsample_bytree = row[4], max_depth = row[5], scale_pos_weight = row[6],
        reg_alpha = row[7], reg_lambda = row[8])

        xgb_mod.fit(X_train, y_train)
            
        y_pred = [1 if item[1] > 0.5 else 0 for item in xgb_mod.predict_proba(X_test)]

        if roc_auc_score(y_test, y_pred) > best_roc:
            best_roc = roc_auc_score(y_test, y_pred)
            best_params = xgb_mod.get_params

        if i % 1000 == 0:
            print(f'Current iteration is: {i}. Best roc_auc is {best_roc}')
    
    return best_roc, best_params



In [54]:
dist = {
        'eta': [0.1, 0.2, 0.3],
        'min_child_weight': [5, 10],
        'gamma': [0, 1.0, 10],
        'subsample': np.arange(0.5, 1, 0.1),
        'colsample_bytree': np.arange(0.5, 1, 0.1),
        'max_depth': np.arange(3, 10, 2),
        'scale_pos_weight': [5, 10],
        'reg_alpha': [1, 10.0, 100.0],
        'reg_lambda': [1, 10.0, 100.0]
        }

best_roc, best_params = custom_CV(X_train, y_train, X_test, y_test, dist)

Current iteration is: 0. Best roc_auc is 0.7465062111801242
Current iteration is: 1000. Best roc_auc is 0.821185947204969
Current iteration is: 2000. Best roc_auc is 0.821185947204969
Current iteration is: 3000. Best roc_auc is 0.821185947204969
Current iteration is: 4000. Best roc_auc is 0.821185947204969
Current iteration is: 5000. Best roc_auc is 0.821185947204969
Current iteration is: 6000. Best roc_auc is 0.8243885869565217
Current iteration is: 7000. Best roc_auc is 0.8243885869565217
Current iteration is: 8000. Best roc_auc is 0.8243885869565217
Current iteration is: 9000. Best roc_auc is 0.8243885869565217
Current iteration is: 10000. Best roc_auc is 0.8245341614906833
Current iteration is: 11000. Best roc_auc is 0.8245341614906833
Current iteration is: 12000. Best roc_auc is 0.8408385093167703
Current iteration is: 13000. Best roc_auc is 0.8408385093167703
Current iteration is: 14000. Best roc_auc is 0.8408385093167703
Current iteration is: 15000. Best roc_auc is 0.84083850931

In [70]:
best_params

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.3, eval_metric='logloss',
              gamma=10, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=5, min_child_weight=5, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=10.0, reg_lambda=1, scale_pos_weight=10,
              subsample=0.8999999999999999, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)>

In [40]:
xgb_mod = xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss',
        eta = 0.3, min_child_weight = 5, gamma = 10, subsample = 0.9,
        colsample_bytree = 0.7, max_depth = 5, scale_pos_weight = 10,
        reg_alpha = 10, reg_lambda = 1)

xgb_mod.fit(X_train, y_train)

y_pred = [1 if item[1] > 0.53 else 0 for item in xgb_mod.predict_proba(X_test)]

print(classification_report(y_test, y_pred))

print(roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.98      0.94      1795
         1.0       0.03      0.01      0.01       184

    accuracy                           0.89      1979
   macro avg       0.47      0.49      0.48      1979
weighted avg       0.82      0.89      0.86      1979

0.4935251907472447


In [None]:
# 30 min forward, no back lag, by 30 min
# roc_auc = .833

xgb_mod = xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss',
        eta = 0.1, min_child_weight = 10, gamma = 0, subsample = 0.5,
        colsample_bytree = 0.8, max_depth = 3, scale_pos_weight = 5,
        reg_alpha = 1, reg_lambda = 10)

xgb_mod.fit(X_train, y_train)

y_pred = [1 if item[1] > 0.5 else 0 for item in xgb_mod.predict_proba(X_test)]

print(classification_report(y_test, y_pred))

print(roc_auc_score(y_test, y_pred))

In [27]:
# 10 min back, 30 min forward, by 5 min

xgb_mod = xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss',
        eta = 0.3, min_child_weight = 5, gamma = 10, subsample = 0.9,
        colsample_bytree = 0.7, max_depth = 5, scale_pos_weight = 10,
        reg_alpha = 10, reg_lambda = 1)

xgb_mod.fit(X_train, y_train)

y_pred = [1 if item[1] > 0.5 else 0 for item in xgb_mod.predict_proba(X_test)]

print(classification_report(y_test, y_pred))

print(roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.94      0.96       180
         1.0       0.55      0.67      0.60        18

    accuracy                           0.92       198
   macro avg       0.76      0.81      0.78       198
weighted avg       0.93      0.92      0.92       198

0.8055555555555555


In [None]:
# 10 min back, 30 min forward, by 2 min

In [41]:
# 5 min back, 30 min forward, by 1 min
# roc_auc = 0.884695

xgb_mod = xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss',
        eta = 0.2, min_child_weight = 10, gamma = 10, subsample = 0.7,
        colsample_bytree = 0.5, max_depth = 3, scale_pos_weight = 10,
        reg_alpha = 10, reg_lambda = 100)

xgb_mod.fit(X_train, y_train)

y_pred = [1 if item[1] > 0.34 else 0 for item in xgb_mod.predict_proba(X_test)]

print(classification_report(y_test, y_pred))

print(roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.86      0.91      1795
         1.0       0.35      0.74      0.48       184

    accuracy                           0.85      1979
   macro avg       0.66      0.80      0.70      1979
weighted avg       0.91      0.85      0.87      1979

0.8004844374470147


In [53]:
# 10 min back, 30 min forward, by 1 min
# roc_auc = .876868

xgb_mod = xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss',
        eta = 0.2, min_child_weight = 5, gamma = 10, subsample = 0.8,
        colsample_bytree = 0.9, max_depth = 3, scale_pos_weight = 10,
        reg_alpha = 10, reg_lambda = 100)

xgb_mod.fit(X_train, y_train)

y_pred = [1 if item[1] > 0.5 else 0 for item in xgb_mod.predict_proba(X_test)]

print(classification_report(y_test, y_pred))

print(roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96       896
         1.0       0.59      0.68      0.64        92

    accuracy                           0.93       988
   macro avg       0.78      0.82      0.80       988
weighted avg       0.93      0.93      0.93       988

0.8183957686335405


In [None]:
# 10 min back, 30 min forward, by 30s

In [None]:
dist = {
        'eta': [0.01, 0.05, 0.1, 0.2, 0.3],
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.1, 0.5, 1.0, 5, 10],
        'subsample': np.arange(0.5, 1, 0.1),
        'colsample_bytree': np.arange(0.5, 1, 0.1),
        'max_depth': np.arange(3, 10, 2),
        'scale_pos_weight': [1, 3, 5, 10],
        'reg_alpha': [0, 1.0, 10.0, 100.0],
        'reg_lambda': [0, 1.0, 10.0, 100.0]
        }

## Predicting time to tack

In [82]:
cat_df = data.loc[1:, ['ModePilote', 'time_to_tack']]
full_df = pd.concat([scaled_df, cat_df], axis=1, ignore_index=True)
full_df = full_df.dropna()

In [83]:
# Split into train and test sets
values = full_df.values
n_train_hours = int(round(values.shape[0]*0.7, 0))
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]

# Split into input and outputs
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

In [90]:
xgb_mod = xgb.XGBRegressor(random_state=47, eta=0.2, min_child_weight=5, gamma=0.1,
subsample=0.8, colsample_bytree=0.8, max_depth=8, reg_alpha=1, reg_lambda=1)

xgb_mod.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             eta=0.2, gamma=0.1, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.200000003,
             max_delta_step=0, max_depth=8, min_child_weight=5, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=47,
             reg_alpha=1, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [91]:
y_pred = xgb_mod.predict(X_test)
print(r2_score(y_test, y_pred))

-2.194805435586546
