In [1]:
import random
import warnings
from copy import deepcopy

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from scipy.ndimage.interpolation import shift
from scipy.signal import argrelextrema
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.tree import ExtraTreeRegressor

In [2]:
# Select matplotlib's interactive "notebook" backend for figures drawn in this notebook.
%matplotlib notebook

In [63]:
# Load the training and test tables from CSV files in the working directory.
# NOTE(review): later cells slice columns by position (iloc[:, 3:9] and iloc[:, 9:]),
# so the column order of these files matters -- confirm the layout before swapping inputs.
train = pd.read_csv('train_idao.csv')
test = pd.read_csv('test.csv')

## Preprocessing

In [1]:
def lags(sat_data):
    """Add difference ("lag") features to the rows of one satellite.

    Intended to be called through ``DataFrame.groupby('sat_id').apply(lags)``:
    the satellite id is read from ``sat_data.name``, which pandas sets on each
    group it passes to ``apply``.

    Parameters
    ----------
    sat_data : pandas.DataFrame
        Rows of a single satellite (assumed to be in chronological order).
        Must contain ``epoch``, ``x_sim``, ``y_sim``, ``z_sim``, ``Vx_sim``,
        ``Vy_sim``, ``Vz_sim``, ``velocity``, ``distance`` and ``drag_coeff``.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``sat_data`` itself with these columns added, in this order:
        ``epoch_delta1``; the ``*_delta1`` differences; ``distance_from_epoch``;
        ``drag_coeff_delta1``; the ``*_delta2`` differences; ``drag_coeff_delta2``.
        Downstream cells slice columns by position, so the order is part of the
        contract.

    Notes
    -----
    Reads the module-level ``train`` frame to find the satellite's first
    training observation; raises ``IndexError`` if ``train`` has no row for
    this satellite.
    """
    diff_columns = ('x_sim', 'y_sim', 'z_sim',
                    'Vx_sim', 'Vy_sim', 'Vz_sim',
                    'velocity', 'distance')

    # Everything up to the loop does not depend on the lag size, so it is done once.
    sat_data['epoch'] = pd.to_datetime(sat_data['epoch'])

    # NOTE(review): ``.dt.seconds`` is the seconds *component* of the timedelta
    # (whole seconds, days excluded), not ``total_seconds()``. Kept as-is because
    # later cells test ``epoch_delta1 < 1`` against exactly these values.
    sat_data['epoch_delta1'] = sat_data['epoch'].diff(periods=1).dt.seconds

    first_obs = train[train['sat_id'] == sat_data.name].iloc[0, :]
    distance_from_epoch = np.sqrt((sat_data['x_sim'] - first_obs['x_sim']) ** 2 +
                                  (sat_data['y_sim'] - first_obs['y_sim']) ** 2 +
                                  (sat_data['z_sim'] - first_obs['z_sim']) ** 2)

    for period in (1, 2):
        for column in diff_columns:
            sat_data['{}_delta{}'.format(column, period)] = sat_data[column].diff(periods=period)

        # Assigned inside the loop (not before it) so the column is created right
        # after the first batch of deltas; the second pass just overwrites it.
        sat_data['distance_from_epoch'] = distance_from_epoch
        sat_data['drag_coeff_delta{}'.format(period)] = sat_data['drag_coeff'].diff(periods=period)

    return sat_data

def dataframe_fix(data, is_test=False):
    """Shift feature rows down by one at every flagged timestamp, per satellite.

    For each satellite, every row (other than its first) whose ``epoch_delta1``
    is below 1 marks a position where the feature block is shifted down by one
    row: a gap is opened at that position, the satellite's last row falls off
    the end, and the gap is later filled with the previous row's values by a
    forward fill.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``sat_id`` and ``epoch_delta1`` columns whose rows are grouped
        by satellite (the result is written back by position). The feature
        columns are overwritten in place.
    is_test : bool, default False
        Selects the column layout: features start at column 3 when True and at
        column 9 when False.

    Returns
    -------
    pandas.DataFrame
        A forward-filled copy of ``data`` with the shifted feature columns.

    Notes
    -----
    The shift is done with plain slicing, i.e. ``out[k] = in[k - 1]`` with a NaN
    row at the gap. That is the same row mapping as
    ``scipy.ndimage.shift(block, [1, 0], cval=nan)`` but without the spline
    interpolation, so values are moved exactly.
    """
    first_feature_col = 3 if is_test else 9

    shifted_blocks = []
    for sat in data['sat_id'].unique():
        sat_data = data[data['sat_id'] == sat].reset_index(drop=True)
        block = np.asarray(sat_data.iloc[:, first_feature_col:], dtype=float)

        # Positions are taken from the untouched satellite frame, as before.
        flagged_positions = sat_data.index[sat_data['epoch_delta1'] < 1]
        for pos in flagged_positions:
            # A satellite's first row has no predecessor, so it never marks a shift.
            if pos == 0:
                continue
            gap = np.full((1, block.shape[1]), np.nan)
            block = np.concatenate((block[:pos], gap, block[pos:-1]), axis=0)

        shifted_blocks.append(block)

    # Nothing to write back for an empty frame.
    if shifted_blocks:
        data.iloc[:, first_feature_col:] = np.concatenate(shifted_blocks, axis=0)

    # The forward fill closes the NaN gaps opened above.
    return data.ffill()

def features(data):
    """Add per-row kinematic features and per-satellite lag features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``sat_id``, ``epoch`` and the simulated state columns
        ``x_sim``, ``y_sim``, ``z_sim``, ``Vx_sim``, ``Vy_sim``, ``Vz_sim``.
        The ``epoch``, ``velocity``, ``distance`` and ``drag_coeff`` columns are
        written into this frame in place.

    Returns
    -------
    pandas.DataFrame
        The frame returned by applying ``lags`` to each ``sat_id`` group, with
        every remaining NaN replaced by 0 and the original row index kept.
    """
    data['epoch'] = pd.to_datetime(data['epoch'])
    data['velocity'] = np.sqrt(data['Vx_sim'] ** 2 + data['Vy_sim'] ** 2 + data['Vz_sim'] ** 2)
    data['distance'] = np.sqrt(data['x_sim'] ** 2 + data['y_sim'] ** 2 + data['z_sim'] ** 2)
    # Inverse squared speed; a zero velocity yields inf here.
    data['drag_coeff'] = 1 / (data['velocity'] ** 2)

    # group_keys=False keeps the original row index. Without it, pandas >= 2.0
    # prepends the group key and returns a (sat_id, row) MultiIndex, which breaks
    # callers that sort by index and split rows by position afterwards.
    data = data.groupby('sat_id', group_keys=False).apply(lags)

    # The first rows of each satellite have no predecessor to diff against.
    return data.fillna(0)

In [65]:
%%time

data = deepcopy(train)
targets = [i for i in train.columns if i not in test.columns]

data = data[test.columns].append(test, ignore_index =True)
data = pd.concat([data, train[targets]], axis=1)
data = data.reindex(train.columns, axis=1).sort_values(by=['sat_id','id'])

data = features(data)
data = dataframe_fix(data)
data = data.sort_index()

train = data.iloc[:train.shape[0], :]
test = data.iloc[train.shape[0]:, :]

Wall time: 1min 9s


In [66]:
# Reference instant from which `sec_from_epoch` is measured.
REFERENCE_EPOCH = pd.to_datetime('2014-01-01T00:00:00.000')


def add_time_features(frame):
    """Make ``epoch_delta1`` constant per satellite and add ``sec_from_epoch``.

    ``epoch_delta1`` is replaced, for every row of a satellite, by the last
    distinct value (in order of first appearance) that the column takes for that
    satellite. ``sec_from_epoch`` is the number of seconds between ``epoch`` and
    ``REFERENCE_EPOCH``.

    The frame is modified in place and also returned. Values are aligned on the
    row index, so the rows need not be ordered by satellite and no satellite
    count is assumed.
    """
    frame['epoch_delta1'] = (
        frame.groupby('sat_id')['epoch_delta1'].transform(lambda deltas: deltas.unique()[-1])
    )
    frame['sec_from_epoch'] = (frame['epoch'] - REFERENCE_EPOCH).dt.total_seconds()
    return frame


train = add_time_features(train)
test = add_time_features(test)

In [67]:
# Number each training observation within its satellite's repeating cycle.
# The cycle length ("period") is the position of the first strict local minimum
# of `distance_from_epoch`; satellites without such a minimum are numbered 0, 1, 2, ...
numeration_parts = []

for sat in train['sat_id'].unique():
    train_subset = train[train['sat_id'] == sat]

    # argrelextrema returns an empty index array when there is no local minimum.
    minima = argrelextrema(np.array(train_subset['distance_from_epoch']), np.less)[0]
    period = minima[0] if len(minima) else None

    itog = []
    counter = 0
    for delta in train_subset['epoch_delta1']:
        if period is not None and counter == period:
            # End of a cycle: start counting again.
            counter = 0

        if period is not None and delta < 1 and itog:
            # A (near-)zero time step repeats the previous number.
            itog.append(itog[-1])
        else:
            itog.append(counter)
            counter += 1

    # Keep the row index so the assignment below aligns by label, not by position.
    numeration_parts.append(pd.Series(itog, index=train_subset.index))

train['numeration'] = pd.concat(numeration_parts)

In [68]:
# Continue each satellite's cycle numbering from its last training row into the test rows.
test_sat = test['sat_id'].unique()
test['epoch_delta1'] = test['epoch_delta1'].fillna(0)
numeration_parts = []

for sat in test_sat:
    train_subset = train[train['sat_id'] == sat]
    # Raises IndexError if the satellite has no training rows.
    last_n_train = train_subset['numeration'].iloc[-1]

    # Same cycle length as for the training rows: position of the first strict
    # local minimum of `distance_from_epoch`, or None when there is none.
    minima = argrelextrema(np.array(train_subset['distance_from_epoch']), np.less)[0]
    period = minima[0] if len(minima) else None

    test_subset = test[test['sat_id'] == sat]
    itog = []
    counter = last_n_train + 1
    for delta in test_subset['epoch_delta1']:
        if period is not None and counter == period:
            # End of a cycle: start counting again.
            counter = 0

        if period is not None and delta < 1 and itog:
            # A (near-)zero time step repeats the previous number.
            itog.append(itog[-1])
        else:
            itog.append(counter)
            counter += 1

    # Keep the row index so the assignment below aligns by label, not by position.
    numeration_parts.append(pd.Series(itog, index=test_subset.index))

test['numeration'] = pd.concat(numeration_parts)

## Metrics

In [52]:
def smape(satellite_predicted_values, satellite_true_values):
    """Mean of ``|pred - true| / (|pred| + |true|)`` over the inputs.

    Inputs must support elementwise arithmetic (NumPy arrays, pandas objects).
    An element where both values are zero contributes NaN (0 / 0).
    """
    absolute_error = np.abs(satellite_predicted_values - satellite_true_values)
    magnitude_sum = np.abs(satellite_predicted_values) + np.abs(satellite_true_values)
    return np.mean(absolute_error / magnitude_sum)

def score_idao(satellite_predicted_values, satellite_true_values):
    """Return ``1 - smape(predicted, true)``, so a higher value means a smaller error."""
    error = smape(satellite_predicted_values, satellite_true_values)
    return 1 - error

# Custom metric
def custom_metric(y_true, y_pred):
    """Mean of ``|y_pred - y_true| / (|y_pred| + |y_true|)``.

    Takes ``(y_true, y_pred)`` in that order. Inputs must support elementwise
    arithmetic; an element where both values are zero contributes NaN (0 / 0).
    """
    absolute_error = np.abs(y_pred - y_true)
    magnitude_sum = np.abs(y_pred) + np.abs(y_true)
    return np.mean(absolute_error / magnitude_sum)

# custom_metric is an error (0 is a perfect fit), so it must be registered as a loss:
# with greater_is_better=False the scorer sign-flips it and model selection minimises
# the error. The previous greater_is_better=True would have selected the worst model.
smape_metric = make_scorer(custom_metric, greater_is_better=False)

## Models

In [95]:
# Hold-out check per satellite: fit a linear model on the first half of its training
# rows, score it on the last half, and flag satellites whose mean error exceeds 0.5.
sats = test['sat_id'].unique()
results = []

for sat in sats:
    sat_rows = train.loc[train['sat_id'] == sat]
    X, y = sat_rows.iloc[:, 9:], sat_rows.iloc[:, 3:9]

    half = int(len(X) * (50 / 100))
    X_train, y_train = X.head(half), y.head(half)
    X_test, y_test = X.tail(half), y.tail(half)
    indexes = X_test.index

    model = LinearRegression().fit(np.array(X_train), y_train)
    holdout_error = custom_metric(y_test, model.predict(X_test))
    results.append(holdout_error)

results = pd.DataFrame(results)

# Row k of `results` belongs to sats[k], so the surviving row labels index into `sats`.
mean_error = results.mean(axis=1)
badsats = sats[mean_error[mean_error > 0.5].index]

## Submissions

In [43]:
# Seed NumPy's global random number generator so NumPy-based randomness below is repeatable.
np.random.seed(42)

### Baseline Model

In [61]:
%%time
sats = test['sat_id'].unique()
results = pd.DataFrame()

for sat in sats:
    X = train[train['sat_id'] == sat].iloc[:,9:]
    y = train[train['sat_id'] == sat].iloc[:, 3:9]
    
    X_test = test[test['sat_id'] == sat].iloc[:, 9:]
    
    model = LinearRegression()
    model.fit(X, y)
    
    predictions = model.predict(X_test)
    results = results.append(pd.DataFrame(predictions))
    
results = results.reset_index(drop = True)
results.columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
results['id'] = test['id'].values


results = results[['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']]
results.to_csv('submission.csv', index = False)

print('Successfully finished! Submit to get IDAO 90%+ score!')

Successfully finished! Submit to get IDAO 90%+ score!
Wall time: 8.6 s


### Final Model

In [96]:
%%time
results = pd.DataFrame()
cols = train.columns[3:9]

for sat in sats:
    if sat not in badsats:
        X = train[train['sat_id'] == sat].iloc[:,9:]
        y = train[train['sat_id'] == sat].iloc[:, 3:9]
        
        X_test = test[test['sat_id'] == sat].iloc[:, 9:]

        model = LinearRegression()
        model.fit(X, y)
        
        predictions = pd.DataFrame(model.predict(X_test))
        predictions.index = X_test.index
        results = results.append(predictions)
        
    else:
        sat_data = train[train['sat_id'] == sat]
        for i in test[test['sat_id'] == sat]['numeration'].unique():
            X = sat_data[(sat_data['numeration'] == i)].iloc[:,9:]
            Y = sat_data[(sat_data['numeration'] == i)].iloc[:, 3:9]
            
            X_test = test[test['sat_id'] == sat][test['numeration'] == i].iloc[:, 9:]

            model = ExtraTreesRegressor(random_state=42)
            model.fit(np.array(X), Y)
            
            predictions = pd.DataFrame(model.predict(X_test))
            predictions.index = X_test.index
            results = results.append(predictions)


results = results.sort_index()
results = results.reset_index(drop = True)
results.columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
results['id'] = test['id'].values


results = results[['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']]
results.to_csv('submission.csv', index = False)

print('Successfully finished! Submit to get IDAO 90%+ score!')

Successfully finished! Submit to get IDAO 90%+ score!
Wall time: 33.1 s
