In [None]:
import numpy as np 
import pandas as pd 
import os
from tqdm import tqdm
import random
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Number of feature columns; they are addressed by the names f_0 ... f_299.
n_features = 300
features = [f'f_{i}' for i in range(n_features)] 
# NOTE(review): read_pickle can execute arbitrary code stored in the file —
# only load pickles that come from a trusted source.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
print(train.shape)
# Last expression of the cell: rendered as a rich table of the first rows.
train.head()

In [None]:
# Print the column dtype summary and memory usage of the loaded frame.
train.info()

### **Indicators only by investment ID**

In [None]:
# Keep only the rows belonging to investment 1 and cast every column to float
# in a single chained step.
is_first_investment = train['investment_id'] == 1
investment_id_one = train.loc[is_first_investment].astype(float)
print(investment_id_one.shape)
investment_id_one.head()

In [None]:
# Line chart of feature f_0 against time_id for the single investment selected above.
fig = px.line(investment_id_one, x='time_id', y="f_0")
fig.show()

**Let's compute moving averages of f_0 (window sizes 2 to 7) for the same investment**

In [None]:
# Rolling means of f_0 for investment 1, one extra column per window size.
# .copy() makes this two-column frame independent of investment_id_one, so the
# column assignments below write to a real frame instead of a flagged slice
# (which would raise pandas' SettingWithCopyWarning).
f_0_scroll_mean = investment_id_one[['time_id', 'f_0']].copy()

# Descending order so that columns and legend entries appear as 7, 6, ..., 2.
for window in range(7, 1, -1):
    f_0_scroll_mean[f'f_0_scroll_mean_{window}'] = (
        f_0_scroll_mean['f_0'].rolling(window=window).mean()
    )

fig = go.Figure()

# Raw series as a plain line.
fig.add_trace(go.Scatter(x=f_0_scroll_mean['time_id'], y=f_0_scroll_mean['f_0'],
                    mode='lines',
                    name='f_0'))

# Each smoothed series is drawn with markers to tell it apart from the raw line.
for window in range(7, 1, -1):
    column = f'f_0_scroll_mean_{window}'
    fig.add_trace(go.Scatter(x=f_0_scroll_mean['time_id'], y=f_0_scroll_mean[column],
                        mode='lines+markers',
                        name=column))

fig.show()

**The same moving averages for f_299**

In [None]:
# Rolling means of f_299 for investment 1, one extra column per window size.
# .copy() makes this two-column frame independent of investment_id_one, so the
# column assignments below write to a real frame instead of a flagged slice
# (which would raise pandas' SettingWithCopyWarning).
f_299_scroll_mean = investment_id_one[['time_id', 'f_299']].copy()

# Descending order so that columns and legend entries appear as 7, 6, ..., 2.
for window in range(7, 1, -1):
    f_299_scroll_mean[f'f_299_scroll_mean_{window}'] = (
        f_299_scroll_mean['f_299'].rolling(window=window).mean()
    )

fig = go.Figure()

# Raw series as a plain line.
fig.add_trace(go.Scatter(x=f_299_scroll_mean['time_id'], y=f_299_scroll_mean['f_299'],
                    mode='lines',
                    name='f_299'))

# Each smoothed series is drawn with markers to tell it apart from the raw line.
for window in range(7, 1, -1):
    column = f'f_299_scroll_mean_{window}'
    fig.add_trace(go.Scatter(x=f_299_scroll_mean['time_id'], y=f_299_scroll_mean[column],
                        mode='lines+markers',
                        name=column))

fig.show()

In [None]:
# Drop the exploratory frames and the last figure so their memory can be reclaimed.
del f_0_scroll_mean, f_299_scroll_mean, fig

### **Creating moving averages**

In [None]:
# Rolling-window length (number of rows) for the per-investment moving averages.
WINDOWS = 7

def reduce_memory_usage(df, features):
    """Downcast the given columns of ``df`` to float16, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; its columns are replaced one at a time.
    features : iterable of str
        Names of the columns to convert to ``np.float16``.

    Returns
    -------
    None
        ``df`` is mutated; nothing is returned.
    """
    for feature in features:
        # Converting column by column keeps only one temporary column alive.
        df[feature] = df[feature].astype(np.float16)
    # A single collection after the loop; running the collector once per column
    # only adds overhead, because each replaced column is already released when
    # its last reference is overwritten.
    gc.collect()

In [None]:
# Per-investment rolling means (window = WINDOWS rows) for the first batch of ids.
# NOTE(review): only the first 17 ids are processed; the trailing comment on the
# loop suggests 1700 was used at some point — confirm which is intended.
rolled_parts = []

for i_id in tqdm(train.investment_id.unique()[:17]):  # [:1700]

    df_scroll_mean = train[train['investment_id'] == i_id]
    if df_scroll_mean.shape[0] > WINDOWS-1:
        df_scroll_mean = df_scroll_mean.rolling(window=WINDOWS).mean()

        # The first WINDOWS-1 rows never see a complete window. Overwrite them
        # with the first complete-window row in one block assignment instead of
        # one .at call per cell.
        first_full_rows = df_scroll_mean.iloc[[WINDOWS-1] * (WINDOWS-1), :].values
        df_scroll_mean.iloc[:WINDOWS-1, :] = first_full_rows

    # Investments with fewer than WINDOWS rows are kept unchanged.
    rolled_parts.append(df_scroll_mean)

# Concatenate once: calling pd.concat inside the loop re-copies everything
# accumulated so far on each iteration.
scroll_mean_five = pd.concat(rolled_parts)
del rolled_parts

In [None]:
# Dtypes and memory usage of the rolled frame before it is downcast.
scroll_mean_five.info()

In [None]:
%%time
# Convert the feature columns and the target of the rolled frame to float16 in place.
reduce_memory_usage(scroll_mean_five, features + ["target"])

In [None]:
# Same summary again, to compare memory usage after the float16 conversion.
scroll_mean_five.info()

In [None]:
# Per-investment rolling means (window = WINDOWS rows) for the second batch of ids.
# NOTE(review): the slice starts at position 3562 while the trailing comment on
# the loop mentions 1700 — confirm which start is intended.
rolled_parts = []

for i_id in tqdm(train.investment_id.unique()[3562:]):  # [1700:] 

    df_scroll_mean = train[train['investment_id'] == i_id]
    if df_scroll_mean.shape[0] > WINDOWS-1:
        df_scroll_mean = df_scroll_mean.rolling(window=WINDOWS).mean()

        # The first WINDOWS-1 rows never see a complete window. Overwrite them
        # with the first complete-window row in one block assignment instead of
        # one .at call per cell.
        first_full_rows = df_scroll_mean.iloc[[WINDOWS-1] * (WINDOWS-1), :].values
        df_scroll_mean.iloc[:WINDOWS-1, :] = first_full_rows

    # Investments with fewer than WINDOWS rows are kept unchanged.
    rolled_parts.append(df_scroll_mean)

# Concatenate once: calling pd.concat inside the loop re-copies everything
# accumulated so far on each iteration.
scroll_mean_five_two = pd.concat(rolled_parts)
del rolled_parts

In [None]:
# Dtypes and memory usage of the second rolled frame before it is downcast.
scroll_mean_five_two.info()

In [None]:
%%time
# Convert the feature columns and the target of the second rolled frame to float16 in place.
reduce_memory_usage(scroll_mean_five_two, features + ["target"])

In [None]:
# Same summary again, to compare memory usage after the float16 conversion.
scroll_mean_five_two.info()

In [None]:
# Stack both partial results into one frame (rows of the second batch come first).
scroll_mean_five = pd.concat([scroll_mean_five_two, scroll_mean_five])

In [None]:
# Shape of the combined frame, then its first 7 rows.
print(scroll_mean_five.shape)
scroll_mean_five.head(7)

In [None]:
# Order the rows by their index labels.
scroll_mean_five = scroll_mean_five.sort_index()

In [None]:
# Add the unsmoothed target next to the rolled one; pandas aligns the assigned
# Series to this frame by index label.
scroll_mean_five['initial_train'] = train['target']

In [None]:
# Preview the rolled frame together with the new 'initial_train' column.
scroll_mean_five.head()

In [None]:
# Release the rolling-average frames and the last per-investment frame.
del df_scroll_mean, scroll_mean_five, scroll_mean_five_two

### **PERMUTATION IMPORTANCE**

In [None]:
# Quick look at the first two rows of the training frame.
train.head(2)

In [None]:
# Pearson correlation between the feature columns and the target, computed on
# the rows whose index label is <= 1000 (.loc slicing is label-based and
# includes the end label).
# Rows and columns are selected in a single .loc call, so only that small slice
# is materialised instead of first copying every row of the selected columns.
train_corr = train.loc[:1000, features + ['target']].corr(method='pearson')
train_corr.style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# The 7 features with the largest correlation to the target. The 'target' row is
# dropped first: its self-correlation of 1.0 would otherwise always take one of
# the seven places.
train_corr.drop(index='target').nlargest(7, 'target').index

In [None]:
# The correlation matrix is no longer needed.
del train_corr

In [None]:
# Run a garbage-collection pass and report what it found.
n = gc.collect()
print("Number of unreachable objects collected by GC:", n)
print("Uncollectable garbage:", gc.garbage)
# Second pass; as the last expression of the cell its return value is displayed.
gc.collect()

In [None]:
# Preview the training frame before the target and id columns are split off.
train.head()

In [None]:
# Split the target and the two id columns off the frame. pop() removes each
# column from `train` in place, so this cell cannot be re-run without reloading
# `train` (the columns would no longer exist).
target = train.pop("target")
investment_id = train.pop("investment_id")
time_id = train.pop("time_id")

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the split.
# NOTE(review): rows are split at random, without regard to time_id or
# investment_id (both popped from the frame earlier) — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.2, random_state = 42)
print(X_train.shape, y_train.shape) 
print(X_test.shape, y_test.shape) 

In [None]:
# The split copies are used from here on; drop the names of the full frame and target.
del train, target

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
scaler = StandardScaler()
scaled_train = scaler.fit_transform(X_train)
scaled_test = scaler.transform(X_test)

In [None]:
# Statistics of the training target, in the form scale_target / reverse_target expect.
std_train = y_train.std()
mean_train = y_train.mean()


def scale_target(y, mean, std):
    """Standardise ``y`` as ``(y - mean) / std`` and return it as a NumPy array."""
    centered = y - mean
    return np.asarray(centered / std)


def reverse_target(pred, mean, std):
    """Undo ``scale_target``: return ``pred * std + mean`` as a NumPy array."""
    rescaled = pred * std
    return np.asarray(rescaled + mean)


def set_seed(seed):
    """Seed Python's ``random``, NumPy and TensorFlow, and set PYTHONHASHSEED."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


def ret(a):
    """Identity function: return the argument unchanged."""
    return a


set_seed(33)

In [None]:
# Feed-forward regressor: an identity Lambda layer that fixes the input width,
# two ReLU hidden layers (128 and 32 units) and a single linear output unit.
model = Sequential([
    Lambda(ret, input_shape=[scaled_train.shape[1]]),
    Dense(128, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),
])

# Mean squared error loss, Adam optimizer with its default settings.
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(),
)

In [None]:
# Train on the standardised features; y_train is passed as-is (the
# scale_target helper is not applied to it).
model.fit(scaled_train, y_train, epochs=25, batch_size=512)

In [None]:
# Baseline: mean absolute error on the unshuffled test features.
real_pred = model.predict(scaled_test).ravel()
MAE = mean_absolute_error(y_test, real_pred)

# Last expression of the cell: displays the baseline score.
MAE

In [None]:
# Collects the MAE obtained after shuffling each feature column.
final_score = []
# NOTE(review): shuff_pred is not used in any cell visible here — confirm whether it can be removed.
shuff_pred = []

In [None]:
# Training data is not needed after fitting; only the test split is used from here on.
del X_train, y_train, scaled_train

In [None]:
# Permutation importance: shuffle one feature column at a time and record the
# test MAE of the already-trained model on the perturbed data.
# Scores go into a list local to this cell, so the cell can be re-run; appending
# to the shared `final_score` breaks on a second run because the last line
# rebinds that name to an ndarray, which has no .append.
permuted_scores = []

for i, _ in enumerate(X_test.columns):

    # Work on a copy so every iteration starts from the unshuffled test matrix.
    shuff_test = scaled_test.copy()
    shuff_test[:, i] = np.random.permutation(shuff_test[:, i])

    score = mean_absolute_error(y_test, model.predict(shuff_test).ravel())

    permuted_scores.append(score)

# One score per column, in the order of X_test.columns.
final_score = np.asarray(permuted_scores)

In [None]:
# Pair every feature name with the MAE measured after shuffling it, then show
# the 7 features whose shuffling produced the highest error.
df = pd.DataFrame({
    'columns': X_test.columns,
    'score': final_score,
})
df.nlargest(7, 'score')

In [None]:
# Drop the trained model and the score array.
del model, final_score