# Complete Feature exploration notebook

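This notebook explores the features across time ids and investment ids. It does so by looping through all 300 features and producing a 2×4 grid of up to 8 plots for each: the feature vs. investment id and the feature vs. target at three reference time ids, plus the feature at one time id vs. the next.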

## Other Feature Exploration / Feature engineering for Ubiquant:

- [Complete Feature Exploration](https://www.kaggle.com/lucasmorin/complete-feature-exploration)
- [Weird pattern in unique values](https://www.kaggle.com/lucasmorin/weird-patterns-in-unique-values-across-time-ids/)
- [Time x Strategy EDA](https://www.kaggle.com/lucasmorin/time-x-strategy-eda)  
- [UMAP Data Analysis & Applications](https://www.kaggle.com/lucasmorin/umap-data-analysis-applications)   
- [LB probing notebook](https://www.kaggle.com/lucasmorin/don-t-mind-me-just-probing-the-lb)
- On-Line Feature Engineering (in progress)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
warnings.filterwarnings("ignore")

Using @slawekbiel's Feather dataset: https://www.kaggle.com/slawekbiel/ubiquant-trainfeather-32-bit

In [None]:
%%time
# Load the training set from the Feather file (the %%time cell magic above
# reports how long this cell takes to run).
train_data = pd.read_feather('../input/ubiquant-trainfeather-32-bit/train32.feather')
# Load the label table; the file has no extension but is parsed as CSV.
# It is joined onto train_data by 'investment_id' further down, and the
# merged frame is then read through a 'label_feature' column — presumably
# supplied by this file (TODO confirm).
label_invest = pd.read_csv('../input/ubiquant-target-eda-pca-magic/label_feature')

In [None]:
# Bare expression: renders the label table as the cell output for inspection.
label_invest

In [None]:
# Left-join the label table onto the training rows by investment_id, so every
# training row is kept; rows with no match in label_invest get NaN.
train_data = pd.merge(train_data, label_invest, how='left', on='investment_id')

In [None]:
# For each non-NaN label, pick one reference investment: the investment id
# with the highest count of (non-null) time_id entries among that label's rows.
investments_ids_ref = []

for u in train_data.label_feature.unique():
    # unique() includes NaN when the column has missing labels; skip it.
    if np.isnan(u):
        continue

    rows_for_label = train_data[['time_id', 'investment_id']][train_data.label_feature == u]
    counts_per_investment = rows_for_label.groupby('investment_id').count()
    ranked = counts_per_investment.sort_values(by='time_id', ascending=False)
    investments_ids_ref.append(ranked.index[0])

# Bare expression: show the selected investment ids as the cell output.
investments_ids_ref

In [None]:
# Three consecutive time ids (49, 50, 51) used as the reference snapshots
# for the per-feature plots.
time_ids_ref = list(range(49, 52))

In [None]:
# Per-time-id snapshots of the training data, one for each reference time id.
train_data_time0 = train_data[train_data.time_id == time_ids_ref[0]].copy()
train_data_time1 = train_data[train_data.time_id == time_ids_ref[1]].copy()
train_data_time2 = train_data[train_data.time_id == time_ids_ref[2]].copy()

# All rows belonging to any of the reference time ids; pivoted per feature below.
train_data_time = train_data[train_data.time_id.isin(time_ids_ref)]

# NOTE: the pivot table is built inside the loop only. Building it here would
# reference `feature_name` before it is assigned and raise NameError on a
# fresh top-to-bottom run.

for i in range(300):
    feature_name = 'f_' + str(i)
    # Random RGB colour so that each feature's figure is drawn in its own colour.
    color = (random.random(), random.random(), random.random())
    # 2x4 grid of axes:
    #   row 0, cols 0-2: feature vs. investment id at each reference time id
    #   row 1, cols 0-2: feature vs. target (order-2 regression) at each time id
    #   col 3:           feature at one time id vs. the next time id
    fig, axs = plt.subplots(2, 4, figsize=(30, 12))

    snapshots = (train_data_time0, train_data_time1, train_data_time2)
    for col, (time_id, snapshot) in enumerate(zip(time_ids_ref, snapshots)):
        # x/y are passed by keyword: positional x/y is deprecated in
        # seaborn 0.11 and rejected by later releases.
        sns.scatterplot(
            x=snapshot.investment_id,
            y=snapshot[feature_name],
            color=color,
            ax=axs[0, col],
        ).set(title=f'{feature_name} v.s. investment id - time id {time_id}')

        if snapshot[feature_name].nunique() == 1:
            # Feature takes a single value at this time id: skip the
            # regression plot and leave that axis empty.
            print(f'{feature_name} is constant for time id {time_id}')
        else:
            sns.regplot(
                x=snapshot[feature_name],
                y=snapshot['target'],
                color=color,
                order=2,
                line_kws={"color": 'black'},
                ax=axs[1, col],
            ).set(title=f'{feature_name} v.s. target time {time_id}')

    # One row per investment id, one column per reference time id, holding the
    # current feature's value; used to compare the feature across time ids.
    pivot = train_data_time[['time_id', 'investment_id', feature_name]].pivot(
        index='investment_id', columns='time_id', values=feature_name
    )
    t1, t2, t3 = pivot.columns

    sns.scatterplot(x=pivot[t1], y=pivot[t2], color=color, ax=axs[0, 3]).set(title=f'{feature_name} - {t1} v.s. {t2}')
    sns.scatterplot(x=pivot[t2], y=pivot[t3], color=color, ax=axs[1, 3]).set(title=f'{feature_name} - {t2} v.s. {t3}')

    plt.show()
    # Release the figure explicitly so 300 iterations do not accumulate open
    # figures on backends where show() does not close them.
    plt.close(fig)