# Explore Feature Correlations with graphics

(Linda Cobb, http://github.com/timestocome)

### Lots of features and a huge dataset make it difficult to analyze the data with lists of numbers alone

### Loops have been set to run over only a few samples; you can use the commented-out line below each loop to go through the full dataset


### * edit: stronger correlations have larger values, so the graph plot should invert them so that stronger correlations are closer to zero and weaker ones closer to one





## Import libraries


In [None]:
import numpy as np
import pandas as pd
pd.options.display.max_rows=300
pd.options.display.max_columns=300

from typing import List

import networkx as nx
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Fetch data

In [None]:
# Read train.csv from the competition input directory into a DataFrame.
train = pd.read_csv(
    '../input/jane-street-market-prediction/train.csv'
)


In [None]:
# Sanity check: show the first five rows of the training data.
print(train.head(n=5))

In [None]:

# Read features.csv (the feature meta-data table) into a DataFrame.
feature_info = pd.read_csv(
    '../input/jane-street-market-prediction/features.csv'
)


In [None]:
# Sanity check: summarise every meta-data column, then collect the names
# of the columns whose name contains 'tag_'.
print(feature_info.describe(include='all'))

# `features` is deliberately bound to the same list object as `tag_features`.
tag_features = [col for col in feature_info.columns if 'tag_' in col]
features = tag_features
print(tag_features)

### Build correlation dataframe, convert to link dataframe, plot graphs

In [None]:

# Tagged features are strongly correlated:
# graph the correlations between the features that share each tag.

# Work on a random 10% sample of the training rows.
sample_data = train.sample(frac=.1)
n_samples = len(sample_data)
n_features = len(features)
print('n samples', n_samples)

tags = tag_features
# Count the tag columns themselves, not the rows of feature_info.
n_tags = len(tags)


# Only the first three tags are plotted; swap in the commented line below
# to go through every tag. Slicing also copes with fewer than three tags.
for tag in tags[:3]:
#for tag in tags:

    print(tag)

    # Names of the features whose flag is set in this tag's column.
    feature_list = list(feature_info[feature_info[tag] == True]['feature'])

    # Absolute pairwise correlations between those features.
    z = sample_data[feature_list]
    corr_df = np.abs(z.corr())

    # Shift |correlation| from [0, 1] to [-1, 0]: a perfect correlation
    # becomes weight 0 and no correlation becomes weight -1.
    # NOTE(review): nx.draw positions nodes with a spring layout when no
    # positions are given, and that layout reads the 'weight' edge attribute
    # as attraction strength -- confirm these non-positive weights give the
    # intended "stronger correlation = closer together" picture.
    corr_df = corr_df - 1

    # Flatten the square matrix into one (source, target, weight) row per pair.
    links_df = corr_df.stack().reset_index()
    links_df.columns = ['source', 'target', 'weight']

    plt.figure(figsize=(20, 20))
    plt.title(tag)
    g = nx.from_pandas_edgelist(links_df, 'source', 'target', 'weight')
    nx.draw(g, with_labels=True, node_color='orange', node_size=4, edge_color='blue')
    plt.show()


### Look for temporal patterns, (idea from https://www.kaggle.com/lachlansuter/important-and-hidden-temporal-data )

In [None]:
# Take the rows labelled 10000 through 80000 from the middle of the data
# to avoid edge weirdness, if any (.loc label slices include both ends).
ordered_sample = train.loc[slice(10000, 80000)]
print(ordered_sample.shape[0])

In [None]:
# Look for time patterns in features vs. time stamp id:
# gather every training column whose name contains 'feature_'.
features = [name for name in train.columns if 'feature_' in name]
n_features = len(features)

def plotFeatureSplits(df, f):
    """Scatter-plot one column of ``df`` against its ``ts_id`` column.

    Args:
        df: DataFrame holding a ``ts_id`` column and the column named ``f``.
        f: Name of the column plotted on the y-axis; also used as the
            figure title and the y-axis label.
    """
    plt.figure(figsize=(10, 10))

    plt.scatter(df['ts_id'], df[f], s=.2)
    plt.title(f, color='white')
    plt.ylabel(f)
    # The x-axis shows the ts_id column, so label it with that name.
    plt.xlabel('ts_id')
    plt.show()
        
        
# Sample run over a few features just to check; swap in the commented
# line below to run them all.
# Skip feature_0 (the first entry). Slicing also avoids an IndexError
# when fewer than five feature columns are present.
for feature_name in features[1:5]:
#for feature_name in features[1:]:
    plotFeatureSplits(ordered_sample, feature_name)