In [None]:
import os
# Base folder for input data, and the sub-folder of the dataset read by this notebook.
InputBaseDirectory  = '/kaggle/input'
InputDataDirectory = os.path.join(InputBaseDirectory, 'jane-street-market-prediction')

import numpy as np
import pandas as pd

import collections

In [None]:
# Columns to load: date, weight, resp and feature_0 ... feature_129.
columns = ['date', 'weight', 'resp'] + ['feature_%d' % n for n in range(130)]

# Every selected column is read as float16.
trainCsvPath = os.path.join(InputDataDirectory, 'train.csv')
dsTrain = pd.read_csv(trainCsvPath,
                      usecols=columns,
                      dtype=dict.fromkeys(columns, 'float16'))

# 'date' is taken out of the frame and kept as its own Series.
# 'weight' stays in dsTrain for now so that it is part of the summary
# statistics; it is dropped in a later cell.
dateColumn = dsTrain.pop('date')

# Well Engineered Features for Machine Learning
Features `feature_1` to `feature_129` have medians very close to zero.

The features also look quite balanced when comparing the 25% vs. 75% quantiles and the min vs. max values.

In [None]:
# Summary statistics for every column still in dsTrain; show the first 32 columns.
statistics = dsTrain.describe()
statistics.iloc[:, 0:32]

In [None]:
# Summary statistics, columns 32-63 (by position).
statistics.iloc[:, 32:64]

In [None]:
# Summary statistics, columns 64-95 (by position).
statistics.iloc[:, 64:96]

In [None]:
# Summary statistics, columns 96-127 (by position).
statistics.iloc[:, 96:128]

In [None]:
# Summary statistics, columns from position 128 onwards; a slice end past
# the last column is simply truncated.
statistics.iloc[:, 128:160]

In [None]:
# 'weight' was only kept for the summary statistics; remove it before the
# null analysis. errors='ignore' keeps the cell safe to re-run after the
# column has already been dropped (otherwise drop() raises KeyError).
dsTrain = dsTrain.drop(columns='weight', errors='ignore')

# Record count by Date

In [None]:
# Number of records for each value of the 'date' column, largest count first.
countByDate = dateColumn.value_counts(sort=True, ascending=False)
countByDate

In [None]:
# Record counts for dates 0..9. The label includes the date value itself,
# so each printed line can be attributed to a date.
for iDate in range(10):
    # .loc makes the lookup explicitly label-based (the date value),
    # not positional within the count-sorted Series.
    print('date %d:' % iDate, countByDate.loc[iDate])

# Columns with null values

In [None]:
# Labels of the dsTrain columns that contain at least one missing value.
columns_withNull = dsTrain.columns[dsTrain.isna().any(axis=0)]

In [None]:
# Missing-value count per column, ascending, restricted to columns that
# actually have missing values.
columns_nullCount = (
    dsTrain.isnull()
    .sum()
    .sort_values(ascending=True)
    .loc[lambda counts: counts > 0]
)
print('dsTrain length:', len(dsTrain))
# Print in three chunks so long listings are not truncated by the Series repr.
for start, stop in ((0, 50), (50, 100), (100, None)):
    print(columns_nullCount.iloc[start:stop])

In [None]:
# Names of the columns that have fewer than 500 missing values.
columns_lt500Nulls = columns_nullCount[columns_nullCount < 500].index.tolist()
columns_lt500Nulls

# Null patterns between dates

In [None]:
# Boolean frame: True wherever dsTrain has a missing value.
isNullAll = dsTrain.isnull()

# Keep only rows with at least one null and only the columns that contain
# nulls. A single .loc selection followed by .copy() yields an independent
# frame, so the column assignments below do not go through chained indexing
# (which triggers SettingWithCopyWarning).
dsNull = isNullAll.loc[isNullAll.any(axis=1), columns_withNull].copy()

# Assignment aligns on the index, so every kept row gets its own date.
dsNull['date']    = dateColumn

# One character per null column, in columns_withNull order:
# 'N' = value missing, '.' = value present.
dsNull['pattern'] = ''
for col in columns_withNull:
    dsNull['pattern'] = dsNull['pattern'] + np.where(dsNull[col], 'N','.')

In [None]:
# Vertical column legend printed above the pattern strings: one text column
# per entry of columns_withNull. Rows are: 'f', then the last three
# characters of the column name (a non-digit in the third-last or
# second-last place is shown as '0'), then a dashed rule.
columnCount = len(columns_withNull)
headerRows = [
    'f' * columnCount,
    ''.join(name[-3] if name[-3].isdigit() else '0' for name in columns_withNull),
    ''.join(name[-2] if name[-2].isdigit() else '0' for name in columns_withNull),
    ''.join(name[-1] for name in columns_withNull),
    '-' * columnCount,
]
headers_withNull = '\n'.join(headerRows)

In [None]:
# Date and null pattern of every row that has at least one null.
nullPattern = dsNull.loc[:, ['date', 'pattern']]
# Frequency of each null pattern on date 0.
nullPattern.loc[nullPattern['date'].eq(0.)].value_counts()

In [None]:
# Frequency of each null pattern on date 1. Reads dsNull directly so the
# cell does not depend on what `nullPattern` is currently bound to (other
# cells rebind that name to a Series).
dsNull.loc[dsNull['date'] == 1., ['date', 'pattern']].value_counts()

In [None]:
# Frequency of each null pattern on date 3. Reads dsNull directly so the
# cell does not depend on what `nullPattern` is currently bound to (other
# cells rebind that name to a Series).
dsNull.loc[dsNull['date'] == 3., ['date', 'pattern']].value_counts()

In [None]:
# Frequency of each null pattern on date 50. Reads dsNull directly so the
# cell does not depend on what `nullPattern` is currently bound to (other
# cells rebind that name to a Series).
dsNull.loc[dsNull['date'] == 50., ['date', 'pattern']].value_counts()

# Null value patterns
There is a strong correlation between missing values in different columns.

They are likely to correspond to different products, or to product-specific features.

Different features seem to be calculated from the same missing values.

In [None]:
# Work on a copy of the pattern strings so dsNull itself is never modified.
nullPattern = dsNull['pattern'].copy()

for size, label in ((len(dsTrain), 'dsTrain records'),
                    (len(nullPattern), 'dsNull records')):
    print(size, label)
print()
print(headers_withNull)
# The 32 most frequent null patterns.
print(nullPattern.value_counts().head(32))
print('...')

In [None]:
# Each pattern is used as a regular expression by Series.str.match, where
# '.' matches any character. A pattern therefore selects every row that has
# 'N' at the marked positions, whatever the other positions contain.
patterns = ['..NN........NN........NN..............N....N....N....N....N....N....N....N..............',
            '..NN........NN........NN................................................................',
            '......................................N....N....N....N....N....N....N....N..............']

# Number of rows containing each pattern, before anything is removed.
for pattern in patterns:
    print(pattern, np.count_nonzero(nullPattern.str.match(pattern)))
print()

# Remove each pattern in turn: in the rows that contain it, overwrite the
# pattern's 'N' positions with '.' so only the unexplained nulls remain.
for pattern in patterns:
    matches = nullPattern.str.match(pattern)
    selected = nullPattern[matches]
    for i, ch in enumerate(pattern):
        if ch == 'N':
            selected = selected.map(lambda s, i=i: s[:i] + '.' + s[i + 1:])
    nullPattern[matches] = selected

print('After removing patterns...')
for size, label in ((len(dsTrain), 'dsTrain records'),
                    (len(nullPattern), 'dsNull records')):
    print(size, label)
print()
print(headers_withNull)
print(nullPattern.value_counts().head(32))
print('...')

# May be Missing at Random

In [None]:
# Continues from the current contents of nullPattern (it is not reset here).
print('Continuing on...')

# Each pattern is used as a regular expression by Series.str.match, where
# '.' matches any character. A pattern therefore selects every row that has
# 'N' at the marked positions, whatever the other positions contain.
patterns = ['..................................N...........................................NN........',
            '..............................................................................NN........',
            '..................................N.....................................................']

# Number of rows containing each pattern, before anything is removed.
for pattern in patterns:
    print(pattern, np.count_nonzero(nullPattern.str.match(pattern)))
print()

# Remove each pattern in turn: in the rows that contain it, overwrite the
# pattern's 'N' positions with '.'.
for pattern in patterns:
    matches = nullPattern.str.match(pattern)
    selected = nullPattern[matches]
    for i, ch in enumerate(pattern):
        if ch == 'N':
            selected = selected.map(lambda s, i=i: s[:i] + '.' + s[i + 1:])
    nullPattern[matches] = selected

print('After removing patterns...')
for size, label in ((len(dsTrain), 'dsTrain records'),
                    (len(nullPattern), 'dsNull records')):
    print(size, label)
print()
print(headers_withNull)
print(nullPattern.value_counts().head(32))
print('...')

# Perhaps some rolling, look back calculations

In [None]:
# Start again from the unmodified pattern strings.
nullPattern = dsNull['pattern'].copy()

# A mask keeps the positions marked 'N' and blanks the positions marked '.'.
# Full mask, for reference:
#   '..NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN......NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN..NNNNNNNN'
masks  = ['......................................NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN..........']

for mask in masks:
    for i, ch in enumerate(mask):
        if ch == '.':
            # Overwrite position i with '.' in every pattern string.
            nullPattern = nullPattern.map(lambda s, i=i: s[:i] + '.' + s[i + 1:])

for size, label in ((len(dsTrain), 'dsTrain records'),
                    (len(nullPattern), 'dsNull records')):
    print(size, label)
print()
print(headers_withNull)
print(nullPattern.value_counts().head(32))
print('...')

In [None]:
# Start again from the unmodified pattern strings.
nullPattern = dsNull['pattern'].copy()

# A mask keeps the positions marked 'N' and blanks the positions marked '.'.
# Full mask, for reference:
#   '..NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN......NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN..NNNNNNNN'
masks  = ['................................................................................NNNNNNNN']

for mask in masks:
    for i, ch in enumerate(mask):
        if ch == '.':
            # Overwrite position i with '.' in every pattern string.
            nullPattern = nullPattern.map(lambda s, i=i: s[:i] + '.' + s[i + 1:])

for size, label in ((len(dsTrain), 'dsTrain records'),
                    (len(nullPattern), 'dsNull records')):
    print(size, label)
print()
print(headers_withNull)
print(nullPattern.value_counts().head(32))
print('...')

In [None]:
# Start again from the unmodified pattern strings.
nullPattern = dsNull['pattern'].copy()

# A mask keeps the positions marked 'N' and blanks the positions marked '.'.
# Full mask, for reference:
#   '..NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN......NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN..NNNNNNNN'
masks  = ['..NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN........................................................']

for mask in masks:
    # Walk over every '.' in the mask and overwrite that position with '.'
    # in all pattern strings.
    iChar = mask.find('.')
    while iChar >= 0:
        nullPattern = nullPattern.str[:iChar]+'.'+nullPattern.str[iChar+1:]
        iChar = mask.find('.', iChar+1)

print(len(dsTrain), 'dsTrain records')
print(len(nullPattern), 'dsNull records')
# Blank line and column legend, as printed by the other mask cells; without
# the legend the pattern positions cannot be mapped back to feature names.
print()
print(headers_withNull)
print(nullPattern.value_counts()[0:32])
print('...')

# Columns with low Null count

In [None]:
# Which null patterns occur in the rows where this column is missing?
nullColumn = 'feature_58'
nullPattern = dsNull['pattern'].copy()

# Regular expression with 'N' at this column's position and '.' (any
# character) everywhere else: it matches every row where the column is null.
iColumn = columns_withNull.tolist().index(nullColumn)
pattern = '.'*iColumn + 'N' + '.'*(len(columns_withNull)-1-iColumn)

# Blank out the rows where the column is not null, so they collapse into a
# single '' bucket in the counts below.
matches = nullPattern.str.match(pattern)
nullPattern[~matches] = ''

print('checking null pattern for', nullColumn)
# Column legend, as printed by the other per-column cells.
print(headers_withNull)
print(nullPattern.value_counts())

In [None]:
# Which null patterns occur in the rows where this column is missing?
nullColumn = 'feature_59'

# Regular expression with 'N' at this column's position and '.' (any
# character) everywhere else: it matches every row where the column is null.
iColumn = list(columns_withNull).index(nullColumn)
pattern = ''.join('N' if pos == iColumn else '.' for pos in range(len(columns_withNull)))

# Keep the pattern of matching rows; all other rows become ''.
matches = dsNull['pattern'].str.match(pattern)
nullPattern = dsNull['pattern'].where(matches, '')

print('checking null pattern for', nullColumn)
print(headers_withNull)
print(nullPattern.value_counts())

In [None]:
# Which null patterns occur in the rows where this column is missing?
nullColumn = 'feature_123'

# Regular expression with 'N' at this column's position and '.' (any
# character) everywhere else: it matches every row where the column is null.
iColumn = list(columns_withNull).index(nullColumn)
pattern = ''.join('N' if pos == iColumn else '.' for pos in range(len(columns_withNull)))

# Keep the pattern of matching rows; all other rows become ''.
matches = dsNull['pattern'].str.match(pattern)
nullPattern = dsNull['pattern'].where(matches, '')

print('checking null pattern for', nullColumn)
print(headers_withNull)
print(nullPattern.value_counts())

In [None]:
# Which null patterns occur in the rows where this column is missing?
nullColumn = 'feature_4'

# Regular expression with 'N' at this column's position and '.' (any
# character) everywhere else: it matches every row where the column is null.
iColumn = list(columns_withNull).index(nullColumn)
pattern = ''.join('N' if pos == iColumn else '.' for pos in range(len(columns_withNull)))

# Keep the pattern of matching rows; all other rows become ''.
matches = dsNull['pattern'].str.match(pattern)
nullPattern = dsNull['pattern'].where(matches, '')

print('checking null pattern for', nullColumn)
print(headers_withNull)
print(nullPattern.value_counts())

In [None]:
# Which null patterns occur in the rows where this column is missing?
nullColumn = 'feature_56'

# Regular expression with 'N' at this column's position and '.' (any
# character) everywhere else: it matches every row where the column is null.
iColumn = list(columns_withNull).index(nullColumn)
pattern = ''.join('N' if pos == iColumn else '.' for pos in range(len(columns_withNull)))

# Keep the pattern of matching rows; all other rows become ''.
matches = dsNull['pattern'].str.match(pattern)
nullPattern = dsNull['pattern'].where(matches, '')

print('checking null pattern for', nullColumn)
print(headers_withNull)
print(nullPattern.value_counts())

In [None]:
# Which null patterns occur in the rows where this column is missing?
nullColumn = 'feature_97'

# Regular expression with 'N' at this column's position and '.' (any
# character) everywhere else: it matches every row where the column is null.
iColumn = list(columns_withNull).index(nullColumn)
pattern = ''.join('N' if pos == iColumn else '.' for pos in range(len(columns_withNull)))

# Keep the pattern of matching rows; all other rows become ''.
matches = dsNull['pattern'].str.match(pattern)
nullPattern = dsNull['pattern'].where(matches, '')

print('checking null pattern for', nullColumn)
print(headers_withNull)
print(nullPattern.value_counts())

In [None]:
# Which null patterns occur in the rows where this column is missing?
nullColumn = 'feature_129'

# Regular expression with 'N' at this column's position and '.' (any
# character) everywhere else: it matches every row where the column is null.
iColumn = list(columns_withNull).index(nullColumn)
pattern = ''.join('N' if pos == iColumn else '.' for pos in range(len(columns_withNull)))

# Keep the pattern of matching rows; all other rows become ''.
matches = dsNull['pattern'].str.match(pattern)
nullPattern = dsNull['pattern'].where(matches, '')

print('checking null pattern for', nullColumn)
print(headers_withNull)
print(nullPattern.value_counts())

In [None]:
# Which null patterns occur in the rows where this column is missing?
nullColumn = 'feature_35'

# Regular expression with 'N' at this column's position and '.' (any
# character) everywhere else: it matches every row where the column is null.
iColumn = list(columns_withNull).index(nullColumn)
pattern = ''.join('N' if pos == iColumn else '.' for pos in range(len(columns_withNull)))

# Keep the pattern of matching rows; all other rows become ''.
matches = dsNull['pattern'].str.match(pattern)
nullPattern = dsNull['pattern'].where(matches, '')

print('checking null pattern for', nullColumn)
print(headers_withNull)
print(nullPattern.value_counts())

# Grouping columns with same null pattern
Grouping columns that share an identical null pattern reduces the number of columns that need to be considered in the null analysis.

In [None]:
# Result: a list of groups; each group is a list of column names whose null
# masks in dsNull are identical. Columns with a unique mask form a group of one.
columns_groupByNulls = []

# Bucket the columns by null count first: two columns can only have the same
# null mask if they have the same number of nulls, so the pairwise
# comparison below is needed within a bucket only.
columns_groupByNullCount = collections.defaultdict(list)
for k,v in columns_nullCount.items():
    columns_groupByNullCount[v].append(k)
print('columns_groupByNullCount Length:', len(columns_groupByNullCount))

for columns_sameNullCount in columns_groupByNullCount.values():
    # Work on a copy so the buckets in columns_groupByNullCount stay intact.
    remaining = list(columns_sameNullCount)
    while remaining:
        column_i = remaining.pop()

        # Split the rest of the bucket into columns matching column_i and
        # columns that do not. The lists are built fresh rather than removing
        # items from the list being iterated: removing during iteration skips
        # the element after each match and splits identical columns across
        # several groups.
        columns_sameNulls = [column_i]
        unmatched = []
        for column_j in remaining:
            if dsNull[column_i].equals(dsNull[column_j]):
                columns_sameNulls.append(column_j)
            else:
                unmatched.append(column_j)
        remaining = unmatched

        columns_groupByNulls.append(columns_sameNulls)

print('columns_groupByNulls Length:', len(columns_groupByNulls))