In [67]:
import math
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [85]:
def load_data (filename, 
               fill=False, fill_type='mean',
               scale=False, log=False):
    
    data = pd.read_csv('../data/base/' + filename, index_col=0)

    if fill:
        data['F5'] = data['F5'].fillna(11.0)
        data['F19'] = data['F19'].fillna(data['F19'].mean())
    
    if scale:
        numer_cols = []
        for c in data:
            if c != 'Y':
                data[c] = StandardScaler().fit_transform(data[c].values.reshape(-1,1))
    
    if log:
        for c in data:
            if c != 'Y':
                data[c] = np.log(data[c]+1)
            
    return data

def save_data (filename, save_data):
    #submission = pd.DataFrame(columns=['Y'], index=index_data.index, data=predictions)
    save_data.to_csv('../data/processed/' + filename)

# Load data

### Data filled in

In [3]:
train = load_data ('train.csv', fill=True)
test = load_data ('test.csv', fill=True)
print (train.shape)
print (test.shape)

y_train = train['Y']
X_train = train.drop (['Y'], axis=1)

(49998, 28)
(50000, 27)


In [4]:
save_data('train_fill.csv', train)
save_data('test_fill.csv', test)

### Data not filled in

In [56]:
train_missing = load_data ('train.csv', fill=False)
test_missing = load_data ('test.csv', fill=False)
print (train_missing.shape)
print (test_missing.shape)

save_data('train_missing.csv', train_missing)
save_data('test_missing.csv', test_missing)

(49998, 28)
(50000, 27)


### Data with interaction terms

In [66]:
train_inter = load_data ('train.csv', fill=True)
test_inter = load_data ('test.csv', fill=True)
                
terms = train_inter[['F23', 'F14']]
poly = PolynomialFeatures(degree=2, interaction_only=False)
inter_terms = poly.fit_transform(terms)[:,1:]
inter_terms = pd.DataFrame(inter_terms)
inter_terms = inter_terms.rename(lambda x: x+1)
train_inter = pd.concat([train_inter, inter_terms, train_inter['F23']**2], axis=1)

terms = test_inter[['F23', 'F14']]
poly = PolynomialFeatures(degree=2, interaction_only=False)
inter_terms = poly.fit_transform(terms)[:,1:]
inter_terms = pd.DataFrame(inter_terms)
inter_terms = inter_terms.rename(lambda x: x+49999)
test_inter = pd.concat([test_inter, inter_terms, test_inter['F23']**2], axis=1)
                         
print (train_inter.shape)
print (test_inter.shape)

y_train_inter = train_inter['Y']
X_train_inter = train_inter.drop (['Y'], axis=1)

save_data('train_inter.csv', train_inter)
save_data('test_inter.csv', test_inter)

(49998, 34)
(50000, 33)


### Data scaled

In [79]:
train_scaled = load_data ('train.csv', fill=True, scale=True)
test_scaled = load_data ('test.csv', fill=True, scale=True)
print (train_scaled.shape)
print (test_scaled.shape)

y_train_scaled = train_scaled['Y']
X_train_scaled = train_scaled.drop (['Y'], axis=1)

save_data('train_scaled.csv', train_scaled)
save_data('test_scaled.csv', test_scaled)



(49998, 28)
(50000, 27)


### Data log(x+1)

In [84]:
train_log = load_data ('train.csv', fill=True, log=True)
test_log = load_data ('test.csv', fill=True, log=True)
print (train_log.shape)
print (test_log.shape)

y_train_log = train_log['Y']
X_train_log = train_log.drop (['Y'], axis=1)

save_data('train_log.csv', train_log)
save_data('test_log.csv', test_log)

(49998, 28)
(50000, 27)


In [6]:
print ("Unique values per column:")
print (X_train.apply(lambda x: len(x.unique())))

Unique values per column:
F1         9
F2        12
F3     49998
F4         9
F5        12
F6      1880
F7         9
F8         9
F9       322
F10       23
F11       43
F12        9
F13       10
F14       16
F15       10
F16      310
F17        9
F18       83
F19     8771
F20       10
F21      334
F22       55
F23    42562
F24        9
F25       14
F26       83
F27    41705
dtype: int64


In [7]:
categ_cols = []
for c in X_train:
    if len(X_train[c].unique()) <= 30:
        print (str(c) + ': ' + str(sorted(train[c].unique())))
        print (str(c) + ': ' + str(sorted(test[c].unique())))
        categ_cols.append(c)

print (str(len(categ_cols)) + ': ' + str(categ_cols))

F1: [1, 2, 3, 4, 5, 6, 8, 12, 18]
F1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12]
F2: [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 96, 98]
F2: [0, 1, 2, 3, 4, 5, 6, 7, 8, 96, 98]
F4: [0, 1, 2, 3, 4, 5, 6, 7, 8]
F4: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 17]
F5: [0.0, 1.0, 2.0, 3.0, 5.0, nan, 4.0, 6.0, 7.0, 8.0, 9.0, 10.0]
F5: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan, 7.0, 8.0, 10.0, 20.0]
F7: [1, 2, 3, 4, 5, 6, 8, 10, 23]
F7: [1, 2, 3, 4, 5, 6, 7, 8, 10, 14, 18]
F8: [1, 2, 3, 4, 5, 6, 7, 8, 9]
F8: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13]
F10: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23, 29, 54]
F10: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 23, 25, 26]
F12: [1, 2, 3, 4, 5, 6, 7, 11, 12]
F12: [1, 2, 3, 4, 5, 6, 7, 8, 9, 17]
F13: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
F13: [1, 2, 3, 4, 5, 7, 8, 9]
F14: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 96, 98]
F14: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 96, 98]
F15: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
F15: [1, 2, 3, 4, 5, 6, 7, 

In [8]:
numer_cols = []
for c in X_train:
    if len(X_train[c].unique()) > 30:
        print ('%3s'%(c) + ' (' + '%-5d'%(len(train[c].unique())) + ' unique values) : ' + str(min(train[c])) + ' - ' + str(max(train[c])))
        print ('%3s'%(c) + ' (' + '%-5d'%(len(test[c].unique())) + ' unique values) : ' + str(min(test[c])) + ' - ' + str(max(test[c])))

        numer_cols.append(c)

print (str(len(numer_cols)) + ': ' + str(numer_cols))

 F3 (49998 unique values) : -0.372758387 - 29110.04058
 F3 (49998 unique values) : -0.364406804 - 50708.04549
 F6 (1880  unique values) : 1 - 8194101904
 F6 (1880  unique values) : 1 - 69627726
 F9 (322   unique values) : 1 - 59249
 F9 (312   unique values) : 1 - 45887
F11 (43    unique values) : 18 - 63
F11 (42    unique values) : 19 - 60
F16 (310   unique values) : 1.77 - 35807.1
F16 (307   unique values) : 1.77 - 115009.29
F18 (83    unique values) : 98 - 184
F18 (83    unique values) : 77 - 182
F19 (8771  unique values) : 0.0 - 3008750.0
F19 (8802  unique values) : 0.0 - 702500.0
F21 (334   unique values) : 0 - 630367
F21 (295   unique values) : 0 - 41061
F22 (55    unique values) : 0 - 58
F22 (50    unique values) : 0 - 54
F23 (42562 unique values) : 0.0 - 29110.0
F23 (42460 unique values) : 0.0 - 50708.0
F26 (83    unique values) : 21 - 107
F26 (83    unique values) : 0 - 105
F27 (41705 unique values) : 0.0 - 329664.0
F27 (41563 unique values) : 0.0 - 326442.0
12: ['F3', 'F6', 'F