In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import time

## Read data

In [None]:
from src.features import load_raw, imply_columns

In [None]:
from src.features import append_features

# Load the raw splits, then pass the 'train' and 'submit' frames through
# append_features, replacing each entry with the returned frame.
df = load_raw()
df['train'] = append_features(df['train'])
df['submit'] = append_features(df['submit'])

# Column-name groups; later cells read cols['features'] and cols['target'].
cols = imply_columns(df)

In [None]:
# Shape and column labels of the training frame after append_features.
(
    df['train'].shape,
    df['train'].columns,
)

In [None]:
# Preview the first five rows of the training frame.
df['train'].head(n=5)

In [None]:
# Shape and column labels of the submission frame after append_features.
(
    df['submit'].shape,
    df['submit'].columns,
)

In [None]:
# Preview the first five rows of the submission frame.
df['submit'].head(n=5)

In [None]:
# Show the column groups returned by imply_columns(df); the cells below index
# it with cols['features'] and cols['target'].
cols

## plot distributions per target class

In [None]:
# Overlay, for every feature, its distribution among target == 0 rows and
# among target == 1 rows (one figure per feature).
# NOTE: sns.distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot; it is kept here so the minimum seaborn version required
# by this notebook does not change.
target_col = cols['target'][0]
df_in = df['train']
y_in = df_in[target_col].values  # loop-invariant, so looked up only once
for col in cols['features']:
    x_in = df_in[col]
    for cls in (0, 1):
        # Attach the label to the artist itself so the legend cannot drift
        # out of sync with the plotting order.
        sns.distplot(x_in[y_in == cls], label=str(cls))
    plt.legend(title=target_col)
    plt.title(col)  # each figure must be identifiable on its own
    plt.show()

## joint distribution: feature vs target

In [None]:
# KDE joint plot of every feature (x) against the target column (y).
# x and y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so
# the positional form raises a TypeError there, while the keyword form also
# works on older releases.
c2 = cols['target'][0]
for c1 in cols['features']:
    sns.jointplot(x=c1, y=c2,
                  data=df['train'],
                  kind="kde", space=0, color="g")
    # Render each figure as soon as it is built, like the pairwise cell does.
    plt.show()

## joint distributions: feature pairs, plotted separately per target class

In [None]:
import itertools

# KDE joint plot for every unordered pair of features, drawn once per target
# class: green for target == 0, red for target == 1.
# x and y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so
# the positional form raises a TypeError there, while the keyword form also
# works on older releases.
y_in = df['train'][cols['target'][0]]
# The per-class row subsets do not depend on the feature pair, so they are
# filtered once instead of on every iteration.
class_subsets = [
    (df['train'][y_in == cls], color)
    for cls, color in ((0, "g"), (1, "r"))
]
for c1, c2 in itertools.combinations(cols['features'], 2):
    for subset, color in class_subsets:
        sns.jointplot(x=c1, y=c2,
                      data=subset,
                      kind="kde", space=0, color=color)
        plt.show()

## select k best features using a univariate test (ANOVA F-test, `f_classif`)

In [None]:
# copied from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html#sphx-glr-auto-examples-text-document-classification-20newsgroups-py
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Timing uses the `time` module imported in the first cell. The previous
# `from time import time` rebound the name `time` to a function, shadowing
# that module for every cell executed afterwards.
t0 = time.perf_counter()
# Keep the 3 features with the highest ANOVA F-score (f_classif). Passing
# `chi2` instead would select by chi-squared statistic; the variable keeps its
# original name `ch2` so any cell referring to it still works.
ch2 = SelectKBest(f_classif, k=3)
X_train = ch2.fit_transform(df['train'][cols['features']], df['train'][cols['target'][0]])
# Map the selector's column indices back to feature names.
selected_features = [cols['features'][i] for i in ch2.get_support(indices=True)]
print("done in %fs" % (time.perf_counter() - t0))

selected_features

## calculate mutual information
Reference: [sklearn.feature_selection.mutual_info_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)

In [None]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between every feature and the target.
# The estimator draws random numbers (scikit-learn adds small noise to
# continuous features), so a fixed random_state is passed to make the scores
# and the resulting ranking reproducible across runs.
result = mutual_info_classif(
    df['train'][cols['features']],
    df['train'][cols['target'][0]],
    # alternative target, to hint at collinearity: df['train'][cols['features'][0]]
    discrete_features='auto',
    n_neighbors=20,
    copy=True,
    random_state=0
)
# One row per feature, highest mutual information first.
pd.DataFrame({'mi': result, 'feat': cols['features']}).sort_values('mi', ascending=False)