In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read the Data
We read the training data into a DataFrame using the pandas library.

In [None]:
# Location of the competition's training set, relative to the working directory.
train_csv_path = "../input/jane-street-market-prediction/train.csv"
df = pd.read_csv(train_csv_path)

# Sample

We draw a random sample of the rows with non-zero weight so that the exploratory analysis runs quickly.

In [None]:
sample_sz=10000
# Rows whose weight is zero are excluded before sampling.
df_positive=df[df["weight"]!=0]
# Fixed seed: every "Restart & Run All" draws the same rows, so the plots and
# model results below are reproducible. n is capped at the number of eligible
# rows because sampling without replacement raises when n exceeds it.
sample=df_positive.sample(n=min(sample_sz, len(df_positive)), random_state=42)

# Use the feature columns

We use only the columns whose names contain the word "feature".

In [None]:
# Predictor columns: every column of df whose name contains "feature".
feature_cols = list(filter(lambda name: "feature" in name, df.columns))
# Preview the first rows of the sample restricted to those columns.
sample.loc[:, feature_cols].head()

# Missing Values

* We observe that there are missing values in the dataset, especially in the feature columns.

Some questions that arise:

* How many missing values are there?
* Can we impute the missing values?
* Do we simply need a model that works with missing data?

In [None]:
from matplotlib import pyplot as plt

# Number of NaNs in each feature column of the sample, largest count first.
null_mask = sample[feature_cols].isnull()
missing_values = null_mask.sum().sort_values(ascending=False)

# Histogram over the features: x is a feature's NaN count, y is how many
# features fall into that bin (raw counts, not a density).
fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(missing_values.values, bins=10, density=False)
ax.set_xlabel('#Missing Values')
ax.set_ylabel('#Features');

In [None]:
# missing_values is sorted in descending order, so the first positions hold
# the features with the most NaNs. Slice from position 0 so that the single
# worst feature is included and exactly 15 features are returned.
top_missing_features=missing_values.iloc[:15]
top_missing_features

In [None]:
import matplotlib.pyplot as plt


# Pairwise correlation between all feature columns of the sample.
corr=sample[feature_cols].corr()

# Pass matshow the number of the figure created here rather than a literal 1:
# a hard-coded number only refers to this figure when no other figure is open,
# otherwise the matrix would be drawn onto a different figure.
fig=plt.figure(figsize=(10,10))
plt.matshow(corr,fignum=fig.number)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16)
plt.show()

# Yay! Some Correlations

* We observe that there is significant correlation between some variables.
* In some cases there is also significant negative correlation.

# Let's inspect feature_100

In [None]:
# Flatten the feature correlation matrix into one value per ordered
# (Feature1, Feature2) pair, strongest positive correlation first.
c=sample[feature_cols].corr()
pairs=c.unstack().sort_values(ascending=False)
pairs.index.names=('Feature1','Feature2')

# Remove each feature's correlation with itself by comparing the two index
# levels. Filtering on the value (`!= 1.0`) would also discard distinct
# features that are perfectly correlated, and it relies on exact float equality.
is_self_pair=pairs.index.get_level_values('Feature1') == pairs.index.get_level_values('Feature2')
pairs=pd.DataFrame(pairs[~is_self_pair])
pairs.columns=["Correlation"]

# Correlations of feature_100 with every other feature.
pairs.loc[pairs.index.get_level_values('Feature1') == "feature_100"]

# Can we impute the missing values?

* Using the Mean Imputer

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer

# Mean imputation: NaNs are filled using the 'mean' strategy, with the
# imputer fitted on the very same sample it then transforms.
features_with_nans=sample[feature_cols]
si=SimpleImputer(strategy='mean', missing_values=np.nan)
data=si.fit(features_with_nans).transform(features_with_nans)

In [None]:
# Re-wrap the imputed values in a DataFrame that carries the feature names as
# columns and the sample's own row index. Keeping the index means the rows stay
# label-aligned with `sample` (and anything derived from it) instead of being
# renumbered from 0.
data=pd.DataFrame(data, columns=feature_cols, index=sample.index)
# Remaining NaNs per column; expected to be 0 everywhere after imputation.
data.isnull().sum()

# Feature Selection

Are all features useful?

We can try:

* Stepwise Elimination
* Model Based
* Information Gain

In [None]:
from sklearn.ensemble import RandomForestClassifier

X=data
# Binary target: 1 where the response is positive, 0 otherwise.
y=(sample['resp']>0).astype(int)
# Seed the forest so that repeated runs build the same trees and therefore
# give the same fitted model.
clf=RandomForestClassifier(random_state=42)
clf.fit(X,y)

In [None]:
from sklearn.feature_selection import SelectFromModel

# The selector fits its own forest on X, y. That forest is seeded so the set of
# selected features is the same on every run rather than varying with the
# random state of the trees.
sfm = SelectFromModel(estimator=RandomForestClassifier(random_state=42)).fit(X,y)

In [None]:
# Despite the variable name, the selector is asked for a boolean mask
# (indices=False, which is also the default) rather than integer positions.
selected_feature_indices = sfm.get_support(indices=False)

In [None]:
# Names of the feature columns retained by the fitted selector.
selected_features=data.columns[selected_feature_indices]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the sample for validation, using the default split
# proportions. The shuffle is seeded so the same rows land in each part on
# every run, which keeps the reported metrics comparable between runs.
x_train, x_valid, y_train, y_valid= train_test_split(X,y, random_state=42)

In [None]:
# Fit the classifier on the training split only (clf was previously fitted on all of X).
# NOTE(review): every feature column is used here; `selected_features` is never applied — confirm this is intended.
clf.fit(x_train,y_train)

In [None]:
# Display the dimensions of the training split.
x_train.shape

In [None]:
# Predict a class label for every row of the validation split.
y_pred=clf.predict(x_valid)

In [None]:
from sklearn.metrics import classification_report

# classification_report takes the ground-truth labels first and the
# predictions second; with the two swapped, precision and recall trade places
# and the support column counts predictions instead of true labels.
print(classification_report(y_valid,y_pred))

# Work in progress