In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.offline as pyo
sns.set(color_codes=True) 

#Predictive Modeling
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn import metrics

# Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score, precision_recall_curve,roc_auc_score
from sklearn.model_selection import train_test_split,GridSearchCV
# Suppress warnings
import warnings; warnings.filterwarnings('ignore')

# Visualize Tree
from sklearn.tree import export_graphviz
from IPython.display import Image
from os import system

# Display settings: lift the pandas row/column display caps to 10000
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 10000)

# One seed for NumPy's global RNG; also handed to estimators via random_state
random_state = 42
np.random.seed(random_state)
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%time
train = pd.read_csv("../input/jane-street-market-prediction/train.csv",nrows=1e5)

In [None]:
# Report the dimensions of the loaded sample
n_rows, n_cols = train.shape
print(f"Train data has {n_rows} rows and {n_cols} features")

In [None]:
# Preview the first rows of the loaded sample
train.head()

In [None]:
# Summary of the frame: column dtypes and memory usage
train.info()

In [None]:
# Descriptive statistics, transposed so that each column of `train` is a row
train.describe().transpose()

In [None]:
# Number of missing values in each column of the dataframe
train.isna().sum()

In [None]:
# Percentage of missing values per column, highest first, limited to the top 60 columns
missing_counts = train.isna().sum().sort_values(ascending=False)
missing_pct = missing_counts * 100 / train.shape[0]
nullvaluecheck = pd.DataFrame(missing_pct, columns=['missing %']).head(60)
nullvaluecheck.style.background_gradient(cmap='PuBu')

In [None]:
from time import time
import itertools
import warnings

# Analyze the body of the distributions: one panel per column listed in nullvaluecheck
cols = list(nullvaluecheck.index)
n_plot_cols = 4
# Ceiling division so that every column gets a panel, whatever len(cols) is
n_plot_rows = max(-(-len(cols) // n_plot_cols), 1)
fig = plt.figure(figsize=(17, 4 * n_plot_rows))
for position, col in enumerate(cols, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    # Use the same colour for every panel (the previous zip_longest(..., ["c"])
    # only coloured the first panel and passed color=None to all the others)
    ax = sns.distplot(train[col], color="c")
    plt.axvline(train[col].mean(), linestyle="dashed", label="mean", color="k")
    plt.legend(loc="best")

The distribution plots show that, for most features, the majority of values are concentrated around the mean. We will therefore impute the missing values with the mean of each feature.

In [None]:
# Columns that contain at least one missing value, and the full column order of `train`
null_columns = train.columns[train.isnull().any()]
columns = list(train.columns)

print('Descriptive Stats for columns with missing values before imputation : \n', '--'*30)
display(train[null_columns].describe().T)

# Using SimpleImputer to fill missing values by mean.
# The `verbose` argument is not passed: it was deprecated and later removed from
# SimpleImputer, so supplying it raises a TypeError on recent scikit-learn releases.
impute = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns a bare array; restore the column names and keep the original index
train = pd.DataFrame(impute.fit_transform(train), columns=columns, index=train.index)

print('Descriptive Stats after imputation: \n', '--'*30)
display(train[null_columns].describe().T)

del null_columns

Calculation of the target variable

In [None]:
# Keep only rows with a non-zero weight. `.copy()` makes the result an independent
# frame, so the column assignment below does not write to a slice of the old frame
# (which triggers pandas' SettingWithCopyWarning).
train = train[train['weight'] != 0].copy()
# Binary target: 1 when resp is positive, 0 otherwise
train['action'] = (train['resp'] > 0).astype(int)
train.action.value_counts()

In [None]:
import plotly.express as px

# Class balance of the binary target.
# (The unused `px.data.tips()` sample-dataset load was dropped.)
fig = px.histogram(train, x="action")
fig.show()


In [None]:
# Scatter each resp_1..resp_4 column against weight, coloured by the target.
# The x-axis label is taken from the column name itself; labelling it from the
# enumerate() index was off by one (resp_1 was shown as "resp_0").
for col in [f'resp_{i}' for i in range(1, 5)]:
    fig = px.scatter(x=train[col], y=train['weight'], color=train['action'],
                     labels={"x": col, "y": "weight", "color": "action"},
                     title="Response variable Vs Weight")
    fig.show()


The distribution looks roughly normal, without much bias.

In [None]:
# Predictors: only the 'feature*' columns go into X.
# Every other column is excluded: 'action' is derived directly from 'resp', so keeping
# 'resp' (or the related resp_1..resp_4) among the predictors leaks the target, and the
# prediction loop at the end of the notebook only uses the 'feature' columns anyway.
feature_columns = train.columns[train.columns.str.contains('feature')]
X = train[feature_columns]

# Copy the 'action' column alone into the y dataframe. This is the dependent variable
y = train[["action"]]

# Applying standardization to the predictor variables
from sklearn.preprocessing import LabelEncoder, StandardScaler
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
# Restore column names and keep X's index so rows stay aligned with y
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

**Principal Component Analysis**
Principal Component Analysis (PCA) is a method for identifying patterns in data based on the similarities and dissimilarities between the sample points. Patterns within data are hard to find, especially when the data cannot be visualized graphically. PCA is a powerful tool for exploring data, uncovering its hidden patterns, and reducing its dimensions.

Steps of Principal Component Analysis
* Standardize all the data variables to a common scale (zero mean and unit variance, as done with `StandardScaler` above)
* Compute the covariance matrix of the standardized data
* Compute the eigenvalues and corresponding eigenvectors of the covariance matrix
* Arrange the eigenvalues, with their corresponding eigenvectors, in descending order. 
Eigenvectors with higher eigenvalues carry more of the variance in the data and form the principal components, whereas eigenvectors with lower eigenvalues can be removed in order to reduce the dimensions

In [None]:
# Covariance matrix of the standardized predictors (variables as rows, hence the transpose)
cov_matrix = np.cov(X_scaled.T)
# f-string so the matrix is actually interpolated; print('...%s', x) printed a literal "%s"
print(f'Covariance Matrix \n{cov_matrix}')

In [None]:
# Eigen values and vectors
# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues, whereas
# the general-purpose eig may return complex values with tiny imaginary parts that
# cannot be sorted in the next cell.
eig_vals, eig_vecs = np.linalg.eigh(cov_matrix)
# eigh returns eigenvalues in ascending order; flip so the largest come first
eig_vals, eig_vecs = eig_vals[::-1], eig_vecs[:, ::-1]
# f-strings so the arrays are actually interpolated (print('...%s', x) printed a literal "%s")
print(f'Eigen Vectors \n{eig_vecs}')
print(f'\n Eigen Values \n{eig_vals}')

In [None]:
# Percentage of total variance per eigenvalue (largest first) and its running total
tot = sum(eig_vals)
var_exp = []
for eig_val in sorted(eig_vals, reverse=True):
    var_exp.append((eig_val / tot) * 100)
cum_var_exp = np.cumsum(var_exp)

print('Cumulative Variance Explained', cum_var_exp)

In [None]:
# Scree plot: percentage of variance explained by each component.
# The y-axis shows var_exp (percentages), not raw eigenvalues, and components are
# numbered from 1 to match the bar chart in the next cell.
fig, ax = plt.subplots(figsize=(15, 7.2))
ax.plot(range(1, len(var_exp) + 1), var_exp)
ax.set_xlabel('# of Components')
ax.set_ylabel('Explained Variance (%)')
ax.set_title('Scree plot')
plt.show()

In [None]:
# Individual vs. cumulative explained variance, with reference lines at 95% and 6 components
component_numbers = range(1, eig_vals.size + 1)
fig, ax = plt.subplots(figsize=(15, 7.2))
ax.bar(component_numbers, var_exp, alpha=0.5, align='center', label='Individual explained variance')
ax.step(component_numbers, cum_var_exp, where='mid', label='Cumulative explained variance')
ax.axhline(y=95, color='r', linestyle='--')
ax.axvline(x=6, color='r', linestyle='--')
ax.set_ylabel('Explained Variance Ratio')
ax.set_xlabel('Principal Components')
ax.legend(loc='best')
fig.tight_layout()
plt.show()

In [None]:
# Reducing the dimensions to 6
pca = PCA(n_components=6, random_state=random_state)
# fit_transform already fits the model; the separate pca.fit() call before it was redundant work
X_reduced = pca.fit_transform(X_scaled)
display(X_reduced.shape)

In [None]:
# Inspect the component vectors learned by the fitted PCA
pca.components_

In [None]:
# Pairwise scatter plots of the PCA-reduced components, with KDE curves on the diagonal
reduced_components = pd.DataFrame(X_reduced)
sns.pairplot(reduced_components, diag_kind='kde')

Let's apply the following set of algorithms for our prediction:
1. Logistic Regression
2. Gaussian Naive Bayes Classifier
3. Support Vector Machine
4. K-nearest neighbors Classifier
5. Decision Tree Classifier
6. Random Forest Classifier


In [None]:
# Let's create a generic method to train and test the model
def run_classification(estimator, X_train, X_test, y_train, y_test, prec_rcl=True):
    """Fit ``estimator``, print train/test metrics and plot diagnostic charts.

    Parameters
    ----------
    estimator : classifier exposing ``fit`` / ``predict``
        Fitted in place on the training data.
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Binary labels (1 / 0). Single-column frames are flattened to 1-D so
        that estimators do not receive a column vector.
    prec_rcl : bool, default True
        When True, also plot the ROC curve. This requires the estimator to
        implement ``predict_proba``; pass False for estimators that do not.

    Returns
    -------
    None
        Results are printed and plotted.

    Notes
    -----
    Timing relies on the module-level ``timer`` object, which must exist
    before this function is called.
    """
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    timer.start()
    # train the model
    clf = estimator.fit(X_train, y_train)
    # predict from the classifier
    y_pred = clf.predict(X_test)
    print('Estimator:', clf)
    print('='*80)
    print('Training accuracy: %.2f%%' % (accuracy_score(y_train, clf.predict(X_train)) * 100))
    print('Testing accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
    print('='*80)
    print('Classification report:\n %s' % (classification_report(y_test, y_pred)))
    print(timer.stop(), 'to run the model')

    # Confusion matrix with the positive class (1) listed first
    cm = metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt='g')
    plt.title('Confusion matrix')
    plt.show()

    if prec_rcl:
        print('='*80)
        # Probability of the positive class drives both the curve and its area.
        # Scoring the AUC from hard predictions (y_pred) reported an area that
        # did not match the plotted curve.
        y_score = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_score)
        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        plt.figure()
        plt.plot(fpr, tpr, label='(area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.show()

In [None]:
# Hold out 20% of the PCA-projected samples as a test set
split_parts = train_test_split(X_reduced, y, test_size=0.2, random_state=1)
X_reduced_train, X_reduced_test, y_train, y_test = split_parts
tuple(part.shape for part in split_parts)

In [None]:
#Utilities
from time import time
# A class that logs the time
class Timer():
    '''
    A generic class to log the time.

    Call start() to record the current time, then stop() to get a formatted
    string with the number of seconds elapsed since the last start().
    '''
    def __init__(self):
        # Timestamp of the most recent start(); None until start() is called
        self.start_ts = None

    def start(self):
        '''Record the current time as the reference point.'''
        self.start_ts = time()

    def stop(self):
        '''Return "Time taken: <seconds>s" with two decimal places.

        Raises RuntimeError if start() has not been called first.
        '''
        if self.start_ts is None:
            raise RuntimeError('Timer.stop() called before Timer.start()')
        # '%.2f' gives two decimals; the previous '%2f' only set a minimum
        # width and printed six decimals
        return 'Time taken: %.2fs' % (time() - self.start_ts)

timer = Timer()

In [None]:
# Logistic Regression on the PCA-reduced train/test split
logistic_model = LogisticRegression()
run_classification(logistic_model, X_reduced_train, X_reduced_test, y_train, y_test)

In [None]:
# Gaussian Naive Bayes on the PCA-reduced train/test split
naive_bayes_model = GaussianNB()
run_classification(naive_bayes_model, X_reduced_train, X_reduced_test, y_train, y_test)

In [None]:
# Support Vector Classifier (RBF kernel) on the PCA-reduced split.
# The ROC section is switched off (prec_rcl=False) for this estimator.
svc_model = SVC(C=1, kernel='rbf', gamma=1)
run_classification(svc_model, X_reduced_train, X_reduced_test, y_train, y_test, prec_rcl=False)

In [None]:
# K-nearest neighbors (k = 5) on the PCA-reduced train/test split
knn_model = KNeighborsClassifier(n_neighbors=5)
run_classification(knn_model, X_reduced_train, X_reduced_test, y_train, y_test)

In [None]:
# Decision tree (gini criterion, depth capped at 7) on the PCA-reduced split
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=7)
run_classification(tree_model, X_reduced_train, X_reduced_test, y_train, y_test)

In [None]:
# Random forest with default hyper-parameters on the PCA-reduced split
forest_model = RandomForestClassifier()
run_classification(forest_model, X_reduced_train, X_reduced_test, y_train, y_test)

**Hyperparameter Tuning**

The Random Forest Classifier comes out as the best-performing algorithm of all, so we choose it for further model tuning. We will use RandomizedSearchCV for the tuning.

In [None]:
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# specify parameters and distributions to sample from
# max_features cannot usefully exceed the number of input columns, so its upper
# bound follows the width of the reduced training matrix. The previous fixed range
# of 1..10 sampled values above the number of PCA components.
n_reduced_features = X_reduced_train.shape[1]
param_dist = {"max_depth": [3, None],
              # scipy's randint excludes the upper bound, hence the +1
              "max_features": sp_randint(1, n_reduced_features + 1),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}


In [None]:
# build a classifier
# Base random forest with 50 trees; its other hyper-parameters are searched in the next cell
clf = RandomForestClassifier(n_estimators=50)

In [None]:
# run randomized search
samples = 10  # number of random parameter settings to try
# cv is left at scikit-learn's default (5-fold since 0.22; it was 3-fold in older releases).
# random_state makes the sampled parameter settings reproducible.
randomCV = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=samples,
                              random_state=random_state)
# Flatten y so the estimator receives a 1-D label array instead of a column vector
randomCV.fit(X_reduced_train, np.ravel(y_train))
print(randomCV.best_params_)

In [None]:
# Evaluate a random forest with fixed hyper-parameters
# (presumably copied from a RandomizedSearchCV result; verify against the search output)
tuned_rf_params = dict(n_estimators=50, bootstrap=True, criterion='gini', max_depth=None,
                       max_features=2, min_samples_leaf=10, min_samples_split=10)
run_classification(RandomForestClassifier(**tuned_rf_params), X_reduced_train, X_reduced_test, y_train, y_test)

In [None]:
# Final model used for prediction later in the notebook.
# It has to be fitted here: constructing the estimator alone leaves it untrained,
# and calling predict() on it would raise NotFittedError.
clf = RandomForestClassifier(n_estimators=50, bootstrap=True, criterion='gini', max_depth=None,
                             max_features=2, min_samples_leaf=10, min_samples_split=10)
clf.fit(X_reduced_train, np.ravel(y_train))

In [None]:
#import janestreet
#env = janestreet.make_env() # initialize the environment
#iter_test = env.iter_test() # an iterator which loops over the test set

#for (test_df, sample_prediction_df) in iter_test:
   #sample_prediction_df.action = 0
    #X = test_df
   #null_columns = X.columns[train.isnull().any()]; columns = list(X.columns)
    #impute = SimpleImputer(missing_values = np.nan, strategy = 'mean', verbose = 1)
   # X = pd.DataFrame(impute.fit_transform(X), columns = columns)
   #X_scaled = sc.fit_transform(X)
   # X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
   # pca.fit(X_scaled)
   # X_reduced = pca.fit_transform(X_scaled)
   # sample_prediction_df.action=clf.predict(X_reduced)
   # env.predict(sample_prediction_df)*/

In [None]:
# Load the auxiliary competition files shipped alongside train.csv.
# NOTE: `sample_prediction_df` is rebound by the prediction loop in the next cell.
features = pd.read_csv('../input/jane-street-market-prediction/features.csv')
example_test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv')
sample_prediction_df = pd.read_csv('../input/jane-street-market-prediction/example_sample_submission.csv')

In [None]:
import janestreet
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# The competition environment and its test iterator must be created before the loop;
# `env` and `iter_test` were otherwise undefined in this notebook.
env = janestreet.make_env()   # initialize the environment
iter_test = env.iter_test()   # an iterator which loops over the test set

# Only the 'feature' columns are present at prediction time, so the whole
# preprocessing + model chain is fitted on exactly those training columns.
feature_cols = [col for col in train.columns if 'feature' in col]

# One pipeline: impute -> scale -> PCA -> classifier, fitted ONCE on the training data.
# clone() copies the configuration of the scaler, PCA and classifier defined earlier
# without reusing their fitted state. Re-fitting the imputer/scaler/PCA on every test
# batch (as the loop previously did) projects each batch into a different space than
# the one the classifier was trained in, and the classifier itself was never fitted.
submission_model = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    clone(sc),
    clone(pca),
    clone(clf),
)
submission_model.fit(train[feature_cols], train['action'])

for (test_df, sample_prediction_df) in iter_test:
    # Same columns, in the same order, as used for fitting
    X_test = test_df.loc[:, feature_cols]
    # Training-set statistics are applied here (transform only, no re-fitting)
    sample_prediction_df.action = submission_model.predict(X_test)
    env.predict(sample_prediction_df)

***Please upvote if you find this notebook useful :)***