The purpose of this notebook is to perform predictive analytics on the data provided for the Jane Street Market Prediction competition.

### Import the relevant packages and libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, accuracy_score
# CatBoost model
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
# import lightgbm as lgb

### DataSets Loading 

In [None]:
# Note: datatable.fread(path).to_pandas() is faster than pd.read_csv() (pd.read_csv is what this notebook uses)

In [None]:
# Directory prefix that is prepended to every CSV file name when loading data.
folder_path = '../input/jane-street-market-prediction/'

In [None]:
%%time
# Read the four CSV files; train.csv is capped at its first 1.8 million rows.
train_df = pd.read_csv(f'{folder_path}train.csv', nrows=1_800_000)
features_df = pd.read_csv(f'{folder_path}features.csv')
sample_df = pd.read_csv(f'{folder_path}example_sample_submission.csv')
test_data_df = pd.read_csv(f'{folder_path}example_test.csv')

### Data Analysis

In [None]:
    train_df.info()

In [None]:
train_df

In [None]:
train_df.describe()

In [None]:
test_data_df.describe()

In [None]:
train_df

In [None]:
# Split the column names by role, using substring matches on the name:
# 'feature' marks the model inputs, 'resp' marks the response columns.
all_columns = list(train_df.columns)
features = [name for name in all_columns if 'feature' in name]
resps = [name for name in all_columns if 'resp' in name]

In [None]:
# Keep only rows with a non-zero weight. The explicit .copy() makes the
# filtered frame independent of the frame it was sliced from, so assigning a
# new column to it afterwards does not raise pandas' SettingWithCopyWarning.
train_df = train_df[train_df['weight'] != 0].copy()

In [None]:
# Binary target: 1 where resp is strictly positive, 0 otherwise (NaN -> 0).
# The comparison is vectorised instead of calling a Python lambda per row.
train_df['action'] = (train_df['resp'] > 0).astype(int)

In [None]:
# Per-column median of every feature; used as the fill value for missing
# entries both when building the training matrix and at prediction time.
train_df_median = train_df[features].median()

In [None]:
# Feature matrix with every NaN replaced by the median of its column.
X = train_df[features].fillna(train_df_median)

In [None]:
# Target vector: the binary 'action' column.
y = train_df['action']

In [None]:
# X, y and the feature medians have been extracted; drop the name so the
# raw frame can be garbage-collected once nothing else references it.
del train_df

### Data reduction

In [None]:
# PCA is sensitive to scale, so every feature is first standardised to zero
# mean and unit variance. The fitted scaler is kept for transforming new data.
scaler = StandardScaler()
x_norm = scaler.fit_transform(X)

# Fit a PCA that keeps all components. fit() returns the estimator itself,
# so `comp` and `pca` refer to the same fitted object.
pca = PCA()
comp = pca.fit(x_norm)



In [None]:
# Plot the cumulative explained-variance ratio of the fitted PCA against the
# number of principal components. The x axis is the 0-based array index, so
# position k corresponds to the first k + 1 components.
plt.plot(np.cumsum(comp.explained_variance_ratio_))
plt.grid()
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance')
sns.despine();

# Author's reading of the plot:
# The first 15 principal components explain about 80% of the variation
# The first 40 principal components explain about 95% of the variation

In [None]:
# Project the standardised features onto the first 50 principal components.
# With fewer components than features scikit-learn may select its randomized
# SVD solver, so the seed is fixed to make the projection (and every model
# trained on it) reproducible from run to run.
pca = PCA(n_components=50, random_state=42).fit(x_norm)
x_transform = pca.transform(x_norm)

In [None]:
x_norm

In [None]:
x_transform

### Machine Learning

In [None]:
# Base estimator and the 5-fold stratified, shuffled split used by the search.
model = CatBoostClassifier()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=108)

# Grid explored by the search. Only 'depth' and 'learning_rate' hold more
# than one candidate; the other entries pin a single value.
# Wider ranges tried previously:
#   depth: [6, 8, 10, 15]
#   learning_rate: [0.01, 0.02, 0.03, 0.04, 0.05]
# Options currently disabled:
#   l2_leaf_reg: np.logspace(-20, -19, 3)
#   leaf_estimation_iterations: [10]
#   use_best_model: ['True']
#   logging_level: ['Silent']
params = {
    'iterations': [100],
    'depth': [10, 15],
    'loss_function': ['Logloss'],
    'learning_rate': [0.03, 0.05],
    'eval_metric': ['Accuracy'],
    'task_type': ['GPU'],
    'random_seed': [42],
}

# Candidates are ranked by plain classification accuracy.
scorer = make_scorer(accuracy_score)
clf = GridSearchCV(model, params, scoring=scorer, cv=cv)

In [None]:
#model = CatBoostClassifier(iterations=10000, task_type="GPU", learning_rate=0.05, l2_leaf_reg=3.5, depth=11, loss_function= 'Logloss', eval_metric='AUC',use_best_model=True,random_seed=42)

In [None]:
# Hold out 20% of the PCA-transformed rows as validation data (xval / yval);
# the remaining 80% (xtrain / ytrain) is used for training.
xtrain,xval, ytrain, yval = train_test_split(x_transform, y,train_size=0.8,random_state=42)
# No categorical features are involved: every input column is a continuous PCA component.

In [None]:
# Run the grid search. The extra keyword arguments are forwarded by
# GridSearchCV to each CatBoost fit call, so every candidate/fold is given
# (xval, yval) as its eval set with use_best_model enabled.
clf.fit(xtrain, ytrain, use_best_model=True, eval_set=[(xval, yval)])

In [None]:
# Parameter combination that the grid search ranked best; reused when the
# final model is built.
best_param = clf.best_params_
best_param

In [None]:
# Estimator that the grid search refitted with the best parameter combination.
best_estimator = clf.best_estimator_
best_estimator

In [None]:
# Mean cross-validated score (accuracy, per the scorer) of the best combination.
best_score = clf.best_score_
best_score

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance scores, one per feature.
        names: Sequence of feature names, aligned with ``importance``.
        model_type: Model label used as the prefix of the chart title.

    Returns:
        None. The chart is drawn on a new matplotlib figure.
    """
    # Pair every feature name with its score and rank by decreasing importance.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by='feature_importance', ascending=False)

    # One bar per feature, drawn on a fresh 10x8 figure.
    plt.figure(figsize=(10, 8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    # The space after the label keeps it from running into the title text
    # (previously 'CATBOOST' rendered as 'CATBOOSTFEATURE IMPORTANCE').
    plt.title(f'{model_type} FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
#plot the catboost result
#plot_feature_importance(clf.get_feature_importance(),train.columns,'CATBOOST')

In [None]:
# Final model: reuse the loss function, depth and learning rate selected by
# the grid search, with a budget of 1000 iterations. use_best_model=True is
# set to limit overfitting.
# Options left at their defaults (previously considered): l2_leaf_reg,
# leaf_estimation_iterations, logging_level.
besthyperparameters = CatBoostClassifier(
    iterations=1000,
    depth=best_param['depth'],
    learning_rate=best_param['learning_rate'],
    loss_function=best_param['loss_function'],
    eval_metric='Accuracy',
    use_best_model=True,
    task_type="GPU",
    random_seed=42,
)

In [None]:
# make the x for train and test (also called validation data)
#xtrain,xval, ytrain, yval = train_test_split(x_transform, y,train_size=0.8,random_state=42)
# (split kept for reference only; no categorical features are involved, the inputs are continuous PCA components)


In [None]:
# Train the final model on the training split; (xval, yval) is passed as the
# eval set that CatBoost monitors during training.
besthyperparameters.fit(xtrain, ytrain, eval_set=(xval, yval))

In [None]:
import scikitplot as skplt

In [None]:
# Evaluate on the held-out validation split only. Predicting on all of
# x_transform would include the rows the model was trained on and make the
# confusion matrix look better than the model really is.
# NOTE(review): xval also served as the eval set during training, so this is
# still a mildly optimistic estimate.
y_pred = besthyperparameters.predict(xval)
skplt.metrics.plot_confusion_matrix(yval, y_pred, normalize=True)

In [None]:
test_data_df

In [None]:
sample_df

### Data Submission

In [None]:
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
# X_test_transform


In [None]:
def fillna_npwhere(array, values):
    """Replace the NaN entries of ``array`` with the matching entries of ``values``.

    ``values`` is broadcast against ``array`` by ``np.where``. When the sum of
    ``array`` is not NaN the input object itself is returned untouched;
    otherwise a new array is returned.
    """
    # Cheap pre-check: a single NaN anywhere makes the total NaN.
    if not np.isnan(array.sum()):
        return array
    return np.where(np.isnan(array), values, array)

In [None]:
%%time
# The imputation values never change between iterations, so the array is
# built once here instead of on every pass through the loop.
median_values = train_df_median[features].values

for (test_df, sample_prediction_df) in iter_test:
    # Read the weight straight from its column; test_df.iloc[0] would first
    # materialise the whole first row as a Series.
    wt = test_df['weight'].iloc[0]
    if wt == 0:
        # Zero-weight rows skip the model and are submitted with action 0.
        sample_prediction_df.action = 0
    else:
        # Same pipeline as training: median imputation -> scaling -> PCA.
        x_test = fillna_npwhere(test_df[features].values, median_values)
        sample_prediction_df.action = besthyperparameters.predict(
            pca.transform(scaler.transform(x_test))
        )
    env.predict(sample_prediction_df)

In [None]:
#%%time
#for (test_df, #sample_prediction_df) in iter_test:
    #sample_prediction_df["action"] = clf.predict(X_test_transform[features]).astype(int)
 #   sample_prediction_df["action"] = clf.predict(np.transpose(X_test_transform)).astype(int)
    #env.predict(sample_prediction_df)