In [None]:
import pickle
import janestreet
import pdb

def test_model(model, cols_to_drop):
    """Feed every batch of the janestreet test iterator through `model` and submit the result.

    Parameters
    ----------
    model : object exposing ``predict(frame)``; the result must support ``.astype(int)``.
    cols_to_drop : column labels removed from each test batch (after 'date') before predicting.

    NOTE(review): ``.astype(int)`` converts the raw ``predict`` output to integers without
    any thresholding, so a model that returns continuous scores would not be rounded to
    the nearest class — confirm this is what is wanted for non-classifier models.
    """
    environment = janestreet.make_env()  # initialize the environment
    # iter_test() yields (test batch, prediction frame) pairs
    for batch_df, prediction_df in environment.iter_test():
        features = batch_df.drop(columns = ['date']).drop(columns = cols_to_drop)
        prediction_df["action"] = model.predict(features).astype(int)
        environment.predict(prediction_df)

In [None]:
import pandas as pd
import numpy as np
import pdb

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


#from sklearn import cross_validation, metrics   #Additional sklearn functions
#from sklearn.grid_search import GridSearchCV   #Performing grid search
import matplotlib.pylab as plt

from matplotlib.pylab import rcParams
# Default size for every matplotlib figure created afterwards: width 12, height 4.
rcParams['figure.figsize'] = 12, 4

def linear_reg(X_train, y_train):
    """Fit an ordinary least-squares linear regression on the training data.

    Parameters
    ----------
    X_train : feature matrix passed straight to ``LinearRegression.fit``.
    y_train : target values passed straight to ``LinearRegression.fit``.

    Returns
    -------
    The fitted ``LinearRegression`` estimator.
    """
    # Imported here because the notebook's import cell only pulls LogisticRegression
    # from sklearn.linear_model; without this the call raised NameError.
    from sklearn.linear_model import LinearRegression

    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

def logistic_regression(X_train, y_train):
    """Fit a logistic-regression classifier (random_state fixed to 0) and return it."""
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)
    return classifier

def knn(X_train, y_train):
    """Fit a 5-nearest-neighbours classifier using the Minkowski metric with p=2 and return it."""
    neighbour_settings = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
    classifier = KNeighborsClassifier(**neighbour_settings)
    classifier.fit(X_train, y_train)
    return classifier

def svm(X_train, y_train, kernel):
    """Fit a support-vector classifier with the requested kernel and return it.

    `kernel` is forwarded unchanged to ``SVC`` (the visible caller passes 'linear' or 'rbf');
    random_state is fixed to 0.
    """
    classifier = SVC(kernel=kernel, random_state=0)
    classifier.fit(X_train, y_train)
    return classifier

def naive_bayes(X_train, y_train):
    """Fit a Gaussian naive-Bayes classifier with default settings and return it."""
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return classifier

def random_forest(X_train, y_train):
    """Fit a 10-tree random forest (entropy split criterion, random_state 0) and return it."""
    forest_settings = {'n_estimators': 10, 'criterion': 'entropy', 'random_state': 0}
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, y_train)
    return forest


def random_forest_time(X_train,y_train, X_test, y_test):
    """Fit a ``TimeSeriesForest`` with default settings on the training data and return it.

    `X_test` and `y_test` are accepted but not used by this function.

    NOTE(review): ``TimeSeriesForest`` is not imported in any of the visible cells, so
    calling this raises NameError unless the name is defined elsewhere — confirm the
    intended import before using this function.
    """
    forest = TimeSeriesForest()
    forest.fit(X_train, y_train)
    return forest


#Best params for LGBM after hyper parameter tuning with Optuna
def light_gbm(X_train, y_train):
    """Fit an ``LGBMClassifier`` with the fixed hyper-parameters below and return it."""
    tuned_params = {
        'boosting_type': 'gbdt',
        'lambda_l1': 0.9656,
        'lambda_l2': 3.164298687541461e-06,
        'num_leaves': 58,
        'feature_fraction': 0.5677801334905795,
        'bagging_fraction': 0.5522201790927705,
        'bagging_freq': 2,
        'min_child_samples': 83,
        'min_child_weight': 100.0,
        'scale_pos_weight': 1,
    }
    classifier = lgb.LGBMClassifier(**tuned_params)
    classifier.fit(X_train, y_train)
    return classifier
    
#Best params for XGB after hyper parameter tuning with Optuna
def xgb(X_train, y_train):
    """Fit an ``XGBClassifier`` with the fixed hyper-parameters below and return it.

    NOTE: defining this function rebinds the name ``xgb``, which the same cell earlier
    bound to the xgboost module via ``import xgboost as xgb``; after this definition
    ``xgb`` refers to the function, not the module.
    """
    tuned_params = {
        'n_estimators': 453,
        'max_depth': 9,
        'learning_rate': 0.02509726938336729,
        'subsample': 0.5231595958438151,
        'colsample_bytree': 0.5572267372335459,
        'gamma': 19,
        'scale_pos_weight': 0.9992243147907729,
        'missing': -999,
        'nthread': 4,
        'verbosity': 0,
        'objective': 'binary:logistic',
        'use_label_encoder': False,
    }
    classifier = XGBClassifier(**tuned_params)
    classifier.fit(X_train, y_train)
    return classifier



In [None]:
import numpy as np
import pandas as pd
import datatable as dt
import time
import pdb
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

#This function creates a pickle file for faster reading. Only needs to be called once.
def read_data():
    '''
    Load data/train.csv with datatable, convert it to a pandas DataFrame,
    write that frame to data/train_pickle.pkl and return it.
    '''
    train_data = dt.fread('data/train.csv').to_pandas()
    train_data.to_pickle('data/train_pickle.pkl')
    return train_data

#This function maps resp to a binary action: 1 when resp is positive, 0 otherwise (including resp == 0)
def discretization_resp(data):
    '''
    Create the binary action column from resp and drop the date/resp columns.

    action is 0 where resp <= 0 and 1 everywhere else.
    Dropped columns: date, resp, resp_1, resp_2, resp_3, resp_4.

    The input frame is not modified; a new DataFrame is returned with action
    as an additional column.
    '''
    # Compute the target first, while resp is still available.
    action = np.where(data["resp"] <= 0, 0, 1)
    drop_features = ['date','resp','resp_1', 'resp_2', 'resp_3', 'resp_4']
    # drop() returns a new frame, so adding the column below leaves the caller's frame untouched.
    reduced = data.drop(columns = drop_features)
    reduced['action'] = action
    return reduced

#This function replaces all missing values with mean of that column/feature
def missing_values(X):
    """Replace NaN entries of each column with that column's mean.

    Parameters
    ----------
    X : pandas DataFrame; its column labels are reused for the result.

    Returns
    -------
    A new DataFrame holding the imputed values under the original column labels
    (built from the imputer's array output, so the row index is a fresh default index).

    NOTE(review): with the mean strategy SimpleImputer's default settings discard
    columns that contain only missing values, in which case the column labels no
    longer match the output — confirm the data has no all-NaN columns.
    """
    # Capture the labels before X is replaced by a bare array; this name was
    # previously undefined here and made the function raise NameError.
    col_names = X.columns
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed = imputer.fit_transform(X)
    return pd.DataFrame(imputed, columns = col_names)
    
#This function reduces the dimensions using PCA
def dimensionality_reduction(X, pca_obj = None):
    '''
    Project X onto principal components.

    If a PCA object is passed in the arguments, transform the data with that object -
    used to project test data with the PCA fitted on the training data.
    Otherwise fit a new PCA(n_components = 0.9) on X and transform X with it -
    used for the training data.

    Returns a tuple (DataFrame of the transformed data, PCA object that was used).
    '''
    # Identity check: "!= None" would dispatch to the object's own comparison method.
    if pca_obj is not None:
        pca = pca_obj
        X = pca.transform(X)
    else:
        pca = PCA(n_components = 0.9)
        X = pca.fit_transform(X)

    return pd.DataFrame(X), pca

#This function computes the mean and variance of every column (the values are currently unused) and returns the data unchanged
def featrue_engineering(df):
    """Compute the mean and variance of every column of `df`, then return `df` itself.

    NOTE(review): the computed statistics are neither returned nor stored, so the
    caller receives the input frame unchanged — confirm whether they were meant to
    be added as features.
    """
    # One (mean, variance) pair per column, evaluated in column order.
    column_stats = [(np.mean(df[name]), np.var(df[name])) for name in df.columns]
    return df

#This function standardizes each feature with StandardScaler (zero mean, unit variance), not to the range [0,1]
def feature_scaling(X):
    """Fit a ``StandardScaler`` on X and return the scaled values as a DataFrame.

    The result reuses X's column labels; it is built from the scaler's array output,
    so the row index is a fresh default index.
    """
    scaled_values = StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, columns = X.columns)

#This function reduces the dimensions by computing feature correlations 
def feature_correlations(X, features_path = "/kaggle/input/jane-street-market-prediction/features.csv"):
    '''
    Pick features to drop because they are highly correlated with another feature of the same group.

    The file at features_path must have a "feature" column holding feature names; every other
    column is treated as a group flag, and the features flagged True in a column form one group.
    Within each group the absolute pairwise correlations of the matching columns of X are
    computed, and for every feature that is still kept, all other features of the group whose
    absolute correlation with it exceeds 0.9 are marked for removal.

    Parameters:
        X: DataFrame containing the feature columns named in the file.
        features_path: CSV file describing the groups (defaults to the Kaggle input path).

    Returns:
        List of column names to drop, in the order they were selected.
    '''
    features = pd.read_csv(features_path)
    columns_to_drop = []
    # Set mirror of columns_to_drop for constant-time membership checks.
    already_dropped = set()
    for column in features.columns:
        # The name column itself is not a group flag (comparing names with True selects nothing).
        if column == "feature":
            continue
        # Equality with True (not truthiness) so only genuine True flags select a feature.
        group = features.loc[features[column] == True, "feature"].tolist()
        if not group:
            continue
        correlations = X[group].corr().abs()
        for col in correlations.columns:
            # A feature already marked for removal must not knock out further features.
            if col in already_dropped:
                continue
            remove_col = correlations[correlations[col] > 0.9].index
            for element in remove_col:
                # Skip the feature itself (self-correlation) and anything already marked.
                if element != col and element not in already_dropped:
                    already_dropped.add(element)
                    columns_to_drop.append(element)
    return columns_to_drop

#This function removes the rows that contain outliers, judged by column-wise z-scores
def remove_outliers(data, threshold = 4):
    '''
    Keep only the rows whose absolute z-score is below threshold in every column.

    z-scores come from scipy.stats.zscore with nan_policy='omit'. The surviving rows are
    returned with a fresh 0..n-1 index.

    NOTE(review): a NaN z-score compares False against the threshold, so a row with a
    NaN z-score in any column is removed as well - confirm that dropping such rows is
    intended and not a side effect.
    '''
    abs_z = np.abs(stats.zscore(data, nan_policy='omit'))
    within_limit = (abs_z < threshold).all(axis=1)
    return data[within_limit].reset_index(drop=True)


def main():
    """Train each model named in `models_list` and submit its predictions.

    Steps: load the pickled training frame, order it by date and ts_id, build the
    binary action target, filter rows with remove_outliers, drop the columns chosen
    by feature_correlations, then for every configured model name fit the model and
    pass it to test_model together with the dropped column names.

    Raises
    ------
    ValueError: if `models_list` contains a name with no registered trainer.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
    data = pd.read_pickle('/kaggle/input/reduce-train-data/red_train_pickle.pkl')
    data = data.sort_values(by = ['date','ts_id'])
    data = data.set_index('ts_id')

    data = discretization_resp(data)
    data = remove_outliers(data)

    X_train = data.loc[:, data.columns != 'action']
    y_train = data['action']

    cols_to_remove = feature_correlations(X_train)
    X_train = X_train.drop(columns = cols_to_remove)

    # Model name -> callable(X_train, y_train) returning a fitted model.
    trainers = {
        'lr': logistic_regression,
        'lin': linear_reg,
        'knn': knn,
        'svm': lambda X, y: svm(X, y, kernel = 'linear'),
        'rbf_svm': lambda X, y: svm(X, y, kernel = 'rbf'),
        'nb': naive_bayes,
        'rf': random_forest,
        'xgb': xgb,
        'lgbm': light_gbm,
    }

    models_list = ['lin','lgbm','xgb']
    for model_name in models_list:
        print("Model: ",model_name)
        trainer = trainers.get(model_name)
        if trainer is None:
            # Previously an unknown name fell through silently, leaving `model` unbound
            # or still pointing at the model from the previous iteration.
            raise ValueError("Unknown model name: " + repr(model_name))
        model = trainer(X_train, y_train)
        test_model(model, cols_to_remove)
    
# Run the full pipeline only when this code is executed as the top-level program.
if __name__ == "__main__":
    main()
