In [None]:
# import data
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from scipy.stats import probplot, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

# Column -> dtype map handed to the CSV parser so the frame is read with
# compact types: the 300 anonymised features and the target as float32,
# the two id columns as uint16.
train_dtypes = dict.fromkeys((f'f_{i}' for i in range(300)), np.float32)
train_dtypes.update({
    'investment_id': np.uint16,
    'time_id': np.uint16,
    'target': np.float32,
})

# Only the columns named in the dtype map are read from disk.
df_train = pd.read_csv(
    '../input/ubiquant-market-prediction/train.csv',
    usecols=list(train_dtypes),
    dtype=train_dtypes,
)
print(f'Training Set Shape: {df_train.shape} - Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')


## Reduce memory usage of dataset

In [None]:
#reduce memory usage of data
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from collections import defaultdict
from argparse import Namespace
import lightgbm as lgb
# Run-wide settings collected in a single attribute-style container.
_settings = {
    'seed': 21,
    'folds': 5,
    'workers': 4,
    'samples': 2500000,
}
args = Namespace(**_settings)
del _settings
    
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` to narrower dtypes to save memory.

    The frame is modified in place and also returned.

    * ``object`` columns are converted to ``category``.
    * Signed integer columns are narrowed to the smallest of
      int8/int16/int32 that holds their min and max.
    * Unsigned integer columns are narrowed to the smallest of
      uint8/uint16/uint32 that holds their min and max. They stay integers,
      so id-like columns are never rounded by a float conversion.
    * Float columns are narrowed to float16 (if ``use_float16``) or float32
      when their min and max lie inside that type's finite range.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched, which also makes a second call on the same frame safe.

    A column is never widened. Columns whose min/max are NaN (empty or
    all-NaN) or infinite are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Column labels are expected to be unique.
    use_float16 : bool, default True
        Allow float16 as a target for float columns. Only the value *range*
        is checked, so float16 keeps roughly three significant decimal
        digits; pass ``False`` to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes, narrowest first, keyed by numpy dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast; taking min/max
        # of e.g. an unordered categorical would raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            candidates, limits_of = int_candidates[col_type.kind], np.iinfo

        for candidate in candidates:
            # Candidates are ordered by size: once one is as wide as the
            # current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value exactly at the limit still fits.
            # Comparisons with NaN are False, so such columns are skipped.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df



## Preprocessing data

In [None]:
# Preprocessing: drop one feature column and one time window, then downcast.

# f_124 is removed because, per the earlier EDA, its variance is far below 0.1.
# errors='ignore' lets this cell be re-run after the column is already gone.
df_train = df_train.drop(columns='f_124', errors='ignore')

# Remove rows with time_id in 350..550 (inclusive); the EDA flagged this
# stretch as an oscillating period.
in_dropped_window = (df_train['time_id'] > 349) & (df_train['time_id'] < 551)
df_train = df_train[~in_dropped_window]
del in_dropped_window

# Every remaining column whose name contains "f_" is used as a model feature.
num_features = list(df_train.filter(like="f_").columns)
features = num_features
df_train = reduce_mem_usage(df_train)
len(features)


## Correlation between features and target

In [None]:
# Rank the features by the absolute value of their correlation with the
# target and keep the 100 strongest.
df = df_train
cor_t = df[features].apply(lambda feature_col: feature_col.corr(df['target']))

# Two-column table (feature name, |correlation|) on a fresh positional index.
cr = pd.DataFrame({
    'feature': pd.Series(features),
    'correlation': pd.Series(cor_t.abs().tolist()),
})

temp = cr.sort_values(by="correlation", ascending=False).iloc[:100]
print(temp.iloc[:20])

features_first100 = temp['feature'].tolist()
df_train_filtered = df_train.loc[:, features_first100]
del df
df_train_filtered.shape

In [None]:
#isomap embedding 

#from sklearn import manifold

#iso = manifold.Isomap(n_neighbors=80, n_components=50)
#df_features_ISO= iso.fit_transform(df_features_filtered.head(10000))

#iso.reconstruction_error()

#LLEembeding=manifold.LocallyLinearEmbedding(n_neighbors=100, n_components=2)
#df_features_LLE=LLEembeding.fit_transform(df_features_filtered.head(10000))



## XGBRegressor

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from matplotlib import pyplot as plt
##X = df_train_filtered[features_first100]
X = df_train[features]
y = df_train['target']

# Hold out 25% of the rows for validation.
# NOTE(review): this is a random row split that ignores time_id ordering —
# confirm that is intended for this data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
del X,y
from xgboost.sklearn import XGBRegressor
xgb_model = XGBRegressor(objective='reg:squarederror',n_estimators=100, max_depth=8, learning_rate=0.1, random_state=0)
xgb_model.fit(X_train, y_train)
y_predict = xgb_model.predict(X_test)

# Score on the rows the model was fitted on.
score = xgb_model.score(X_train, y_train)
print("Training score: ", score)

# Mean squared error on the held-out rows.
mse = mean_squared_error(y_test,y_predict)
print("MSE: ", mse)
#scores = cross_val_score(xgb_model, X_train, y_train)
#print("Mean cross-validation score: %.2f" % scores.mean())

#kfold = KFold(n_splits=5, shuffle=True)
#kf_cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=kfold )
#print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

#sorted_idx = xgb_model.feature_importances_.argsort()
#plt.barh(X_train.columns[sorted_idx], xgb_model.feature_importances_[sorted_idx])
#plt.xlabel("Xgboost Feature Importance")


# Feature importances, most important first. `features` is the column list
# the model was trained on, so it lines up with feature_importances_.
features_importance_df= pd.DataFrame({'feature': features, 'importance': xgb_model.feature_importances_}).sort_values(by="importance", ascending=False)

# Number of features per panel; the titles are derived from it so they
# cannot drift from what is actually plotted.
top_n = 25
fig, (ax_head, ax_tail) = plt.subplots(1, 2, figsize=(14, 8))
sns.barplot(x="importance", y="feature", data=features_importance_df.head(top_n), ax=ax_head)
ax_head.set_title(f'{top_n} Head XGBRegressor Features')
sns.barplot(x="importance", y="feature", data=features_importance_df.tail(top_n), ax=ax_tail)
ax_tail.set_title(f'{top_n} Tail XGBRegressor Features')
fig.tight_layout()
plt.show()


In [None]:
#from sklearn.kernel_ridge import KernelRidge
#from sklearn.metrics.pairwise import rbf_kernel
#KR = KernelRidge(kernel ='rbf', alpha=1.0,gamma=0.8)

#KR.fit(X_train,y_train)
#preds = KR.predict(X_test)
#mse = np.sqrt(mean_squared_error(y_test,preds))
#print("MSE: ", mse)

In [None]:
#import shap
#shap.initjs()
#explainer = shap.TreeExplainer(xgb_model)
#shap_values = explainer.shap_values(X_train)

#shap.summary_plot(shap_values, features=X_train, feature_names=X_train.columns)

## Submission

In [None]:
import ubiquant

# Competition environment and its iterator over test batches.
env = ubiquant.make_env()
iter_test = env.iter_test()

# For each (features, prediction template) pair: predict with the trained
# model, write the predictions into the template's 'target' column, hand the
# template back to the environment, then show it.
for test_batch, submission_batch in iter_test:
    batch_pred = xgb_model.predict(test_batch[features])
    submission_batch['target'] = batch_pred
    env.predict(submission_batch)
    display(submission_batch)