In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgbm
from lightgbm import *

One of the most commonly used terms in financial market data analysis is "regime". A regime refers to a market environment. The market environment is known to vary over time, and it can be divided into categories such as markets with a sustained uptrend, markets with little fluctuation, and so on. Knowing the current regime is useful for forecasting: for example, it lets us infer that prices are more likely to rise in a bull market than in an ordinary market environment.

There are various methods to estimate the market regime, but in this study, I used clustering by the k-means method, which is one of the unsupervised learning methods, to estimate the regime.


In [None]:
# Load the training data from the Kaggle input dataset (parquet file named
# "train_low_mem"); the path is relative to the notebook's working directory.
df = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and report the savings.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Per-column rules (decided from the NumPy dtype kind):
      * signed ints   -> smallest of int8/int16/int32/int64 that holds min..max
      * unsigned ints -> smallest of uint8/uint16/uint32/uint64 that holds max
      * floats        -> float16 if min..max fits, else float32, else float64
      * object        -> pandas ``category``
      * anything else (bool, datetime, timedelta, complex, pandas extension
        dtypes such as an existing ``category``) is left untouched.

    Range checks are inclusive, so a value sitting exactly on a type's limit
    (e.g. 127 for int8) still qualifies for that type.

    NOTE: the float rule only looks at the value range, not at precision;
    float16 keeps roughly 3 significant decimal digits, so downcasting is lossy.

    Args:
        df: pandas DataFrame to shrink.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        # Pandas extension dtypes (category, nullable ints, ...) are not plain
        # NumPy dtypes; leave them alone rather than risk a lossy/failed cast.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind

        if kind == 'O':
            df[col] = series.astype('category')

        elif kind in ('i', 'u'):
            if series.empty:
                continue  # no values to measure; keep the current dtype
            # Python ints avoid any float rounding when comparing 64-bit limits.
            c_min, c_max = int(series.min()), int(series.max())
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = series.astype(candidate)
                    break

        elif kind == 'f':
            # min()/max() skip NaN; an all-NaN or empty column yields NaN here,
            # fails every comparison below and therefore ends up as float64.
            c_min, c_max = series.min(), series.max()
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = series.astype(candidate)
                    break
            else:
                df[col] = series.astype(np.float64)

        # Remaining kinds (bool, datetime, timedelta, complex, ...) are kept
        # as they are: casting them to a float type would corrupt the data.

    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# Shrink column dtypes before any further processing; the function prints the
# memory footprint before and after and returns the DataFrame.
df = reduce_mem_usage(df)

Divide the data frame into train, val, and test data.

In [None]:
from sklearn.model_selection import KFold, train_test_split


# Names of the 300 feature columns (f_0 ... f_299) and of the target column.
target = 'target'
features = [f'f_{idx}' for idx in range(300)]

# Discard the rows labelled 0..10000 (inclusive) before splitting.
# NOTE(review): the reason for skipping these rows is not stated - confirm.
df = df.drop(index=range(10001))

df_features = df[features]

# First cut: the leading 60% of rows become the training set. shuffle=False
# keeps the original row order, so the remaining 40% are the trailing rows.
X_train, X, Y_train, Y = train_test_split(
    df_features,
    df[target],
    train_size=0.6,
    shuffle=False,
)

# Rebind the big frames to tiny placeholders so their memory can be reclaimed.
df = [[]]
df_features = [[]]

# Second cut: halve the remainder into validation and test sets, again in order.
X_val, X_test, Y_val, Y_test = train_test_split(
    X,
    Y,
    train_size=0.5,
    shuffle=False,
)

Find the optimal number of clusters for k-means.

In [None]:

from sklearn.cluster import KMeans
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Elbow method: fit k-means for k = 1..9 on the first 10,000 training rows and
# record the within-cluster sum of squares (inertia) of each fit.
cluster_range = range(1, 10)
elbow_sample = X_train.head(10000)

wcss = []
for i in cluster_range:
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        n_init=30,
        random_state=0,
    ).fit(elbow_sample)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k; the "elbow" is where the curve starts to flatten.
fig, ax = plt.subplots()
ax.plot(cluster_range, wcss)
ax.set_title('The elbow method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In the elbow method, the best number of clusters is the point at which the WCSS curve stops dropping steeply and its slope becomes gentle (the "elbow"). In this case, the optimal number of clusters is 5.

Now that we know the optimal number of clusters is 5, I fit a k-means model with n_clusters=5 and add the predicted cluster label to the feature set.

In [None]:
# modeling
# Fit k-means (k=5) on the first 10,000 training rows. Only the original
# `features` columns are used for fitting and prediction, so re-running this
# cell after the 'k-means' column has been added does not feed the previous
# cluster label back into the clustering.
clf = KMeans(n_clusters=5, random_state=0)
clf.fit(X_train.head(10000)[features])

# Assign every row of each split to its nearest centroid and store the cluster
# id as an extra feature column named 'k-means'.
y_pred_train = clf.predict(X_train[features])
X_train['k-means'] = y_pred_train
y_pred_val = clf.predict(X_val[features])
X_val['k-means'] = y_pred_val
y_pred_test = clf.predict(X_test[features])
X_test['k-means'] = y_pred_test

In [None]:
import warnings
import numpy as np
import lightgbm as lgb
from scipy.stats import pearsonr

warnings.simplefilter('ignore')

# Wrap the splits for LightGBM; the validation set references the training set
# so that both share the same feature binning.
lgb_train = lgb.Dataset(X_train, Y_train)
lgb_eval = lgb.Dataset(X_val, Y_val, reference=lgb_train)

params = {'seed': 1,
          'verbose' : -1,
           'objective': "regression",
           'learning_rate': 0.02,
           'bagging_fraction': 0.1,
           'bagging_freq': 1,
           'feature_fraction': 0.1,
           'max_depth': 6,
           'min_child_samples': 50,
           'num_leaves': 64}

# Early stopping and log silencing are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments of lgb.train
# were removed in LightGBM 4.0, while the callbacks work on 3.3+ and 4.x.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1100,
                valid_sets=[lgb_eval],
                callbacks=[
                    # stop once the validation metric has not improved for 10 rounds
                    lgb.early_stopping(stopping_rounds=10, verbose=False),
                    # period=0 disables per-iteration evaluation logging
                    lgb.log_evaluation(period=0),
                ],
                )


# Predict on the held-out test split using the best iteration found above.
Y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# pearsonr returns (correlation, p-value); only the correlation is reported.
# The score is computed on the test split, so it is labelled accordingly.
score_tuple = pearsonr(Y_test, Y_pred)
score = score_tuple[0]
print(f"Test Pearsonr score : {score:.4f}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Gain-based importance of every feature used by the trained booster.
feature = gbm.feature_importance(importance_type='gain')

# Tabular view of the importances, plus a copy sorted from most to least important.
f = pd.DataFrame({'number': range(len(feature)), 'feature': feature[:]})
f2 = f.sort_values('feature', ascending=False)

# Feature names, in the column order the model was trained on.
label = X_train.columns[0:]

# Positions of the features ordered by descending importance.
indices = np.argsort(feature)[::-1]

# Print "<rank>   <name>   <gain>" for every feature, best first.
for rank, idx in enumerate(indices, start=1):
    print(rank, label[idx], feature[idx], sep="   ")


When we check the 282nd position in the feature-importance ranking, we can see that "k-means" is indeed effective.

