In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install talib-binary

In [None]:
import talib

In [None]:
import calendar
import time
from datetime import datetime


def totimestamp(s):
    """Convert a ``dd/mm/YYYY`` date string to a Unix timestamp in seconds.

    The date is read as midnight UTC. ``calendar.timegm`` is used instead of
    ``time.mktime`` because the latter interprets the date in the machine's
    local timezone, so the same string produced different timestamps on
    differently configured hosts.

    Parameters
    ----------
    s : str
        Date formatted as ``"%d/%m/%Y"``, e.g. ``"01/01/2021"``.

    Returns
    -------
    numpy.int32
        Seconds since the Unix epoch. The int32 width is kept for
        compatibility and therefore only covers dates before 2038-01-19.

    Raises
    ------
    ValueError
        If ``s`` does not match the expected format.
    """
    parsed = datetime.strptime(s, "%d/%m/%Y")
    return np.int32(calendar.timegm(parsed.timetuple()))

In [None]:
# Folder holding the competition CSV files; each file below is read from it.
data_folder = "../input/g-research-crypto-forecasting/"

# Read the four CSV files in a fixed order and bind each to its own name.
crypto_df, asset_details_df, supp_train_df, expl_test = (
    pd.read_csv(data_folder + csv_name)
    for csv_name in (
        'train.csv',
        'asset_details.csv',
        'supplemental_train.csv',
        'example_test.csv',
    )
)

In [None]:
# Replace spaces and dots in the asset names with underscores.
# ``regex=False`` forces a literal match: as a regular expression '.' matches
# every character, and the default value of ``regex`` differs between pandas
# versions, so being explicit keeps the result identical everywhere.
asset_details_df["Asset_Name"] = (
    asset_details_df["Asset_Name"]
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '_', regex=False)
)

In [None]:
# Keep a copy in which Asset_ID is still a regular column.
asset_details = asset_details_df.copy()
# Rebind asset_details_df to a version indexed by Asset_ID.
# NOTE(review): this cell is not idempotent - on a second run "Asset_ID" is
# already the index, so set_index raises KeyError and asset_details would no
# longer carry an Asset_ID column.
asset_details_df = asset_details_df.set_index("Asset_ID")

In [None]:
crypto_df

In [None]:
# Split the combined training frame into one timestamp-indexed frame per asset.
# Each frame is stored in `dataframes` under its Asset_ID and is also bound to
# a module-level variable named after the asset.
dataframes = {}
asset_pairs = zip(asset_details["Asset_ID"], asset_details["Asset_Name"])
for asset_id, asset_name in asset_pairs:
    per_asset = crypto_df.loc[crypto_df["Asset_ID"] == asset_id].set_index("timestamp")
    vars()[asset_name] = per_asset
    dataframes[asset_id] = per_asset

# Put every asset on a regular grid with a step of 60 between its first and
# last timestamp; grid points missing from the data are forward-filled.
# (presumably timestamps are epoch seconds, i.e. one row per minute - confirm)
cleaned_dataframes = {}
for asset_id, frame in dataframes.items():
    regular_grid = range(frame.index[0], frame.index[-1] + 60, 60)
    cleaned_dataframes[asset_id] = frame.reindex(regular_grid, method='pad')
    

In [None]:
# Reduce the datasets to ease the correlation calculation: keep only the most
# recent rows of every gap-filled asset frame.
N_RECENT_ROWS = 10000

# `.copy()` detaches each window from its parent frame. The indicator
# functions add columns to these frames in place, which on a bare `.iloc`
# slice triggers SettingWithCopyWarning.
reduced_dataframes = {
    asset_id: frame.iloc[-N_RECENT_ROWS:].copy()
    for asset_id, frame in cleaned_dataframes.items()
}

In [None]:
# Snapshot of the reduced frames at this point of the notebook.
# A plain `reduced_dataframes_base = reduced_dataframes` only creates a second
# name for the same dict (and the same DataFrame objects), so the "base" view
# would silently follow every later in-place change. Copy each frame instead.
reduced_dataframes_base = {
    asset_id: frame.copy() for asset_id, frame in reduced_dataframes.items()
}
reduced_dataframes_base[1]

In [None]:
# Column names of the gap-filled frame stored under key 2.
original_columns = list(cleaned_dataframes[2].columns)

Next, let's generate the technical indicators, grouped by indicator type.

In [None]:
momentum_indicators_columns = [
    'adx_mm', 'adxr_mm', 'aroondown_mm', 'aroonup_mm', 'bop_mm', 'mfi_mm',
    'cci_mm', 'cmo_mm', 'rsi_mm', 'mdi_mm', 'pdi_mm', 'mdm_mm', 'pdm_mm',
    'dx_mm', 'roc_mm', 'rocp_mm', 'will_mm',
]


def get_momentum_indicators(df):
    """Add the TA-Lib momentum indicators to ``df`` in place.

    Reads the 'Open', 'High', 'Low', 'Close' and 'Volume' columns and writes
    one new column (suffix ``_mm``) per indicator, in the order listed in
    ``momentum_indicators_columns``. Returns None.
    """
    open_, high, low, close, volume = (
        df[name] for name in ('Open', 'High', 'Low', 'Close', 'Volume')
    )
    hl = (high, low)
    hlc = (high, low, close)

    df["adx_mm"] = talib.ADX(*hlc, timeperiod=14)  # Average Directional Movement Index
    df["adxr_mm"] = talib.ADXR(*hlc, timeperiod=14)  # Average Directional Movement Index Rating
    aroon_down, aroon_up = talib.AROON(*hl, timeperiod=14)  # Aroon
    df["aroondown_mm"] = aroon_down
    df["aroonup_mm"] = aroon_up
    df["bop_mm"] = talib.BOP(open_, *hlc)  # Balance Of Power
    df["mfi_mm"] = talib.MFI(*hlc, volume, timeperiod=14)  # Money Flow Index
    df["cci_mm"] = talib.CCI(*hlc, timeperiod=14)  # Commodity Channel Index
    df["cmo_mm"] = talib.CMO(close, timeperiod=14)  # Chande Momentum Oscillator
    df["rsi_mm"] = talib.RSI(close, timeperiod=14)  # Relative Strength Index
    df["mdi_mm"] = talib.MINUS_DI(*hlc, timeperiod=14)  # Minus Directional Indicator
    df["pdi_mm"] = talib.PLUS_DI(*hlc, timeperiod=14)  # Plus Directional Indicator
    df["mdm_mm"] = talib.MINUS_DM(*hl, timeperiod=14)  # Minus Directional Movement
    df["pdm_mm"] = talib.PLUS_DM(*hl, timeperiod=14)  # Plus Directional Movement
    df["dx_mm"] = talib.DX(*hlc, timeperiod=14)  # Directional Movement Index
    df["roc_mm"] = talib.ROC(close, timeperiod=10)  # Rate of change
    df["rocp_mm"] = talib.ROCP(close, timeperiod=10)  # Rate of change Percentage
    df["will_mm"] = talib.WILLR(*hlc, timeperiod=14)  # Williams' %R
    

In [None]:
volatility_indicators_columns = ['atr_vlt', 'natr_vlt', 'trange_vlt']


def get_volatility_indicators(df):
    """Add the TA-Lib volatility indicators to ``df`` in place.

    Reads 'High', 'Low' and 'Close' and writes 'atr_vlt' (ATR, 14 periods),
    'natr_vlt' (NATR, 14 periods) and 'trange_vlt' (TRANGE). Returns None.
    """
    hlc = [df[name] for name in ('High', 'Low', 'Close')]

    df['atr_vlt'] = talib.ATR(*hlc, timeperiod=14)
    df['natr_vlt'] = talib.NATR(*hlc, timeperiod=14)
    df['trange_vlt'] = talib.TRANGE(*hlc)

In [None]:
volume_indicators_columns = ['ad_vol', 'adosc_vol', 'obv_vol']


def get_volume_indicators(df):
    """Add the TA-Lib volume indicators to ``df`` in place.

    Reads 'High', 'Low', 'Close' and 'Volume' and writes 'ad_vol' (AD),
    'adosc_vol' (ADOSC, fast period 5 / slow period 20) and 'obv_vol' (OBV).
    Returns None.
    """
    close, volume = df['Close'], df['Volume']
    hlcv = (df['High'], df['Low'], close, volume)

    df['ad_vol'] = talib.AD(*hlcv)
    df['adosc_vol'] = talib.ADOSC(*hlcv, fastperiod=5, slowperiod=20)
    df['obv_vol'] = talib.OBV(close, volume)

In [None]:
cycle_indicators_columns = [
    'htdcpe_cyc', 'htdcpa_cyc', 'htsn_cyc', 'htinph_cyc', 'httrnd_cyc',
]


def get_cycle_indicators(df):
    """Add the TA-Lib Hilbert-transform indicators to ``df`` in place.

    Only the 'Close' column is read. Seven columns are written: the five
    listed in ``cycle_indicators_columns`` plus 'htldsn_cyc' and 'htquad_cyc',
    the second outputs of HT_SINE and HT_PHASOR, which are not part of that
    list. Returns None.
    """
    close = df['Close']

    df['htdcpe_cyc'] = talib.HT_DCPERIOD(close)
    df['htdcpa_cyc'] = talib.HT_DCPHASE(close)

    sine_first, sine_second = talib.HT_SINE(close)
    df['htsn_cyc'] = sine_first
    df['htldsn_cyc'] = sine_second

    phasor_first, phasor_second = talib.HT_PHASOR(close)
    df['htinph_cyc'] = phasor_first
    df['htquad_cyc'] = phasor_second

    df['httrnd_cyc'] = talib.HT_TRENDLINE(close)

In [None]:
statitics_functions_columns = [
    'beta_stat', 'correl_stat', 'stddev_stat', 'tsf_stat', 'var_stat',
]


def get_statitics_functions(df):
    """Add the TA-Lib statistic functions to ``df`` in place.

    BETA (5 periods) and CORREL (30 periods) are computed between 'High' and
    'Low'; STDDEV (5 periods), TSF (14 periods) and VAR (5 periods) are
    computed on 'Close'. Returns None.
    """
    high_low = (df['High'], df['Low'])
    close = df['Close']

    df['beta_stat'] = talib.BETA(*high_low, timeperiod=5)
    df['correl_stat'] = talib.CORREL(*high_low, timeperiod=30)
    df['stddev_stat'] = talib.STDDEV(close, timeperiod=5, nbdev=1)
    df['tsf_stat'] = talib.TSF(close, timeperiod=14)
    df['var_stat'] = talib.VAR(close, timeperiod=5, nbdev=1)

In [None]:
overlap_functions_columns = ['bbup_ovlp','kama_ovlp','sar_ovlp']
def get_overlap_functions(df, sar_acceleration=0.02, sar_maximum=0.2):
    """Add the TA-Lib overlap studies to ``df`` in place.

    Writes the three Bollinger bands ('bbup_ovlp', 'bbmidl_ovlp',
    'bblow_ovlp'), 'kama_ovlp' and 'sar_ovlp'. Only 'bbup_ovlp', 'kama_ovlp'
    and 'sar_ovlp' are listed in ``overlap_functions_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'High', 'Low' and 'Close' columns; modified in place.
    sar_acceleration, sar_maximum : float
        Acceleration factor and its cap for the Parabolic SAR. The previous
        hard-coded ``acceleration=0, maximum=0`` froze the acceleration factor
        at zero, so the SAR never moved towards the price. The defaults are
        the conventional 0.02 / 0.2; pass 0 for both to reproduce the old
        column values.

    Returns
    -------
    None
    """
    close = df['Close']

    df['bbup_ovlp'], df['bbmidl_ovlp'], df['bblow_ovlp'] = talib.BBANDS(close, timeperiod=10, nbdevup=2, nbdevdn=2, matype=0)
    df['kama_ovlp'] = talib.KAMA(close, timeperiod=30)
    #df['mama_ovlp'], df['fama'] = talib.MAMA(df['Close'], fastlimit=0, slowlimit=0)
    df['sar_ovlp'] = talib.SAR(df['High'], df['Low'], acceleration=sar_acceleration, maximum=sar_maximum)
    

In [None]:
price_transformations = ['avg_price', 'med_price', 'typ_price', 'weight_price']


def get_price_transformations(df):
    """Add the TA-Lib price transforms to ``df`` in place.

    Writes 'avg_price' (AVGPRICE of Open/High/Low/Close), 'med_price'
    (MEDPRICE of High/Low), 'typ_price' (TYPPRICE of High/Low/Close) and
    'weight_price' (WCLPRICE of High/Low/Close). Returns None.
    """
    high, low, close = df['High'], df['Low'], df['Close']

    df['avg_price'] = talib.AVGPRICE(df['Open'], high, low, close)
    df['med_price'] = talib.MEDPRICE(high, low)
    df['typ_price'] = talib.TYPPRICE(high, low, close)
    df['weight_price'] = talib.WCLPRICE(high, low, close)

In [None]:
pattern_recognition_columns = ['cdl2crows',
       'cdl3blackcrows', 'cdl3inside', 'cdl3linestrike', 'cdl3outside',
       'cdl3starsinsouth', 'cdl3whitesoldiers', 'cdlabandonedbaby',
       'cdladvanceblock', 'cdlbelthold', 'cdlbreakaway', 'cdlclosingmarubozu',
       'cdlconcealbabyswall', 'cdlcounterattack', 'cdldarkcloudcover',
       'cdldoji', 'cdldojistar', 'cdldragonflydoji', 'cdlengulfing',
       'cdleveningdojistar', 'cdleveningstar', 'cdlgapsidesidewhite',
       'cdlgravestonedoji', 'cdlhammer', 'cdlhangingman', 'cdlharami',
       'cdlharamicross', 'cdlhighwave', 'cdlhikkake', 'cdlhikkakemod',
       'cdlhomingpigeon', 'cdlidentical3crows', 'cdlinneck',
       'cdlinvertedhammer', 'cdlkicking', 'cdlkickingbylength',
       'cdlladderbottom', 'cdllongleggeddoji', 'cdllongline', 'cdlmarubozu',
       'cdlmatchinglow', 'cdlmathold', 'cdlmorningdojistar', 'cdlmorningstar',
       'cdlonneck', 'cdlpiercing', 'cdlrickshawman', 'cdlrisefall3methods',
       'cdlseparatinglines', 'cdlshootingstar', 'cdlshortline',
       'cdlspinningtop', 'cdlstalledpattern', 'cdlsticksandwich', 'cdltakuri',
       'cdltasukigap', 'cdlthrusting', 'cdltristar', 'cdlunique3river',
       'cdlupsidegap2crows', 'cdlxsidegap3methods']

# Patterns whose TA-Lib function is called with an extra `penetration` keyword.
_PATTERNS_WITH_PENETRATION = frozenset({
    'cdlabandonedbaby', 'cdldarkcloudcover', 'cdleveningdojistar',
    'cdleveningstar', 'cdlmathold', 'cdlmorningdojistar', 'cdlmorningstar',
})


def get_pattern_recognition(df, penetration=0):
    """Add one candlestick-pattern column per TA-Lib CDL* function to ``df``.

    Every name in ``pattern_recognition_columns`` is the lower-cased name of
    the TA-Lib function that produces it, so the columns are generated in a
    loop (in list order). This replaces a hand-written assignment per pattern,
    which had computed CDL3BLACKCROWS twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Open', 'High', 'Low' and 'Close' columns; modified in
        place.
    penetration : float
        Value forwarded to the seven patterns in ``_PATTERNS_WITH_PENETRATION``.
        The default 0 reproduces the values the notebook has always used.
        NOTE(review): TA-Lib's own defaults for these patterns are believed to
        be non-zero (0.3 / 0.5) - confirm that 0 is intended.

    Returns
    -------
    None
    """
    ohlc = (df['Open'], df['High'], df['Low'], df['Close'])
    for column in pattern_recognition_columns:
        pattern_fn = getattr(talib, column.upper())
        extra = {'penetration': penetration} if column in _PATTERNS_WITH_PENETRATION else {}
        df[column] = pattern_fn(*ohlc, **extra)

In [None]:
# Enrich every reduced asset frame, in place, with all indicator families.
# The builders run in this fixed order for each frame.
indicator_builders = (
    get_momentum_indicators,
    get_volatility_indicators,
    get_volume_indicators,
    get_cycle_indicators,
    get_statitics_functions,
    get_overlap_functions,
    get_price_transformations,
    get_pattern_recognition,
)
for frame in reduced_dataframes.values():
    for add_indicators in indicator_builders:
        add_indicators(frame)

Instead of using all the KPIs, let's keep only the useful ones. Below, we start by deleting the indicators that are either zero or null across the whole dataset.

In [None]:
# Snapshot of the frames with every indicator column still present.
# A plain `reduced_dataframes_full = reduced_dataframes` would only alias the
# same dict and DataFrames, so the "full" view would lose columns and rows as
# soon as later cells drop them in place. Copy each frame instead.
reduced_dataframes_full = {
    asset_id: frame.copy() for asset_id, frame in reduced_dataframes.items()
}
reduced_dataframes_full[1].info()

In [None]:
# Remove candlestick-pattern features that carry no information: a pattern
# column whose values are all zero or missing in an asset's frame is dropped
# from that frame. The surviving names per asset are recorded in
# pttrn_cols_keep, the dropped ones in pttrn_cols_remove.

pttrn_cols_remove = {}
pttrn_cols_keep = {}
for asset_id, frame in reduced_dataframes.items():
    # True for a pattern column that holds at least one non-zero value.
    fired = frame.loc[:, pattern_recognition_columns].fillna(0).astype(bool).any()
    pttrn_cols_keep[asset_id] = fired.index[fired.values].tolist()
    pttrn_cols_remove[asset_id] = fired.index[~fired.values].tolist()
    frame.drop(columns=pttrn_cols_remove[asset_id], inplace=True)

In [None]:
reduced_dataframes_full[1]

In [None]:
# Drop the rows with a null Target or a null 'htsn_cyc' value, then start the
# EDA. `dropna(subset=...)` removes exactly the rows the earlier
# isnull -> index -> drop chain removed, without first sub-selecting
# `original_columns` / `cycle_indicators_columns` just to reach one column.
for frame in reduced_dataframes.values():
    frame.dropna(subset=['Target', 'htsn_cyc'], inplace=True)

In [None]:
# Check that no nulls are left: total number of missing values per asset frame.
sum_nulls = [frame.isnull().sum().sum() for frame in reduced_dataframes.values()]
sum_nulls

In [None]:
# Columns used in the analysis below: the target followed by four indicator families.
studied_columns = [
    'Target',
    *momentum_indicators_columns,
    *statitics_functions_columns,
    *volatility_indicators_columns,
    *price_transformations,
]

In [None]:
reduced_dataframes[1].loc[:,studied_columns].describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Density of the 'ad_vol' indicator for the frame stored under key 1.
sns.kdeplot(
    data=reduced_dataframes[1], x="ad_vol",
    cumulative=False, common_norm=False, common_grid=True,
)

# One scatter panel per studied column against Target.
# `height` is the current name of the per-panel size argument; the former
# `size` keyword was deprecated and later removed from seaborn, where it now
# raises a TypeError.
sns.pairplot(reduced_dataframes[1][studied_columns],
    x_vars= studied_columns,
    y_vars=['Target'], height = 10)

# Saves the current matplotlib figure to disk.
plt.savefig('output.png')


# No `hue` or `size` variable is mapped here, so the former `palette` and
# `sizes` arguments had no effect (recent seaborn warns about them); omitted.
sns.relplot(
    data=reduced_dataframes[1], x="obv_vol", y="Target"
)

In [None]:
import plotly.graph_objects as go

# Candlestick chart of the frame stored under key 1, with the frame index on the x axis.
asset_1 = reduced_dataframes[1]
candles = go.Candlestick(
    x=asset_1.index,
    open=asset_1['Open'],
    high=asset_1['High'],
    low=asset_1['Low'],
    close=asset_1['Close'],
)
fig = go.Figure(data=[candles])
fig.show()

In [None]:
sns.kdeplot( data=reduced_dataframes[1], x="Target", cumulative=False, common_norm=False, common_grid=True, )

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature matrix and target for the frame stored under key 1: copy the studied
# columns, then split 'Target' off into y.
X = reduced_dataframes[1].loc[:, studied_columns].copy()
y = X.pop('Target')

# Fit the scaler on the features and standardise them.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
X

In [None]:
from sklearn.cluster import KMeans

# Cluster the standardised features into 30 groups (fixed seed) and store the
# label of each row next to the unscaled features.
kmeans = KMeans(n_clusters=30, n_init=10, random_state=0, verbose=1)
kmeans.fit(X_scaled)
X["Cluster"] = kmeans.labels_

In [None]:
# Features + categorical cluster label + target, built as a new frame so X stays untouched.
Xy = X.assign(Cluster=X["Cluster"].astype("category"), Target=y)
#sns.relplot(
#    x="value", y="Target", hue="Cluster", col="variable",
#    height=4, aspect=1, facet_kws={'sharex': False}, col_wrap=3,
#    data=Xy.melt(
#        value_vars=studied_columns, id_vars=["Target", "Cluster"],
#    ),
#);

In [None]:
# Directional balance of every cluster:
#     (#rows with Target > 0  -  #rows with Target <= 0) / (sum of both counts)
# listed in the order of Xy.Cluster.value_counts().
# Both counts come from one grouped pass each instead of re-filtering the
# whole frame for every cluster, and a cluster without rows yields NaN rather
# than raising ZeroDivisionError.
cluster_order = Xy.Cluster.value_counts().index
n_up = Xy.Target.gt(0).groupby(Xy.Cluster, observed=False).sum()
n_down = Xy.Target.le(0).groupby(Xy.Cluster, observed=False).sum()
good_clusters = ((n_up - n_down) / (n_up + n_down)).reindex(cluster_order).tolist()

In [None]:
X['Cluster'].value_counts()

In [None]:
[good_clusters,Xy.Cluster.value_counts().index.tolist()]

In [None]:
# Rebuild the features + categorical cluster label + target frame.
Xy = X.assign(Cluster=X["Cluster"].astype("category"), Target=y)

# Long format of the rows in cluster 14: one (variable, value) pair per row,
# with Target and Cluster kept as identifiers.
cluster_14_long = Xy.loc[Xy.Cluster == 14].melt(
    value_vars=studied_columns, id_vars=["Target", "Cluster"],
)

# One scatter panel per variable against Target, three panels per row.
sns.relplot(
    data=cluster_14_long,
    x="value", y="Target", hue="Cluster", col="variable",
    col_wrap=3, height=4, aspect=1, facet_kws={'sharex': False},
);

In [None]:
# Rows of cluster 2 with a non-negative Target.
# (A dangling trailing "." made this cell a SyntaxError; removed.)
Xy.loc[(Xy.Cluster == 2) & (Xy.Target >= 0)]

In [None]:
sns.kdeplot( data=Xy.loc[(Xy.Cluster == 2) & (Xy.Target >= 0)], x="Target", cumulative=False, common_norm=False, common_grid=True, )

In [None]:
sns.kdeplot( data=Xy.loc[(Xy.Cluster == 3)], x="Target", cumulative=False, common_norm=False, common_grid=True, )

In [None]:
# Directional balance of every cluster:
#     (#rows with Target > 0  -  #rows with Target <= 0) / (sum of both counts)
# listed in the order of Xy.Cluster.value_counts().
# Both counts come from one grouped pass each instead of re-filtering the
# whole frame for every cluster, and a cluster without rows yields NaN rather
# than raising ZeroDivisionError.
cluster_order = Xy.Cluster.value_counts().index
n_up = Xy.Target.gt(0).groupby(Xy.Cluster, observed=False).sum()
n_down = Xy.Target.le(0).groupby(Xy.Cluster, observed=False).sum()
good_clusters = ((n_up - n_down) / (n_up + n_down)).reindex(cluster_order).tolist()
    

In [None]:
Xy.Cluster.value_counts().index.tolist()

In [None]:
[good_clusters,Xy.Cluster.value_counts().index.tolist()]