# Random Forest Clustering.

### GOAL: To find the significant passive and kinetics parameters which contribute in differentiation of different sub-types of neurons.
### Data: npz file generated by executing save_params function in neuroRD ajustador project.

###### STEP-1: Data preparation.

In [145]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [146]:
data_arky_1 = np.load('data/fitgp-arky-chan_arky120938.npz')
data_arky_2 = np.load('data/fitgp-arky-chan_arky1382938.npz')
data_arky_3 = np.load('data/fitgp-arky-chan_arky140938.npz')
data_proto_1 = np.load('data/fitgp-proto-chan_proto0792938.npz')
data_proto_2 = np.load('data/fitgp-proto-chan_proto1542938.npz')

In [147]:
print(data_arky_1['paramnames'].shape, data_arky_2['paramnames'].shape, data_arky_3['paramnames'].shape, 
      data_proto_1['paramnames'].shape, data_proto_2['paramnames'].shape) # Verify shapes of datasets

(48,) (48,) (48,) (48,) (48,)


In [148]:
list(data_arky_1.keys())

['params', 'paramnames', 'fitvals', 'features']

In [149]:
data_arky_1['fitvals'][0].shape, data_arky_1['features'].shape

((13,), (15,))

In [150]:
def get_feature_df(np_data):
    features = [text.partition('=')[0] for text in np_data['features'][:-3]]
    features.append(np_data['features'][-3].split(':')[0])
    df = pd.DataFrame(np_data['fitvals'], columns=features)
    df['neuron'] = np_data['features'][-1].split('=')[-1]
    df['model'] = np_data['features'][-2].split('=')[-1]
    return df

In [151]:
arky_features_1 = get_feature_df(data_arky_1)
arky_features_2 = get_feature_df(data_arky_2)
arky_features_3 = get_feature_df(data_arky_3)
proto_features_1 = get_feature_df(data_proto_1)
proto_features_2 = get_feature_df(data_proto_2)

In [152]:
arky_params_1 = pd.DataFrame(data_arky_1['params'], columns = data_arky_1['paramnames'])
arky_params_2 = pd.DataFrame(data_arky_2['params'], columns = data_arky_2['paramnames'])
arky_params_3 = pd.DataFrame(data_arky_3['params'], columns = data_arky_3['paramnames'])
proto_params_1 = pd.DataFrame(data_arky_1['params'], columns = data_arky_1['paramnames'])
proto_params_2 = pd.DataFrame(data_arky_2['params'], columns = data_arky_2['paramnames'])

In [153]:
arky_df_1 = pd.concat([arky_params_1, arky_features_1], axis=1, sort=False)
arky_df_2 = pd.concat([arky_params_2, arky_features_2], axis=1, sort=False)
arky_df_3 = pd.concat([arky_params_3, arky_features_3], axis=1, sort=False)
proto_df_1 = pd.concat([proto_params_1, proto_features_1], axis=1, sort=False)
proto_df_2 = pd.concat([proto_params_2, proto_features_2], axis=1, sort=False)

In [154]:
proto_features_2.shape

(5600, 15)

In [155]:
proto_params_2.shape

(6000, 48)

In [156]:
df = arky_df_1.append(arky_df_2)
df = df.append(arky_df_3)
df = df.append(proto_df_1)
#df = df.append(proto_df_2) # features and parameters individual rows didn't match.
df.shape

(25200, 63)

###### STEP-2: Random forest classification model training.

In [157]:
df.columns

Index(['junction_potential', 'RA', 'RM', 'CM', 'Eleak', 'Cond_KDr_0',
       'Cond_KDr_1', 'Cond_KDr_2', 'Cond_Kv3_0', 'Cond_Kv3_1', 'Cond_Kv3_2',
       'Cond_KvF_0', 'Cond_KvF_1', 'Cond_KvF_2', 'Cond_KvS_0', 'Cond_KvS_1',
       'Cond_KvS_2', 'Cond_NaF_0', 'Cond_NaF_1', 'Cond_NaF_2', 'Cond_HCN1_0',
       'Cond_HCN1_1', 'Cond_HCN2_0', 'Cond_HCN2_1', 'Cond_KCNQ', 'Cond_NaS_0',
       'Cond_NaS_1', 'Cond_NaS_2', 'Cond_Ca_0', 'Cond_Ca_1', 'Cond_SKCa_0',
       'Cond_SKCa_1', 'Cond_BKCa_0', 'Cond_BKCa_1', 'Chan_HCN1_taumul',
       'Chan_HCN2_taumul', 'Chan_HCN1_vshift', 'Chan_HCN2_vshift',
       'Chan_NaF_vshift', 'Chan_NaF_taumul', 'Chan_NaS_vshift',
       'Chan_NaS_taumul', 'Chan_Kv3_vshift', 'Chan_Kv3_taumul',
       'Chan_KvS_vshift', 'Chan_KvS_taumul', 'Chan_KvF_vshift',
       'Chan_KvF_taumul', 'response_fitness', 'baseline_pre_fitness',
       'baseline_post_fitness', 'rectification_fitness',
       'falling_curve_time_fitness', 'spike_time_fitness',
       'spike_width_fitnes

In [158]:
print('Model: ',pd.unique(df['model']),'\n','Neuron_types: ', pd.unique(df['neuron']))

Model:  ['gp'] 
 Neuron_types:  ['arky' 'proto']


In [159]:
df = df.dropna()
exclude = ['model', 'total']
columns = [column for column in list(df.columns) if column not in exclude]
columns_no_label = [column for column in list(df.columns) if column not in exclude or column not in ['neuron']]

In [160]:
df_labels = df[columns_no_label]
labels = df['neuron']

In [161]:
exclude = set(['neuron', 'model', 'total'])
df_no_labels = df.loc[:, set(df.columns) - exclude]

In [162]:
df_no_labels.head()

Unnamed: 0,Cond_KDr_2,CM,Cond_HCN2_1,Cond_KvS_1,Cond_KvS_2,rectification_fitness,Cond_NaS_1,response_fitness,Cond_KvS_0,Cond_KvF_0,...,falling_curve_time_fitness,Cond_HCN1_0,Cond_Kv3_2,Cond_HCN2_0,Cond_BKCa_1,Cond_KDr_1,Cond_Ca_1,Cond_NaS_2,Chan_NaF_vshift,spike_height_fitness
0,175.201521,0.010075,0.427183,4.178451,10.320133,8.210518,1.659697,1.660941,0.921858,10.747251,...,0.47982,0.304674,383.846595,0.00012,2.066036,6.840888,0.132926,1.030384,-0.000747,0.001113
1,16.009271,0.019054,0.329575,3.441697,10.37616,1.495024,1.119214,3.654072,3.338746,8.84067,...,0.519837,0.320233,271.872956,0.282693,1.740351,8.355165,1.254689,2.325295,-0.005511,1.5
2,53.405161,0.008673,1.131724,2.122247,8.848264,2.329753,0.182122,2.713499,0.358007,9.407996,...,2.214393,0.01719,354.763742,0.924143,2.368642,7.812411,0.042572,0.225693,-0.003099,1.5
3,128.888614,0.025445,0.670469,1.95114,10.323037,4.408574,0.433948,5.569647,0.472506,8.566726,...,0.203141,0.086651,292.893299,0.693917,1.794903,9.850698,0.317223,1.456858,0.006463,1.5
4,195.231279,0.011546,0.729558,3.465987,10.679765,7.313001,0.361717,7.23581,0.009319,10.901112,...,1.5,1.408522,245.590405,2.023085,2.152569,9.850924,0.268413,0.087798,0.008843,0.348866


In [163]:
from sklearn.ensemble import RandomForestClassifier

In [188]:
rtc = RandomForestClassifier(10)

In [189]:
rte.fit(df_no_labels)

RandomTreesEmbedding(max_depth=5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           random_state=None, sparse_output=True, verbose=0,
           warm_start=False)

In [190]:
rtc.fit(df_no_labels, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

###### Feature importances are sorted in decending order along with feature names.

In [198]:
import operator
sorted({feature : importance for feature, importance in zip(list(df_no_labels.columns), list(rtc.feature_importances_))}.items(), key=operator.itemgetter(1), reverse=True)

[('spike_count_fitness', 0.09897477197878705),
 ('Cond_KDr_0', 0.09718447534182983),
 ('ahp_curve_fitness', 0.09708089418997809),
 ('spike_range_y_histogram_fitness', 0.07199798211013785),
 ('falling_curve_time_fitness', 0.06846900305965417),
 ('Cond_Ca_0', 0.05395345034071961),
 ('spike_ahp_fitness', 0.05116053001736296),
 ('Cond_KvS_2', 0.04301742511475009),
 ('Cond_NaF_0', 0.03871246775750917),
 ('spike_height_fitness', 0.0360724427492171),
 ('spike_time_fitness', 0.03016422630445418),
 ('RA', 0.02623532762048465),
 ('baseline_post_fitness', 0.026102273401102428),
 ('Cond_BKCa_0', 0.02604040832588241),
 ('response_fitness', 0.02537895310857948),
 ('rectification_fitness', 0.019538122388770087),
 ('Cond_KvS_1', 0.015627689918803537),
 ('Cond_NaF_2', 0.015403271486085692),
 ('Cond_KvF_0', 0.011964813419179793),
 ('Cond_NaS_1', 0.009725070759306627),
 ('Cond_KvF_2', 0.009414900620676313),
 ('Cond_Kv3_0', 0.008629687740186789),
 ('spike_width_fitness', 0.008285485134063606),
 ('Cond_Kv3

##### Plots one tree out of 10 trees in the forest

In [200]:
from sklearn.tree import export_graphviz
import os

In [195]:
estimator = rtc.estimators_[5]

In [197]:
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = df_no_labels.columns,
                class_names = list(labels.unique()),
                rounded = True, proportion = False, 
                precision = 2, filled = True)

In [201]:
os.system('dot -Tpng tree.dot -o tree.png')

1

In [203]:
# TODO: How to evaluate the random forest classified did good.
# TODO: How to further simplify tree to comment on the entire forest behaviour.
# TODO: Plot high importance feature data and label them based on neuron type('proto', 'arky')

In [None]:
# How to embedding rondom tree method to compare the difference between random forest.
# Difference between random forest and tree embedding.
# Random Forest -> Trees in forest are fixed.
# Tree Embedding -> Trees are added to forest cluster based on need. 
# Need to search on Tree embedding techique.

In [None]:
from sklearn.ensemble import RandomTreesEmbedding
rte = RandomTreesEmbedding(10)