In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
# The middle element of each os.walk tuple (the sub-directory names) is not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Combine the directory being visited with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Packages and Data

In [None]:
import numpy as np # data analysis
import pandas as pd # data analysis
import matplotlib.pyplot as plt # data vizualization
import seaborn as sns # data visualization
from sklearn.model_selection import train_test_split # machine learning
from sklearn.ensemble import RandomForestClassifier # machine learning
from sklearn.cluster import KMeans

# Paths of the example files read below, all under the Kaggle input directory.
example_clinical_data_path_1 = '/kaggle/input/end-als/end-als/clinical-data/filtered-metadata/metadata/clinical/Demographics.csv'
example_clinical_data_path_2 = '/kaggle/input/end-als/end-als/clinical-data/filtered-metadata/metadata/clinical/ALSFRS_R.csv'
example_transcriptomics_DESEQ2_data_path_1 = '/kaggle/input/end-als/end-als/transcriptomics-data/DESeq2/bulbar_vs_limb.csv'
example_transcriptomics_DESEQ2_data_path_2 = '/kaggle/input/end-als/end-als/transcriptomics-data/DESeq2/ctrl_vs_case.csv'
example_transcriptomics_3counts_data_path = '/kaggle/input/end-als/end-als/transcriptomics-data/L3_counts/CASE-NEUZX521TKK/CASE-NEUZX521TKK-5793-T/CASE-NEUZX521TKK-5793-T_P85.exon.txt'

# Read each file into a DataFrame and write a copy to /kaggle/working/.
# NOTE: to_csv is called with its default index=True, so every copy carries the
# DataFrame index as an extra leading column.
demographics = pd.read_csv(example_clinical_data_path_1)
demographics.to_csv('/kaggle/working/demographics.csv')
alsfrs_scores = pd.read_csv(example_clinical_data_path_2)
alsfrs_scores.to_csv('/kaggle/working/alsfrs_scores.csv')
bulbar_vs_limb = pd.read_csv(example_transcriptomics_DESEQ2_data_path_1)
bulbar_vs_limb.to_csv('/kaggle/working/bulbar_vs_limb.csv')
ctrl_vs_case = pd.read_csv(example_transcriptomics_DESEQ2_data_path_2)
ctrl_vs_case.to_csv('/kaggle/working/ctrl_vs_case.csv')
# The exon counts file is whitespace-delimited and its first line is skipped.
# sep=r'\s+' is the documented replacement for delim_whitespace=True, which is
# deprecated since pandas 2.2.
example_transcriptomics_3counts_data = pd.read_csv(
    example_transcriptomics_3counts_data_path,
    sep=r'\s+',
    skiprows=1,
    low_memory=False,
)
example_transcriptomics_3counts_data.to_csv('/kaggle/working/L3_counts.csv')

# Functions

In [None]:
def sort_feature_importances(df, visualize = False):
    """ Rank features by importance and add a normalized importance column.

    Adapted from https://github.com/WillKoehrsen/feature-selector

    Arguments:
        df: Pandas DataFrame with an 'importance' column (and a 'feature' column,
            which is used for the tick labels when visualize is True)
        visualize: a boolean; when True, a horizontal bar chart of the 15 most
            important features is shown, defaults to False

    Returns:
        A new Pandas DataFrame sorted by descending importance. The previous index
        is kept as a column by reset_index(), and 'importance_normalized' holds each
        importance divided by the sum of all importances.
    """
    ranked = (
        df.sort_values('importance', ascending=False)
        .reset_index()
        .assign(importance_normalized=lambda frame: frame['importance'] / frame['importance'].sum())
    )

    if not visualize:
        return ranked

    top = ranked.head(15)
    # Bars are placed at reversed positions so the most important feature ends up on top.
    positions = list(top.index)[::-1]

    _, ax = plt.subplots(figsize=(10, 6))
    ax.barh(positions, top['importance_normalized'], align='center', edgecolor='k')
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    ax.set_xlabel('Normalized Importance')
    ax.set_title('Feature Importance')
    plt.show()

    return ranked

def important_clusters(XClusterLabels, Y, numClusters, threshold = 0.8, labelOfInterest = 1):
    """ Check which clusters express a given class label in a ratio greater than or equal to a threshold

    Arguments:
        XClusterLabels: array-like of shape (n_samples,), cluster predictions for the training data
        Y: array-like of shape (n_samples,), training labels, in the same order as XClusterLabels
        numClusters: an integer representing the number of clusters
        threshold: a float representing the ratio threshold for a cluster to be significant, defaults to 0.8
        labelOfInterest: the class label of interest, defaults to 1

    Returns:
        An ndarray containing 0 (below the threshold) or 1 (at or above the threshold) for each cluster,
        and an ndarray containing the ratio for each cluster. A cluster with no samples gets 0 in both
        arrays instead of raising ZeroDivisionError.
    """
    # Work on plain arrays so the boolean masks below are purely positional.
    clusterIds = np.asarray(XClusterLabels)
    labels = np.asarray(Y)

    meaningfulList = np.zeros(numClusters)
    ratioList = np.zeros(numClusters)

    for i in range(numClusters):
        labelsInCluster = labels[clusterIds == i]
        clusterSize = labelsInCluster.shape[0]
        if clusterSize == 0:
            # Empty cluster: the ratio is undefined, so leave both entries at 0.
            continue
        ratio = np.count_nonzero(labelsInCluster == labelOfInterest) / clusterSize
        ratioList[i] = ratio
        if ratio >= threshold:
            meaningfulList[i] = 1
    return meaningfulList, ratioList

def extract_important_features(X, XClusterLabels, clusterOfInterest, numFeatures=5000, visualize=False, random_state=None):
    """ Find which features are important in a random forest classifier with two classes:
    being in the cluster of interest, and not being in it.

    Arguments:
        X: Pandas DataFrame containing the training data
        XClusterLabels: ndarray of shape (n_samples,) cluster predictions for the training data
        clusterOfInterest: an integer representing the cluster of interest
        numFeatures: an integer representing the number of important features to return, defaults to 5000
        visualize: a boolean representing whether to visualize the important features, defaults to False
        random_state: seed forwarded to RandomForestClassifier so the importances can be reproduced;
            defaults to None, which keeps the previous unseeded behaviour

    Returns:
        A Pandas DataFrame containing the top numFeatures most important features
    """
    # Binary target: 1.0 for samples in the cluster of interest, 0.0 for all others.
    inCluster = (np.asarray(XClusterLabels) == clusterOfInterest).astype(float)

    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X, inCluster)

    feature_importances = pd.DataFrame({
        'feature': list(X.columns),
        'importance': clf.feature_importances_,
    })
    # Keep only the first numFeatures rows of the ranked table.
    return sort_feature_importances(feature_importances, visualize).iloc[:numFeatures]

# Test Run

In [None]:
# Fixed seed so the train/test split and the clustering give the same result on every run.
RANDOM_STATE = 42

# Features are every column except the class label and the participant identifier.
training_dataTest = bulbar_vs_limb.drop(['SiteOnset_Class','Participant_ID'],axis=1)
labelsTest = bulbar_vs_limb['SiteOnset_Class']
X_train, X_test, y_train, y_test = train_test_split(
    training_dataTest, labelsTest, train_size=0.9, random_state=RANDOM_STATE
)

numClusters=10
# n_init is pinned explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=numClusters, n_init=10, random_state=RANDOM_STATE).fit(X_train)
XclusterLabels = kmeans.predict(X_train)
meaningList, ratioList = important_clusters(XclusterLabels, y_train, numClusters, threshold = 0.9)

# For every cluster flagged by important_clusters, plot its feature importances
# and print the ten highest-ranked features.
for i in np.arange(numClusters):
    if meaningList[i] == 1:
        test = extract_important_features(X_train, XclusterLabels, i, visualize=True)
        print(test[:10])