# Feature Selection Process

## Imports

In [1]:
import pandas as pd
import numpy as np
import re

## Import Data

In [102]:
data = pd.read_csv("/Users/steliosrammos/Documents/Education/Maastricht/DKE-Year3/Bachelor Thesis/data/client_scores_full.csv", sep=";", decimal=",")
data.rename(columns={'client_id': 'uuid'}, inplace=True)
data = data.set_index('uuid')

data = data[data['survey_date'] != '2915-10-20']
data = data[data['survey_date'] >= '2017-01-01']
data = data[data['survey_date'] <= '2018-06-30']
data['survey_date'] = pd.to_datetime(data['survey_date'],infer_datetime_format=True)
print('Max: {},\nMin {}'.format(data['survey_date'].max(), data['survey_date'].min()))
data['survey_name'] = data['survey_name'].replace(['Vervallen - Algemene Intake Rughuis', 'Algemene Intake V3.0', 'Vervallen - Algemene Intake'], 'Algemene Intake')
data = data.drop('survey_date', axis=1)
data.head()

Max: 2018-06-29 00:00:00,
Min 2017-01-02 00:00:00


Unnamed: 0_level_0,survey_name,score_name,score_value
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1348001455490850631,Algemene Intake,AlgIntakeOpleidingsniveauScore_Raw,3
1348001455490850631,Algemene Intake,AlgIntakeWoonsituatieScore_Raw,3
1348001455490850631,Algemene Intake,educationLevel,32
1348001455490850631,Algemene Intake,livingConditions,40
1348001455490850631,EQ-5D-5L,EQ-5D-5L-Profile,22332


In [103]:
data.loc[(data['survey_name'] == 'PDI') & (data['score_name'] == 'BeperkingScore'),'score_name'] = 'PDI_BeperkingScore'
data.loc[(data['survey_name'] == 'QBPDS') & (data['score_name'] == 'BeperkingScore'),'score_name'] = 'QBPDS_BeperkingScore'
data.loc[(data['survey_name'] == 'NDI') & (data['score_name'] == 'BeperkingScore'),'score_name'] = 'NDI_BeperkingScore'

In [104]:
def count_types_in_row(series):
    
    string = 0
    num_string = 0
    numerical = 0
    
    for cell in data.iloc[:, -1]:
        if type(cell) == str:
            if cell.isdecimal():
                pass
#                 num_string += 1
            string +=1
            
        elif type(cell) == float:
            numerical += 1
        else:
            print(type(cell))

    print("Strings {} \n".format(string))
    print("Numerical Strings {} \n".format(num_string))
    print("Numerical {} \n".format(numerical))
    
    return 0 

In [105]:
count_types_in_row(data['score_value'])
# count_types_in_row(data_clean['ScoreResult'])

Strings 457813 

Numerical Strings 0 

Numerical 1896 



0

## Pivot And Drop Columns/ Rows Given Criteria

1. Pivot the table
2. Drop columns and rows with too many null values

In [106]:
pivoted = data.groupby('uuid')['score_name'].value_counts().unstack().fillna(np.nan)
pivoted.head()

score_name,ActScore,Act_Age_Score,Act_All_Score,Act_Raw_Score,ActieveCopingPercentage,ActieveCopingScore,Activiteit,AfhankelijkheidOnbekwaamheidGemSom,AfhankelijkheidOnbekwaamheidHogeTurf,AfhankelijkheidOnbekwaamheidScorePerc,...,ZelfopofferingScorePerc,ZichRechtenToeEigenenGemSom,ZichRechtenToeEigenenHogeTurf,ZichRechtenToeEigenenScorePerc,arbeidsSituatie_Raw,bmiLength_Raw,bmiWeight_Raw,bmi_Raw,educationLevel,livingConditions
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9214014786609792531,,1.0,1.0,1.0,1.0,1.0,,,,,...,,,,,1.0,1.0,1.0,1.0,1.0,1.0
-9204323589684605317,,1.0,1.0,1.0,1.0,1.0,,,,,...,,,,,,,,,1.0,1.0
-9189315961929324040,,1.0,1.0,1.0,1.0,1.0,,,,,...,,,,,,,,,1.0,1.0
-9187839909081422277,,1.0,1.0,1.0,1.0,1.0,1.0,,,,...,,,,,1.0,1.0,1.0,1.0,1.0,1.0
-9184078185923068786,,1.0,1.0,1.0,1.0,1.0,,,,,...,,,,,1.0,1.0,1.0,1.0,1.0,1.0


### Drop Columns With Too Many Missing Values

In [107]:
# If more than 10% of the values are missing in that column, add it to the drop_columns

def columns_to_drop(df):

    drop_columns = []
    
    for column in df.columns:

        num_nulls = df[column].isna().value_counts(sort=False)[1]
        size = pivoted.shape[0]
        
        if num_nulls > (size * 0.1):
            drop_columns.append(column)

    return drop_columns

In [108]:
drop_columns = columns_to_drop(pivoted)
pivoted_shrinked = pivoted.drop(drop_columns, axis=1)
pivoted_shrinked.shape

(2675, 161)

In [109]:
pivoted_shrinked.head()

score_name,Act_Age_Score,Act_All_Score,Act_Raw_Score,ActieveCopingPercentage,ActieveCopingScore,Age,AlgIntakeOpleidingsniveauScore_Raw,AlgIntakeWoonsituatieScore_Raw,AlgemeneGezondheidsbelevingScore,Average,...,TotaalScore,TransformerenScore,Ver_Age_Score,Ver_All_Score,Ver_Raw_Score,VerminderingVanEisen,VitaliteitScore,Vraag34,educationLevel,livingConditions
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9214014786609792531,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-9204323589684605317,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-9189315961929324040,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-9187839909081422277,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-9184078185923068786,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Drop Rows With Too Many Missing Values

In [110]:
# If more than 5% of the vlaues are missing in that row, add it to the drop_rows

def drop_rows(df):
    
    size = df.shape[0]
    mask = []
    
    for i in range(0, df.shape[0]):
        row = df.iloc[i, :]
        num_nulls = row.isna().value_counts(sort=False)
        
        if num_nulls.shape[0] == 2:
            if num_nulls[1] > 0.09:
#                 print("Row {} has {} null values".format(i, num_nulls[1]))
                mask.append(False)
            else: 
                mask.append(True)
        
        else:
            mask.append(True)
        
    return mask    
        

In [111]:
mask = drop_rows(pivoted_shrinked)
pivoted_shrinked_v2 = pivoted_shrinked[mask]
pivoted_shrinked_v2.shape

(2376, 161)

In [112]:
pivoted_shrinked_v2.head()

score_name,Act_Age_Score,Act_All_Score,Act_Raw_Score,ActieveCopingPercentage,ActieveCopingScore,Age,AlgIntakeOpleidingsniveauScore_Raw,AlgIntakeWoonsituatieScore_Raw,AlgemeneGezondheidsbelevingScore,Average,...,TotaalScore,TransformerenScore,Ver_Age_Score,Ver_All_Score,Ver_Raw_Score,VerminderingVanEisen,VitaliteitScore,Vraag34,educationLevel,livingConditions
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9214014786609792531,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-9204323589684605317,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-9189315961929324040,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-9187839909081422277,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
-9184078185923068786,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [113]:
remaining_scores = pivoted_shrinked_v2.columns
data[data['score_name'].isin(remaining_scores)]['survey_name'].unique()

array(['Algemene Intake', 'Rand 36', 'PDI', 'TSK', 'PHODA-SeV', 'NPRS',
       'SBT', 'QBPDS', 'PCI', 'OQ-45.2', 'BSI', 'UCL', 'PCS', 'NDI'],
      dtype=object)

### Drop More Columns

In [114]:
# Drop columns from BSI that have scale
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop(pivoted_shrinked_v2.filter(regex=("Amb.*")).columns, axis =1)

# Drop the normalized scores columns from BSI
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop(pivoted_shrinked_v2.filter(regex=("_Norm_.*")).columns, axis =1)

# Drop columns that show an age score
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop(pivoted_shrinked_v2.filter(regex=("Age_Score.*")).columns, axis =1)

# Drop columns that show a normal score
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop(pivoted_shrinked_v2.filter(regex=("Normal_Score.*")).columns, axis =1)

# Drop columns that show a pain score
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop(pivoted_shrinked_v2.filter(regex=("Pain_Score.*")).columns, axis =1)

# Drop the UCL all_score to keep only the raw score
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop(pivoted_shrinked_v2.filter(regex=("All_Score.*")).columns, axis =1)

# Drop all Phoda questions scores and keep the average
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop(pivoted_shrinked_v2.filter(regex=("PhodaQuestion_.*")).columns, axis =1)

# Drop QBPDS percentage score
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop('BeperkingPercentage', axis=1)

# Drop open text question
pivoted_shrinked_v2 = pivoted_shrinked_v2.drop('Vraag34', axis=1)

In [115]:
remaining_scores = pivoted_shrinked_v2.columns
data[data['score_name'].isin(remaining_scores)]['survey_name'].unique()

array(['Algemene Intake', 'Rand 36', 'PDI', 'TSK', 'PHODA-SeV', 'NPRS',
       'SBT', 'QBPDS', 'PCI', 'OQ-45.2', 'BSI', 'UCL', 'PCS'],
      dtype=object)

In [116]:
pivoted_shrinked_v2.shape

(2376, 65)

In [117]:
import time

In [120]:
def fill_cell_value(pivoted_df, values_df):
    df = pivoted_df.copy()
    
    for i in range(0,df.shape[0]):
        for j in range(0,df.shape[1]):
            
            client_nr = df.index[i]
            score_name = df.columns[j]
            client = data.loc[client_nr]
            cell_val = client.loc[client['score_name'] == score_name, 'score_value']
            
            if len(cell_val.values) > 0:
                df.iloc[i, j] = cell_val.values[0]
            else:
                df.iloc[i, j] = np.nan
    return df

In [121]:
# fill_cell_value(pivoted_shrinked_v2, data)
pivoted_scores = fill_cell_value(pivoted_shrinked_v2, data)

In [124]:
pivoted_scores.head()

score_name,Act_Raw_Score,ActieveCopingPercentage,ActieveCopingScore,Age,AlgIntakeOpleidingsniveauScore_Raw,AlgIntakeWoonsituatieScore_Raw,AlgemeneGezondheidsbelevingScore,Average,BewegingsangstScore,Bsi_Age,...,Terugtrekken,TotaalLevel,TotaalScore,TransformerenScore,Ver_Raw_Score,VerminderingVanEisen,VitaliteitScore,Vraag34,educationLevel,livingConditions
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9214014786609792531,16,3889,26,44,1,4,70,39,27,44,...,14,Subklinisch,6,9,15,4,80,,20,20
-9204323589684605317,14,4722,29,40,6,1,25,31,40,40,...,18,Subklinisch,5,11,16,7,35,,42,10
-9189315961929324040,18,6111,34,30,5,2,75,48,39,30,...,13,Hoog,4,14,20,6,70,,41,30
-9187839909081422277,18,7222,38,48,9,4,55,50,37,48,...,22,Hoog,5,14,20,7,60,ik probeer dan zoveel mogelijk te gaan mediter...,43,20
-9184078185923068786,16,5556,32,69,3,3,30,79,51,69,...,9,Hoog,9,11,19,8,55,met een rugband en een magage kussen,32,40


In [127]:
pivoted_scores.to_csv('/Users/steliosrammos/Documents/Education/Maastricht/DKE-Year3/Bachelor Thesis/data/pivoted_scores_phi_v2.csv', sep=';')

## Map Categorical Data

Here we first remove the categorical data that has a numerical equivalent as we are not interested in the aggregated/processed data and we want the raw outputs. Then, we take in the remaining categorical data and map it to a numerical data type. 

In [179]:
scores = pd.read_csv('/Users/steliosrammos/Documents/Education/Maastricht/DKE-Year3/Bachelor Thesis/data/v2/pivoted_scores_phi_v2.csv', sep=';', decimal=',')
scores = scores.drop('Vraag34',axis=1)
scores.head()

Unnamed: 0,uuid,Act_Raw_Score,ActieveCopingPercentage,ActieveCopingScore,Age,AlgIntakeOpleidingsniveauScore_Raw,AlgIntakeWoonsituatieScore_Raw,AlgemeneGezondheidsbelevingScore,Average,BewegingsangstScore,...,SubuitslagScore,Terugtrekken,TotaalLevel,TotaalScore,TransformerenScore,Ver_Raw_Score,VerminderingVanEisen,VitaliteitScore,educationLevel,livingConditions
0,-9214014786609792531,16.0,38.89,26.0,44.0,1.0,4.0,70.0,39.0,27.0,...,3.0,14.0,Subklinisch,6.0,9.0,15.0,4.0,80.0,20.0,20.0
1,-9204323589684605317,14.0,47.22,29.0,40.0,6.0,1.0,25.0,31.0,40.0,...,2.0,18.0,Subklinisch,5.0,11.0,16.0,7.0,35.0,42.0,10.0
2,-9189315961929324040,18.0,61.11,34.0,30.0,5.0,2.0,75.0,48.0,39.0,...,2.0,13.0,Hoog,4.0,14.0,20.0,6.0,70.0,41.0,30.0
3,-9187839909081422277,18.0,72.22,38.0,48.0,9.0,4.0,55.0,50.0,37.0,...,2.0,22.0,Hoog,5.0,14.0,20.0,7.0,60.0,43.0,20.0
4,-9184078185923068786,16.0,55.56,32.0,69.0,3.0,3.0,30.0,79.0,51.0,...,5.0,9.0,Hoog,9.0,11.0,19.0,8.0,55.0,32.0,40.0


In [180]:
def count_null_in_columns(df):
    
    for column in df.columns:
        num_null = df[column].isna().value_counts(sort=False)
        
        if num_null.shape[0] == 2:
            print("Row {} has {} null values".format(column, num_null[1]))
        else:
            print("Row {} has {} null values".format(column, num_null[0]))

In [181]:
# Remove non alphabetical characters 
for column in scores.columns:
    
    if scores[column].dtype == 'object':
        scores[column] = scores[column].str.replace("[^a-zA-Z ]+", "")
        

In [182]:
for column in scores:
    if scores[column].dtype == 'object':
        mst_common = scores[column].value_counts().index[0]
        scores[column] = scores[column].fillna(mst_common)
        
    

In [183]:
categories = []
cat_columns = []

for column in scores.columns:
    
    if scores[column].dtype == 'object':
        cat_columns.append(column)
        categories.append(scores[column].unique())

***The following code section maps categorical data to a number***

In [185]:
map_1 = {'Subklinisch':1, 'Hoog':2}
map_2 = {'Laag risico':1, 'Middelmatig risico':2, 'Hoog risico':3}


scores['HulpeloosheidLevel'] = scores['HulpeloosheidLevel'].map(map_1)
scores['MagnificatieLevel'] = scores['MagnificatieLevel'].map(map_1)
scores['RuminatieLevel'] = scores['RuminatieLevel'].map(map_1)
scores['TotaalLevel'] = scores['TotaalLevel'].map(map_1)

scores['Risico'] = scores['Risico'].map(map_2)

In [187]:
scores.shape

(2376, 65)

***The following code shows the remaining questionnaires***

In [188]:
remaining_scores = scores.columns
data[data['score_name'].isin(remaining_scores)]['score_name'].unique()

array(['AlgIntakeOpleidingsniveauScore_Raw',
       'AlgIntakeWoonsituatieScore_Raw', 'educationLevel',
       'livingConditions', 'AlgemeneGezondheidsbelevingScore',
       'FysiekFunctionerenScore', 'GezondheidsveranderingScore',
       'MentaleGezondheidScore', 'PijnScore',
       'RolbeperkingenEmotioneelScore', 'RolbeperkingenFysiekScore',
       'SociaalFunctionerenScore', 'VitaliteitScore',
       'PDI_BeperkingScore', 'BewegingsangstScore', 'Average',
       'GemiddeldScore', 'SlechtsteMomentScore', 'Risico', 'TotaalScore',
       'SubuitslagScore', 'QBPDS_BeperkingScore', 'ActieveCopingScore',
       'ActieveCopingPercentage', 'TransformerenScore', 'OntspannenScore',
       'VerminderingVanEisen', 'PassieveCopingScore',
       'PassieveCopingPercentage', 'Terugtrekken', 'Catastroferen',
       'Rusten', 'OQ452_SD_Raw_Score', 'OQ452_IR_Raw_Score',
       'OQ452_SR_Raw_Score', 'OQ452_ASD_Raw_Score', 'OQ452_TOT_Raw_Score',
       'Bsi_Age', 'Bsi_Som_Raw_Score', 'Bsi_Cog_Raw_Score

In [189]:
scores.to_csv('/Users/steliosrammos/Documents/Education/Maastricht/DKE-Year3/Bachelor Thesis/data/v2/formated_scores_v2.csv', sep=';', index=False)

### Correlate Features And Target Variable

In [244]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [190]:
scores = pd.read_csv('/Users/steliosrammos/Documents/Education/Maastricht/DKE-Year3/Bachelor Thesis/data/v2/formated_scores_v2.csv', sep=';')
client_info = pd.read_csv('/Users/steliosrammos/Documents/Education/Maastricht/DKE-Year3/Bachelor Thesis/data/v2/clients_go_nogo_finished.csv', sep=';')

scores.rename(columns={'ClientNr': 'uuid'}, inplace=True)
client_info.rename(columns={'client_id': 'uuid'}, inplace=True)

data = scores.merge(client_info, on='uuid')

In [194]:
data.drop(['finished_treatment','start_date'],axis=1)
data.head()

Unnamed: 0,uuid,Act_Raw_Score,ActieveCopingPercentage,ActieveCopingScore,Age,AlgIntakeOpleidingsniveauScore_Raw,AlgIntakeWoonsituatieScore_Raw,AlgemeneGezondheidsbelevingScore,Average,BewegingsangstScore,...,TotaalScore,TransformerenScore,Ver_Raw_Score,VerminderingVanEisen,VitaliteitScore,educationLevel,livingConditions,got_go,start_date,finished_treatment
0,-9214014786609792531,16.0,38.89,26.0,44.0,1.0,4.0,70.0,39.0,27.0,...,6.0,9.0,15.0,4.0,80.0,20.0,20.0,1,2017-12-20,1
1,-9204323589684605317,14.0,47.22,29.0,40.0,6.0,1.0,25.0,31.0,40.0,...,5.0,11.0,16.0,7.0,35.0,42.0,10.0,1,2017-10-23,1
2,-9189315961929324040,18.0,61.11,34.0,30.0,5.0,2.0,75.0,48.0,39.0,...,4.0,14.0,20.0,6.0,70.0,41.0,30.0,1,2017-09-07,0
3,-9187839909081422277,18.0,72.22,38.0,48.0,9.0,4.0,55.0,50.0,37.0,...,5.0,14.0,20.0,7.0,60.0,43.0,20.0,0,,0
4,-9184078185923068786,16.0,55.56,32.0,69.0,3.0,3.0,30.0,79.0,51.0,...,9.0,11.0,19.0,8.0,55.0,32.0,40.0,1,2017-12-18,0


In [245]:
X = data.iloc[:,1:-3]
s = data.iloc[:,-3]
y= data.iloc[:,-1]

In [246]:
X = X.fillna(X.mean().apply(lambda x: math.floor(x)))

In [253]:
from scipy.spatial.distance import pdist, squareform
import numpy as np
import copy


def distcorr(Xval, Yval, pval=True, nruns=500):
    """ Compute the distance correlation function, returning the p-value.
    Based on Satra/distcorr.py (gist aa3d19a12b74e9ab7941)

    >>> a = [1,2,3,4,5]
    >>> b = np.array([1,2,9,4,4])
    >>> distcorr(a, b)
    (0.76267624241686671, 0.404)
    """
    X = np.atleast_1d(Xval)
    Y = np.atleast_1d(Yval)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    
    dcov2_xy = (A * B).sum() / float(n * n)
    dcov2_xx = (A * A).sum() / float(n * n)
    dcov2_yy = (B * B).sum() / float(n * n)
    dcor = np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))

    if pval:
        greater = 0
        for i in range(nruns):
            Y_r = copy.copy(Yval)
            np.random.shuffle(Y_r)
            if distcorr(Xval, Y_r, pval=False) >= dcor:
                greater += 1
        print(greater)
        return (dcor, greater / float(nruns))
    else:
        return dcor

In [254]:
distcorr(X,s, pval=True,nruns=100)

0


(0.21543800232916974, 0.0)