In [None]:
import pandas as pd
import numpy as np 
import scipy as sc 
import matplotlib.pyplot as plt 
import seaborn as sns
import statsmodels.api as sm
from pandas import Series, DataFrame
from sklearn.linear_model import Lasso, LassoCV
sns.set(style="whitegrid")
%matplotlib inline

In [None]:
def age_to_numeric(x):
    """Translate a Korean school-grade label into an integer from 1 to 7.

    The labels '초1' .. '초6' map to 1 .. 6 and '중1' maps to 7.
    Any other value (including missing values) yields ``None``.
    """
    grade_table = (
        ('초1', 1),
        ('초2', 2),
        ('초3', 3),
        ('초4', 4),
        ('초5', 5),
        ('초6', 6),
        ('중1', 7),
    )
    # Equality comparison (rather than a dict lookup) keeps the behaviour
    # identical for unhashable or otherwise unusual inputs.
    for label, grade in grade_table:
        if x == label:
            return grade
    return None
    
def ID_to_IDNum(x):
    """Extract the numeric ID from an underscore-separated UID.

    The UID is converted to ``str``, stripped, and split on '_'; the third
    field (index 2) is returned as an ``int``.

    Parameters
    ----------
    x : any
        Raw UID value; missing values such as NaN are accepted.

    Returns
    -------
    int or float
        The integer in the third field, or ``np.nan`` when the UID has
        fewer than three fields.

    Raises
    ------
    ValueError
        If the third field is present but is not an integer literal.
    """
    fields = str(x).strip().split('_')
    # Index 2 is read below, so at least three fields are required.
    # (The previous guard of `< 2` let two-field UIDs through to an IndexError.)
    if len(fields) < 3:
        return np.nan
    return int(fields[2])

In [None]:
# Child list: read the EUC-KR encoded CSV (cells equal to 'empty' are treated
# as missing) and derive a numeric ID from 'UID' and a numeric age from 'Age'.
list_child = pd.read_csv(
    'list.csv',
    na_values='empty',
    encoding="euc_kr",
).assign(
    IDNum=lambda frame: frame['UID'].apply(ID_to_IDNum),
    AgeNum=lambda frame: frame['Age'].apply(age_to_numeric),
)

In [None]:
# Feature table: read the EUC-KR encoded CSV (cells equal to 'empty' are
# treated as missing), add a numeric age column, and rename the ID column to
# 'IDNum' so it matches the merge key used with list_child.
data_whole = (
    pd.read_csv(
        'fnirs_feature_0322.csv',
        na_values='empty',
        encoding="euc_kr",
    )
    .assign(age_num=lambda frame: frame['fnirs_age'].apply(age_to_numeric))
    .rename(columns={'fnirs_idnum': 'IDNum'})
)

In [None]:
## Feature selection
#
# For each task in data_whole (REST and CBTTF are skipped) this cell:
#   1. computes a performance score and its z-score (not for VFT),
#   2. fits an OLS of the target (performance z-score, or age_num for VFT) on
#      the HbO task-average block features and keeps the feature with the
#      smallest p-value among those with a positive coefficient (block_avg)
#      or a negative coefficient (block_variance),
#   3. outer-merges the selected features, plus one fixed connectivity
#      feature, into list_child on IDNum.
#
# NOTE: list_child is extended by successive merges, so this cell is not
# idempotent -- reload list_child before running it a second time.

SKIPPED_TASKS = ('REST', 'CBTTF')
CONNECTIVITY_FEATURE = 'fnirs_feature_HbO_all_taskAvg_conn_conn_bin_con_den'


def _zscore(values):
    """Standardise ``values`` with the NaN-ignoring mean and (population) std."""
    return (values - np.nanmean(values)) / np.nanstd(values)


def _block_features(columns, feature_name):
    """Return the HbO task-average columns whose name contains ``feature_name``."""
    return [
        col for col in columns
        if feature_name in col and 'taskAvg' in col and 'HbO' in col
    ]


def _rank_by_pvalue(task_data, features, target_var, keep_positive):
    """Fit OLS(target ~ const + features) and rank features by p-value.

    Only features whose coefficient is positive (``keep_positive=True``) or
    negative (``keep_positive=False``) are returned, sorted ascending by
    p-value. Missing feature values are replaced by 0 before fitting.
    """
    X = sm.add_constant(task_data[features].fillna(0))
    y = task_data[target_var]
    res = sm.OLS(y, X, missing='drop').fit()

    # Drop the intercept by name *before* filtering on sign. Slicing [1:]
    # after the sign filter removed a real feature whenever the intercept
    # itself did not pass the filter.
    params = res.params.drop('const', errors='ignore')
    pvalues = res.pvalues.drop('const', errors='ignore')
    sign_mask = params > 0 if keep_positive else params < 0
    return pvalues[sign_mask].sort_values()


def _merge_selected_feature(children, task_data, task, column, label):
    """Merge ``column`` (task-prefixed) and its z-score into ``children`` on IDNum."""
    selected = task_data[[column, 'IDNum']].copy()
    selected[task + '_' + label + '_zscore'] = _zscore(selected[column])
    selected = selected.rename(columns={column: task + '_' + column})
    return children.merge(selected, on='IDNum', how='outer')


# Sorted so the column order of list_child is reproducible across runs
# (iteration order of a set of strings is not).
for task in sorted(data_whole['fnirs_task'].dropna().unique()):
    if task in SKIPPED_TASKS:
        continue

    # Independent copy: new columns are added below without touching data_whole.
    data_rest = data_whole[data_whole['fnirs_task'] == task].copy()

    perf_col = task + '_Performance'
    perf_z_col = perf_col + '_zscore'

    if task == 'VFT':
        # No performance score for VFT; age is used as the regression target.
        target_var = 'age_num'
    else:
        if task == 'CBTTB':
            data_rest[perf_col] = data_rest['fnirs_acc']
        else:
            data_rest[perf_col] = data_rest['fnirs_acc'] / data_rest['fnirs_rt']
        data_rest[perf_z_col] = _zscore(data_rest[perf_col])

        # Save performance data to list_child
        list_child = list_child.merge(
            data_rest[['IDNum', perf_col, perf_z_col]], on='IDNum', how='outer'
        )
        target_var = perf_z_col

    ### Feature 1 : Average (positive coefficient)
    ### Feature 2 : Variance (negative coefficient)
    for featureName, keep_positive in (('block_avg', True), ('block_variance', False)):
        features = _block_features(data_whole.columns, featureName)
        temp_pvalues = _rank_by_pvalue(data_rest, features, target_var, keep_positive)
        print(featureName, temp_pvalues)
        if temp_pvalues.empty:
            raise ValueError(
                'No ' + featureName + ' feature with the required coefficient '
                'sign for task ' + str(task)
            )
        feature_selected = temp_pvalues.index[0]
        list_child = _merge_selected_feature(
            list_child, data_rest, task, feature_selected, featureName
        )

    ### Feature 3 : Connectivity (fixed column, no regression)
    list_child = _merge_selected_feature(
        list_child, data_rest, task, CONNECTIVITY_FEATURE, CONNECTIVITY_FEATURE
    )

In [None]:
# Convert every z-score column into a bounded score: 75 + 25 * z, limited to
# the range [50, 100]. The result is stored in a sibling column whose name has
# 'zscore' replaced by '100'. Missing z-scores stay missing.
for col in list(list_child.columns):
    if 'zscore' in col:
        cur100 = col.replace('zscore', '100')
        # Values below 50 are clamped to 50 (they were previously set to 100,
        # which turned the lowest scores into the highest).
        list_child[cur100] = (75 + 25 * list_child[col]).clip(lower=50, upper=100)
        

In [None]:
# Write the merged list_child table to CSV in the current working directory.
# index=False is not passed, so the DataFrame index is written as the first column.
list_child.to_csv('20190410_Result.csv')