In [2]:
import os
import pandas as pd
import numpy as np 
import scipy as sc 
import matplotlib.pyplot as plt 
import seaborn as sns
import statsmodels.api as sm
from pandas import Series, DataFrame
from sklearn.linear_model import Lasso, LassoCV
sns.set(style="whitegrid")
%matplotlib inline

In [3]:
def age_to_numeric(x):
    if x=='초1':
        return 1
    if x=='초2':
        return 2
    if x=='초3':
        return 3
    if x=='초4':
        return 4
    if x=='초5':
        return 5
    if x=='초6':
        return 6
    if x=='중1':
        return 7
    
def ID_to_IDNum(x):
    temp = str(x).strip().split('_')
    if len(temp) < 2:
        return np.nan
    return int(temp[2])

In [4]:
# Child List 

list_child = pd.read_csv('./data/list.csv', encoding = "euc_kr", na_values = 'empty')
list_child['IDNum'] = list_child['UID'].apply(ID_to_IDNum)
list_child['AgeNum'] = list_child['Age'].apply(age_to_numeric)

In [5]:
# Feature List 
# path = os.getcwd()
# file = os.path.join(path, "data", 'fnirs_feature_0322.xlsx')
# data_whole = pd.read_excel(open(file, 'rb'), sheet_name = 'ex_csv', header=1 )
data_whole = pd.read_csv('./data/fnirs_feature_0322.csv', encoding = "euc_kr", na_values = 'empty')
data_whole['age_num'] = data_whole['fnirs_age'].apply(age_to_numeric)
data_whole.rename({'fnirs_idnum' : 'IDNum'}, inplace = True, axis = 'columns')

In [9]:
## Feature selection
for task in set(data_whole['fnirs_task']): 
    if task == 'REST' or task == 'CBTTF': #task가 REST와 CBTTF면 data_rest = data_whole.fnirs_task 가 rest와 cbttf를 그대로 가져온다.
        continue
    
    data_rest = data_whole.query('fnirs_task == "' + task + '"') 
    
    #Calculate Performance 
    if task == 'CBTTB': #CBTTB면 정답률을 가져오고, zscore는 nan이 아닌 데이터 평균/분산; temp는 zscore 
        data_rest[task + '_' + 'Performance'] = data_rest['fnirs_acc']
        data_rest[task + '_' + 'Performance' + '_zscore'] = (data_rest[task + '_' + 'Performance'] - np.nanmean(data_rest[task + '_' + 'Performance'])) / np.nanstd(data_rest[task + '_' + 'Performance'])
        temp = data_rest[task + '_' + 'Performance' + '_zscore']
    elif task == 'VFT': # 행동데이터 없음
        pass 
    else: #stroop/ gng는 정답에 반응 속도 나눔.
        data_rest[task + '_Performance'] = data_rest['fnirs_acc'] / data_rest['fnirs_rt']
        data_rest[task + '_Performance_zscore'] = (data_rest[task + '_' + 'Performance'] - np.nanmean(data_rest[task + '_' + 'Performance'])) / np.nanstd(data_rest[task + '_' + 'Performance'])
    
    # Save performance data to list_child, except VFT 
    if task != 'VFT':    
        list_child = list_child.merge(data_rest[['IDNum', task + '_' + 'Performance', task + '_' + 'Performance' + '_zscore']], on = 'IDNum', how = 'outer')
    
    target_var = task + '_Performance_zscore'
    
    if task == 'VFT':
        target_var = 'age_num'
        
    ### Feature selection 1 : Average 
    featureName = 'block_avg'
    features = []
    
    for i in data_whole.columns:
        if featureName in i and 'taskAvg' in i and 'HbO' in i:
            features.append(i)
    data_reduced = data_rest[features]
    data_reduced.fillna(0, inplace = True) #  train 데이터 행렬 전처리
    
    X = sm.add_constant(data_reduced) #staticmodel 상수 정의 -- 필수 인지듯 보임.
    y = data_rest[target_var]
    mod = sm.OLS(y, X, missing = 'drop') 
    res = mod.fit()

    temp_pvalues = res.pvalues[res.params > 0][1:]
    temp_pvalues.sort_values(inplace=True)
#     print(temp_pvalues)
    feature_selected = temp_pvalues.index[0]
#     print(featureName, temp_pvalues)
    
    ### 개인별 Feature score 1 : Average 
    data_final = data_rest[[feature_selected, 'IDNum']]
    data_final[feature_selected + '_zscore'] = (data_final[feature_selected] - np.nanmean(data_final[feature_selected])) / np.nanstd(data_final[feature_selected])
    data_final.rename({feature_selected + '_zscore' : task + '_' + featureName + '_zscore', feature_selected : task + '_' + feature_selected}, inplace = True, axis = 'columns')
    list_child = list_child.merge(data_final, on = 'IDNum', how = 'outer')
    
    ### Feature selection 2 : Variance 
    featureName = 'block_variance'
    features = []
    
    for i in data_whole.columns:
        if featureName in i and 'taskAvg' in i and 'HbO' in i:
            features.append(i)
    
    data_reduced = data_rest[features]
    data_reduced.fillna(0, inplace = True)
    
    X = sm.add_constant(data_reduced)
    y = data_rest[target_var]
    mod = sm.OLS(y, X, missing = 'drop')
    res = mod.fit()
#     print(res.summary())
    
    temp_pvalues = res.pvalues[res.params < 0][1:]
    temp_pvalues.sort_values(inplace=True)
    feature_selected = temp_pvalues.index[0]
#     print(featureName, temp_pvalues)
    
    ### individual score 2 : variance
    data_final = data_rest[[feature_selected, 'IDNum']]
    data_final[feature_selected + '_zscore'] = (data_final[feature_selected] - np.nanmean(data_final[feature_selected])) / np.nanstd(data_final[feature_selected])
    data_final.rename({feature_selected + '_zscore' : task + '_' + featureName + '_zscore', feature_selected : task + '_' + feature_selected}, inplace = True, axis = 'columns')
    list_child = list_child.merge(data_final, on = 'IDNum', how = 'outer')
    
    
    ### Feature selection 3 : Connectivity 
    featureName = 'fnirs_feature_HbO_all_taskAvg_conn_conn_bin_con_den'
    feature_selected = 'fnirs_feature_HbO_all_taskAvg_conn_conn_bin_con_den'
    
    ### individual score 3 : Connectivity
    data_final = data_rest[[feature_selected, 'IDNum']]
    data_final[feature_selected + '_zscore'] = (data_final[feature_selected] - np.nanmean(data_final[feature_selected])) / np.nanstd(data_final[feature_selected])
    data_final.rename({feature_selected + '_zscore' : task + '_' + featureName + '_zscore', feature_selected : task + '_' + feature_selected}, inplace = True, axis = 'columns')
    list_child = list_child.merge(data_final, on = 'IDNum', how = 'outer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fnirs_feature_HbO_brodmann7_taskAvg_block_avg    0.023559
fnirs_feature_HbO_brodmann5_taskAvg_block_avg    0.071045
fnirs_feature_HbO_brodmann3_taskAvg_block_avg    0.076222
fnirs_feature_HbO_brodmann8_taskAvg_block_avg    0.591149
fnirs_feature_HbO_brodmann4_taskAvg_block_avg    0.617647
fnirs_feature_HbO_brodmann6_taskAvg_block_avg    0.633227
fnirs_feature_HbO_right_taskAvg_block_avg        0.669810
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fnirs_feature_HbO_brodmann8_taskAvg_block_avg    0.036502
fnirs_feature_HbO_brodmann5_taskAvg_block_avg    0.538613
fnirs_feature_HbO_brodmann1_taskAvg_block_avg    0.557850
fnirs_feature_HbO_all_taskAvg_block_avg          0.677517
fnirs_feature_HbO_brodmann6_taskAvg_block_avg    0.692698
fnirs_feature_HbO_brodmann2_taskAvg_block_avg    0.696405
fnirs_feature_HbO_brodmann7_taskAvg_block_avg    0.780894
fnirs_feature_HbO_brodmann4_taskAvg_block_avg    0.851227
fnirs_feature_HbO_left_taskAvg_block_avg         0.917746
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


fnirs_feature_HbO_brodmann4_taskAvg_block_avg    0.164142
fnirs_feature_HbO_brodmann3_taskAvg_block_avg    0.240565
fnirs_feature_HbO_brodmann7_taskAvg_block_avg    0.422418
fnirs_feature_HbO_brodmann2_taskAvg_block_avg    0.589480
fnirs_feature_HbO_brodmann8_taskAvg_block_avg    0.923529
dtype: float64
fnirs_feature_HbO_all_taskAvg_block_avg          0.402196
fnirs_feature_HbO_brodmann8_taskAvg_block_avg    0.646150
fnirs_feature_HbO_brodmann7_taskAvg_block_avg    0.683160
fnirs_feature_HbO_brodmann6_taskAvg_block_avg    0.893124
fnirs_feature_HbO_brodmann3_taskAvg_block_avg    0.910607
dtype: float64
fnirs_feature_HbO_left_taskAvg_block_avg     0.002925
fnirs_feature_HbO_all_taskAvg_block_avg      0.503277
fnirs_feature_HbO_right_taskAvg_block_avg    0.591235
dtype: float64


In [7]:
for col in list_child.columns:
    if 'zscore' in col:
        cur100 = col.replace('zscore', '100')
        temp = list_child[col]
        ttt = 75 + 25 * temp
        for idx in range(len(ttt)):
            if ttt[idx] > 100:
                ttt[idx] = 100
            elif ttt[idx] < 50:
                ttt[idx] = 50
            
        list_child[cur100] = ttt

In [27]:
list_child.to_csv('20190410_Result.csv')

In [9]:
list_child.to_excel('20190410_Result.xlsx')

In [8]:
list_child.columns

Index(['Date', 'UID', 'Age', 'IDNum', 'AgeNum', 'STROOP_Performance',
       'STROOP_Performance_zscore',
       'STROOP_fnirs_feature_HbO_brodmann7_taskAvg_block_avg',
       'STROOP_block_avg_zscore',
       'STROOP_fnirs_feature_HbO_all_taskAvg_block_variance',
       'STROOP_block_variance_zscore',
       'STROOP_fnirs_feature_HbO_all_taskAvg_conn_conn_bin_con_den',
       'STROOP_fnirs_feature_HbO_all_taskAvg_conn_conn_bin_con_den_zscore',
       'VFT_fnirs_feature_HbO_brodmann8_taskAvg_block_avg',
       'VFT_block_avg_zscore',
       'VFT_fnirs_feature_HbO_right_taskAvg_block_variance',
       'VFT_block_variance_zscore',
       'VFT_fnirs_feature_HbO_all_taskAvg_conn_conn_bin_con_den',
       'VFT_fnirs_feature_HbO_all_taskAvg_conn_conn_bin_con_den_zscore',
       'CBTTB_Performance', 'CBTTB_Performance_zscore',
       'CBTTB_fnirs_feature_HbO_brodmann4_taskAvg_block_avg',
       'CBTTB_block_avg_zscore',
       'CBTTB_fnirs_feature_HbO_all_taskAvg_block_variance',
       'CBTT