# 4_1_Feature_extraction_SE.ipynb
For SE, create the training and test data for the response variable, and sample the training and test data for the explanatory variables based on the link between Path ID and SE.

### input
- 3_Calc_Edit_Distance/output/Calc_Edit_Distance.csv.gz : A file containing the edit distances calculated between Paths.
- 9_Integration_SE_TI_Target_datafile/Y_binary_SE.npz : A file linking each Path ID to its SE.

### output
- 5_X_train_test_datafile/Y/Y_train_SE.npz : Training data for response variable in SE.
- 5_X_train_test_datafile/Y/Y_test_SE.npz : Test data for response variable in SE.
- 5_X_train_test_datafile/X_Feature_Extraction/train/X_train_SE_*.npz : Training data for explanatory variables after sampling in SE.
- 5_X_train_test_datafile/X_Feature_Extraction/test/X_test_SE_*.npz : Test data for explanatory variables after sampling in SE.

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz, load_npz
from sklearn.model_selection import train_test_split

In [2]:
# Load the precomputed pairwise table; the first CSV column becomes the row index.
df = pd.read_csv(
    '../3_Calc_Edit_Distance/output/Calc_Edit_Distance.csv.gz',
    index_col=0,
    header=0,
)

  mask |= (ar1 == a)


In [3]:
# One row per Path ID (0 .. 67480), stored in the first ID column.
df_same = pd.DataFrame({'down_index_new_0': range(67481)})

In [4]:
# Pair every ID with itself and give each of these self-pairs the value 1.
df_same = df_same.assign(
    down_index_new_1=df_same['down_index_new_0'],
    value=1,
)

In [5]:
# Append a mirrored copy of the table (the two ID columns swapped) so that
# every pair is present in both directions.
df = pd.concat([
    df,
    df.rename(columns={
        'down_index_new_0': 'down_index_new_1',
        'down_index_new_1': 'down_index_new_0',
    }),
])

In [6]:
# Stack the self-pairs underneath the pairwise rows.
df = pd.concat((df, df_same))

In [7]:
# Display the combined pair table (pairwise rows, mirrored rows and self-pairs).
df

Unnamed: 0,down_index_new_0,down_index_new_1,value
0,3,2,0.333333
1,4,2,0.500000
2,4,3,0.333333
3,5,2,0.333333
4,5,3,0.333333
...,...,...,...
67476,67476,67476,1.000000
67477,67477,67477,1.000000
67478,67478,67478,1.000000
67479,67479,67479,1.000000


In [8]:
# Split the 67481 Path IDs into train and test parts (10% held out for test),
# with a fixed seed so the split is reproducible.
train_id, test_id = train_test_split(
    pd.DataFrame(range(67481)),
    random_state=0,
    test_size=0.1,
)

In [9]:
# Keep only the pairs whose 'down_index_new_1' ID belongs to the training split;
# the helper key column 0 coming from train_id is dropped afterwards.
X_train = (
    df.merge(
        train_id.reset_index(drop=True),
        left_on='down_index_new_1',
        right_on=0,
    )
    .drop(columns=0)
)

In [10]:
# Keep only the pairs whose 'down_index_new_1' ID belongs to the test split;
# the helper key column 0 coming from test_id is dropped afterwards.
X_test = (
    df.merge(
        test_id.reset_index(drop=True),
        left_on='down_index_new_1',
        right_on=0,
    )
    .drop(columns=0)
)

In [11]:
# Dense 0-based position of each test Path ID (smallest ID -> 0), stored as int.
X_test['id_1'] = (
    X_test['down_index_new_1'].rank(method='dense') - 1
).astype(int)

In [12]:
# Dense 0-based position of each training Path ID (smallest ID -> 0), stored as int.
X_train['id_1'] = (
    X_train['down_index_new_1'].rank(method='dense') - 1
).astype(int)

In [13]:
# Display the training pairs together with their dense row position 'id_1'.
X_train 

Unnamed: 0,down_index_new_0,down_index_new_1,value,id_1
0,3,2,0.333333,2
1,4,2,0.500000,2
2,5,2,0.333333,2
3,3046,2,0.250000,2
4,55173,2,0.500000,2
...,...,...,...,...
387742671,55547,55547,1.000000,50007
387742672,59393,59393,1.000000,53466
387742673,60813,60813,1.000000,54736
387742674,61738,61738,1.000000,55574


In [14]:
# Display the test pairs together with their dense row position 'id_1'.
X_test

Unnamed: 0,down_index_new_0,down_index_new_1,value,id_1
0,4,3,0.333333,0
1,5,3,0.333333,0
2,3046,3,0.750000,0
3,3047,3,0.333333,0
4,2,3,0.333333,0
...,...,...,...,...
42752640,67476,67480,0.666667,6748
42752641,67477,67480,0.750000,6748
42752642,67478,67480,0.800000,6748
42752643,67479,67480,0.750000,6748


In [15]:
# Drop the reference to the full pair table; the cells below only use
# X_train and X_test.
del df

In [16]:
# Load the sparse binary SE target matrix and expand it into a dense DataFrame.
y = pd.DataFrame(
    load_npz('../9_Integration_SE_TI_Target_datafile/Y_binary_SE.npz').toarray()
)

In [17]:
# Display the dense target table (one column per SE label).
y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,167,168,169,170,171,172,173,174,175,176
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67476,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67477,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67478,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67479,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Response rows of the training split: join the targets to the training IDs
# on the row number, then order by that row number and use it as the index.
Y_train = (
    y.reset_index()
    .merge(train_id.reset_index(drop=True).rename(columns={0: 'index'}), on='index')
    .sort_values('index')
    .set_index('index')
)
# Same selection for the test split.
Y_test = (
    y.reset_index()
    .merge(test_id.reset_index(drop=True).rename(columns={0: 'index'}), on='index')
    .sort_values('index')
    .set_index('index')
)

In [19]:
# Persist the response matrices of both splits in sparse (CSR) .npz form.
save_npz(
    file='../5_X_train_test_datafile/Y/Y_train_SE.npz',
    matrix=csr_matrix(Y_train),
)
save_npz(
    file='../5_X_train_test_datafile/Y/Y_test_SE.npz',
    matrix=csr_matrix(Y_test),
)

In [20]:
# For every SE label (one column of y) build and save the sampled explanatory
# matrices of both splits:
#   rows    -> Path IDs of the split, addressed by their dense position 'id_1'
#   columns -> Path IDs carrying label i, addressed by their dense position 'ID_sub'
#   entries -> the 'value' column of the pair table
# The label count is taken from y itself rather than hard-coded.
for i in range(y.shape[1]):
    # Path IDs whose label i equals 1, plus their 0-based column position.
    y_index = pd.DataFrame({'ID': y.index[y[i] == 1]})
    y_index['ID_sub'] = (y_index['ID'].rank(method='dense') - 1).astype(int)

    # Training pairs restricted to partners that carry label i.
    df_y = pd.merge(X_train, y_index, left_on = 'down_index_new_0', right_on = 'ID').drop(columns = 'ID')

    # Hand the columns to scipy as NumPy arrays; converting them to Python
    # lists first would box every element of these very large columns.
    X_train_m = csr_matrix(
        (df_y['value'].to_numpy(), (df_y['id_1'].to_numpy(), df_y['ID_sub'].to_numpy())),
        shape=(len(train_id), len(y_index)),
    )

    # Test pairs restricted to partners that carry label i.
    df_y = pd.merge(X_test, y_index, left_on = 'down_index_new_0', right_on = 'ID').drop(columns = 'ID')

    X_test_m = csr_matrix(
        (df_y['value'].to_numpy(), (df_y['id_1'].to_numpy(), df_y['ID_sub'].to_numpy())),
        shape=(len(test_id), len(y_index)),
    )

    # Both matrices are already CSR, so they are saved without re-wrapping.
    save_npz('../5_X_train_test_datafile/X_Feature_Extraction/train/X_train_SE_'+ str(i) +'.npz', X_train_m)
    save_npz('../5_X_train_test_datafile/X_Feature_Extraction/test/X_test_SE_'+ str(i) +'.npz', X_test_m)