# Setup

In [1]:
import json
import os
from functools import partial

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import mutual_info_classif, SelectKBest

In [2]:
def create_dir(dir):
    """Create directory ``dir``, including missing parents, if it is absent.

    Parameters
    ----------
    dir : str or os.PathLike
        Path of the directory to create.

    Raises
    ------
    FileExistsError
        If ``dir`` already exists but is not a directory.
    """
    # exist_ok=True tolerates an already-existing directory, but (unlike a
    # blanket ``except FileExistsError: pass``) still raises when the path is
    # occupied by a regular file, so later writes do not fail obscurely.
    os.makedirs(dir, exist_ok=True)

In [3]:
def import_dict(metadatapath):
    """Read the JSON file at ``metadatapath`` and return its decoded content.

    Parameters
    ----------
    metadatapath : str or os.PathLike
        Path to a JSON document.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file. The caller in this
        notebook indexes the result as ``result[column]['type']``, i.e. it
        expects a dict of dicts.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(metadatapath, encoding="utf-8") as myfile:
        return json.load(myfile)

In [4]:
def indep_info(df_indep, indep_dict):
    """Summarise each column of ``df_indep`` as one row of a new frame.

    Parameters
    ----------
    df_indep : pandas.DataFrame
        Frame whose columns are to be described.
    indep_dict : mapping
        Maps every column name of ``df_indep`` to a mapping that has a
        ``'type'`` entry.

    Returns
    -------
    pandas.DataFrame
        Columns ``variable`` (column name), ``type`` (looked up in
        ``indep_dict``), ``min`` and ``max`` (column-wise extremes), one row
        per column of ``df_indep`` in the original column order.

    Raises
    ------
    KeyError
        If a column name is missing from ``indep_dict`` or has no ``'type'``.
    """
    def _declared_type(name):
        return indep_dict[name]['type']

    summary = pd.DataFrame({'variable': df_indep.columns})
    summary['type'] = summary['variable'].apply(_declared_type)

    # Row 0 of the aggregate holds the minima, row 1 the maxima.
    bounds = df_indep.agg(['min', 'max']).to_numpy().tolist()
    lows, highs = bounds[0], bounds[1]
    summary['min'] = lows
    summary['max'] = highs
    return summary

# Given Information

In [5]:
# Tag interpolated into the input and output file names below
# (presumably a two-digit data year -- confirm).
year = 20
# Number of features kept by SelectKBest (its ``k`` argument).
sel_num = 8
# Number of rows sampled from each class to form the training split.
train_eachclass_num = 20

# Created Directories

In [6]:
# Output tree for this notebook; parents are listed before their children.
for out_dir in (
    "../select",
    "../select/features",
    "../select/proc",
    f"../select/traineach{train_eachclass_num}",
    f"../select/testexc{train_eachclass_num}",
):
    create_dir(out_dir)

# Univariate Feature Selection

In [7]:
# df_proc: encoded data including the 'class' label column.
# df_info: per-variable description with a 'type' column.
proc_csv = f"../processed/proc{year}enc.csv"
info_csv = f"../info/proc{year}info.csv"
df_proc = pd.read_csv(proc_csv)
df_info = pd.read_csv(info_csv)

In [8]:
# mutual_info_classif is a randomised estimator; fixing its seed makes the
# scores, and therefore the selected feature set, reproducible across
# Restart & Run All.
mi_random_state = 0

# Row labels of df_info whose declared type is categorical, passed to the
# scorer as the positions of the discrete feature columns.
# NOTE(review): this assumes df_info rows are in the same order as the
# columns of df_proc once 'class' is dropped -- confirm.
discrete_feat_idx = df_info.index[df_info['type'] == 'Categorical']
score_func = partial(
    mutual_info_classif,
    discrete_features=discrete_feat_idx,
    random_state=mi_random_state,
)

# Keep the sel_num highest-scoring features.
feat_selector = SelectKBest(score_func, k=sel_num)
feat_selector.fit(df_proc.drop('class', axis=1), df_proc['class'])

In [9]:
# One row per candidate feature. Columns are attached in the same order as
# before; 'Type' is aligned to the row index of df_info.
# Note: the "F Score" column holds whatever the selector's score function
# produced (mutual information here, not an F statistic), and "P Value" is
# whatever the selector exposes as pvalues_ (blank in the displayed output).
# The headers are kept unchanged because they are written to the exported CSVs.
feat_scores = pd.DataFrame({"Attribute": df_proc.columns.drop('class')}).assign(
    Type=df_info['type'],
    Support=feat_selector.get_support(),
    **{
        "F Score": feat_selector.scores_,
        "P Value": feat_selector.pvalues_,
    },
)

# Rows flagged by the selector, without the now-redundant flag column.
is_selected = feat_scores['Support']
sel_feat_scores = feat_scores.loc[is_selected].drop(columns='Support')

# Selected feature columns followed by the label column.
selected_cols = sel_feat_scores['Attribute']
df_sel_proc = df_proc[selected_cols].join(df_proc['class'])

In [10]:
# Load the variable metadata and describe the selected features
# (label column excluded).
meta_indep_path = f"../metadata/full/meta-indep-{year}.json"
indep_dict = import_dict(metadatapath=meta_indep_path)
df_sel_proc_info = indep_info(df_sel_proc.drop(columns='class'), indep_dict)

In [11]:
# Display the features kept by the selector together with their scores.
sel_feat_scores

Unnamed: 0,Attribute,Type,F Score,P Value
1,A_AGE,Continuous,0.159482,
43,PTOTVAL,Continuous,0.136694,
53,SS_VAL,Continuous,0.132981,
72,PEMLR,Categorical,0.14542,
145,RESNSS1,Categorical,0.13274,
154,RSNNOTW,Categorical,0.125123,
158,SS_YN,Categorical,0.139356,
181,FILESTAT,Categorical,0.118896,


# Train-Test Split

In [12]:
# Fixed seed so the sampled training rows (and hence the exported train/test
# files) are identical on every Restart & Run All.
split_random_state = 0

# Draw train_eachclass_num rows from every class. GroupBy.sample keeps the
# original row labels and all columns (including 'class'), and avoids
# groupby.apply, whose handling of the grouping column is deprecated in
# recent pandas releases.
df_sel_train = df_sel_proc.groupby('class').sample(
    n=train_eachclass_num, random_state=split_random_state
)

# Everything that was not sampled becomes the test set.
df_sel_test = df_sel_proc.drop(df_sel_train.index)

# The two parts must partition the selected data exactly.
assert len(df_sel_train) + len(df_sel_test) == len(df_sel_proc)

In [13]:
# Preview the first rows of the sampled training set.
df_sel_train.head()

Unnamed: 0,A_AGE,PTOTVAL,SS_VAL,PEMLR,RESNSS1,RSNNOTW,SS_YN,FILESTAT,class
62585,55,24602,0,1,0,0,2,3,0
81416,8,0,0,0,0,0,0,5,0
80584,85,28000,0,7,0,0,2,2,0
104013,19,0,0,7,0,4,2,5,0
79406,38,1002,0,7,0,4,2,5,0


In [14]:
# Check the number of sampled training rows per class.
df_sel_train['class'].value_counts()

class
0    20
1    20
2    20
3    20
4    20
Name: count, dtype: int64

# Exported Results

In [15]:
# Full score table first, then the selected-features subset.
for stem, frame in (("score", feat_scores), ("feature", sel_feat_scores)):
    frame.to_csv(f"../select/features/{stem}{year}num{sel_num}.csv", header=True, index=False)

In [16]:
# Write the selected data twice: with a header row, then without ("noh").
for suffix, with_header in (("", True), ("noh", False)):
    df_sel_proc.to_csv(
        f"../select/proc/selproc{year}num{sel_num}{suffix}.csv",
        header=with_header,
        index=False,
    )

In [17]:
# Number the info rows 1..n for the exported "id" column. Assigning an
# explicit range, instead of shifting the current index by one, keeps this
# cell idempotent: re-running it no longer bumps the ids a second time.
df_sel_proc_info.index = pd.RangeIndex(1, len(df_sel_proc_info) + 1)

# Same table with and without a header row ("noh").
df_sel_proc_info.to_csv(f"../select/proc/selproc{year}num{sel_num}info.csv", index_label="id")
df_sel_proc_info.to_csv(f"../select/proc/selproc{year}num{sel_num}infonoh.csv", index_label="id", header=False)

In [18]:
seltraindir = f"../select/traineach{train_eachclass_num}"
seltestdir = f"../select/testexc{train_eachclass_num}"

# File-name stems shared by the header and no-header ("noh") variants.
train_stem = f"{seltraindir}/seltrain{year}num{sel_num}each{train_eachclass_num}"
test_stem = f"{seltestdir}/seltest{year}num{sel_num}exc{train_eachclass_num}"

# Header variants first, then the header-less ones; train before test each time.
for suffix, with_header in (("", True), ("noh", False)):
    df_sel_train.to_csv(f"{train_stem}{suffix}.csv", header=with_header, index=False)
    df_sel_test.to_csv(f"{test_stem}{suffix}.csv", header=with_header, index=False)