### Loading packages

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

# Let pandas display up to 100 columns before truncating wide frames.
pd.options.display.max_columns = 100

### Loading data

In [4]:
# Read the reduced training sample into `train`.
# The test-set load below is left commented out.
train = pd.read_csv(filepath_or_buffer='../PortoSeguro/train_reduce.csv')
# test = pd.read_csv('../PortoSeguro/test.csv')

### Data at first sight

* Features that belong to similar groupings are tagged as such in the feature names (e.g., ind, reg, car, calc).  
* Feature names include the postfix bin to indicate binary features and cat to indicate categorical features.  
* Features without these designations are either continuous or ordinal.  
* Values of -1 indicate that the feature was missing from the observation.  
* The target column signifies whether or not a claim was filed for that policy holder.   

* Binary features == 여부([0,1], [true, false])  
* Categorical(범주형) features   
  * 명목형 자료(Nominal data) == 성별(남/여), 혈액형(A/B/O/AB)  # 순서의 의미가 없다.
  * 순서형 자료(Ordinal data) == 효과(없음/조금있음/매우있음)   # 순서의 의미가 있다.
  
* Numerical(수치형) features  
  * 이산형 자료(Discrete data)   == 일정기간 동안의 발생횟수, 출산횟수  # 이산적인 값을 가짐 
  * 연속형 자료(Continuous data) == 신장, 체중, 혈압                    # 연속적인 값을 가짐

### We indeed see the following ###   
* binary variables
* categorical variables of which the category values are integers
* other variables with integer or float values
* variables with -1 representing missing values
* the target variable and an ID variable

In [29]:
# (number of rows, number of columns) of the loaded training frame.
train.shape

(1950, 59)

In [30]:
# drop_duplicates() returns a NEW frame and leaves `train` untouched, so the
# shape must be taken from its result. If this matches train.shape from the
# previous cell, the data contains no fully duplicated rows.
train.drop_duplicates().shape

(1950, 59)

No duplicate rows, so that's fine

In [None]:
# Per-column summary: dtype and non-null count for every variable, plus memory usage.
train.info()

### Metadata ###  
To facilitate the data management, we'll store meta-information about the variables in a DataFrame.   
This will be helpful when we want to select specific variables for analysis, visualization, modeling, ...


Concretely we will store:

* role: input, ID, target
* level: nominal, interval, ordinal, binary
* keep: True or False
* dtype: int, float, str

In [45]:
# Collect one metadata record per column of `train`, then assemble the records
# into `meta`, a DataFrame indexed by variable name.
data = []
for f in train.columns:
    dtype = train[f].dtype

    # Role: the target and the identifier are singled out; everything else is an input.
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Level: the name tags ('bin', 'cat') take precedence; untagged columns are
    # classified by dtype. The pandas dtype predicates are used instead of
    # `dtype == int` / `dtype == float`, because comparing against the builtin
    # `int` depends on the platform's default integer width and can silently
    # fail to match int64 columns.
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(dtype):
        level = 'ordinal'
    else:
        # Fail loudly rather than reuse the level left over from the previous
        # column (or hit a NameError on the first one).
        raise TypeError(
            "Cannot infer a level for column {!r} with dtype {}".format(f, dtype)
        )

    data.append({
        'varname': f,
        'role': role,
        'level': level,
        'keep': f != 'id',  # every variable is kept except the id
        'dtype': dtype,
    })

meta = pd.DataFrame(
    data, columns=['varname', 'role', 'level', 'keep', 'dtype']
).set_index('varname')

In [None]:
# Display the metadata table (one row per variable, indexed by varname).
meta