In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns', 100)

디버깅 모드에서는 전체 데이터를 다 불러와 읽지 않고 일부만 추출한다 (읽을 행 수는 `nrows` 로 설정).

In [None]:
# Debug switch: when True, NROWS is set so that only part of each CSV is read.
DEBUG = True

In [None]:
# Row limit handed to pd.read_csv: 50,000 rows in debug mode, None otherwise.
NROWS = 50000 if DEBUG else None

In [None]:
# Show the row limit that is in effect.
NROWS

In [None]:
%%time
# Load the train and test CSVs; nrows=NROWS caps the number of rows read from each file.
train = pd.read_csv('../input/porto-seguro-safe-driver-prediction/train.csv',nrows=NROWS)
test = pd.read_csv('../input/porto-seguro-safe-driver-prediction/test.csv',nrows=NROWS)
# train = train.sample(frac=0.2)

In [None]:
# (rows, columns) of the loaded training frame.
train.shape

In [None]:
# Peek at the first rows of the training frame.
train.head()

In [None]:
# Columns whose name contains 'cat'.
cat_cols = [ col for col in train.columns if 'cat' in col]

In [None]:
# Number of distinct values in each 'cat' column (values only, no column name).
for col in cat_cols:
    print(train[col].value_counts().shape[0])

In [None]:
for col in cat_cols:
    print(col,train[col].nunique())  # each 'cat' column and how many unique values it has

In [None]:
train.shape  # why does the row count differ..?

In [None]:
# (rows, columns) of the loaded test frame, for comparison with train.
test.shape

In [None]:
# drop_duplicates() returns a new frame rather than modifying `train`, so the
# result has to be assigned back for the call to have any effect.
# It removes duplicated rows, not columns (the original note records that none were found).
train = train.drop_duplicates()
train.shape

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

- Metadata (데이터를 정리해 두는 것이 중요 -> 정리된 데이터는 csv 파일로 다시 저장)

In [None]:
# Build one metadata record per column of `train`: its role, its measurement
# level, whether it is kept for modelling, and its dtype.
data = []
for f in train.columns:
    dtype = train[f].dtype

    # Role: what the column is used for.
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Level: decided by the column name first, then by the dtype.
    # The pandas dtype predicates are used instead of `dtype == float` / `dtype == int`
    # because the builtin `int` maps to a platform-dependent width, which can leave
    # `level` unset or carried over from the previous column.
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(dtype):
        level = 'ordinal'
    else:
        # Neither float nor integer (e.g. object columns): treat as unordered categories
        # so that `level` is always defined for the current column.
        level = 'nominal'

    # Every column is kept except the row identifier.
    keep = f != 'id'

    # Store the dtype together with the meta information.
    data.append({
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype,
    })

In [None]:
# Turn the metadata records into a DataFrame with a fixed column order,
# indexed by variable name.
meta = pd.DataFrame(
    data,
    columns=['varname', 'role', 'level', 'keep', 'dtype'],
).set_index('varname')

In [None]:
# Display the metadata table.
meta

In [None]:
# Example lookup: names of all nominal variables that are still kept.
meta.loc[(meta.level=='nominal')&(meta.keep)].index

In [None]:
# Number of variables per (role, level) combination, as a flat table.
meta.groupby(['role', 'level'])['role'].size().reset_index(name='count')

- Descriptive statistics

In [None]:
# Names of the kept interval-level variables.
v =meta[(meta.level == 'interval') & (meta.keep)].index

In [None]:
# Summary statistics for the variables selected in `v`.
train[v].describe()

- reg 변수 중에는 ps_reg_03에 missing value가 있음 -> min 값이 -1.0000 이기 때문

Ordinal variables
- ps_car_11 만 missing value

In [None]:
# Names of the kept ordinal-level variables.
v = meta[(meta.level == 'ordinal') & (meta.keep)].index

In [None]:
# Summary statistics for the variables selected in `v`.
train[v].describe()

Binary variables
- train 데이터의 target 값을 보니 너무 imbalanced => 머리아픔!
- => 이말은 즉슨 대부분이 보험청구가 되지 않았음을 의미 

In [None]:
# Summary statistics for the kept binary variables (this selection includes `target`).
v = meta[(meta.level == 'binary') & (meta.keep)].index
train[v].describe()

**Handling imbalanced classes**
- target =1 이 숫자가 너무작아서 ( ex. 100개중에 96개가 0, 4개가 1인거임 )
- 전부다 0임은 96% 정확도 , but 얼마나 1을 잘 맞추느냐가 중요함 
- 그러므로 accuracy를 쓰면 안됨 그래서 지니나 ROC 같은것을 쓰는거임!! (오호 +0+ )

Over Sampling & Under Sampling 
- over : 0이 96개, 1이 4개이면 1인 4개를 (예: 50개로) 늘려서 부풀리는 것
- under 는 반대! (0인 96개를 줄이는 것)
- SMOTE ?
- oversampling records with target=1
- undersampling records with target=0

In [None]:
# Desired share of target=1 records after undersampling the target=0 records.
desired_apriori = 0.1

In [None]:
# Index labels of the rows in each target class.
idx_0 = train[train.target == 0].index
idx_1 = train[train.target == 1].index

In [None]:
# Number of records in each target class (one index label per record).
nb_0, nb_1 = len(idx_0), len(idx_1)

In [None]:
# Fraction r of target=0 records to keep so that target=1 makes up `desired_apriori`
# of the result: nb_1 / (nb_1 + r * nb_0) = desired_apriori, solved for r.
undersampling_rate = ((1-desired_apriori)*nb_1)/(nb_0*desired_apriori)
# Number of target=0 records that remain, truncated to an integer.
undersampled_nb_0 = int(undersampling_rate*nb_0)
print('Rate to undersample records with target=0: {}'.format(undersampling_rate))
print('Number of records with target=0 after undersampling: {}'.format(undersampled_nb_0))


In [None]:
# Draw undersampled_nb_0 of the target=0 index labels in shuffled order (fixed seed).
undersampled_idx = shuffle(idx_0,random_state=37, n_samples=undersampled_nb_0)
# Selected target=0 labels followed by all target=1 labels.
idx_list = list(undersampled_idx) + list(idx_1)

In [None]:
# Keep only the selected rows and renumber the index from 0.
# NOTE: this overwrites `train`; the rows that were not selected are gone from here on.
train = train.loc[idx_list].reset_index(drop=True)

- Data 퀄리티 체크

- missing value 를 찾아서 지워주기 

In [None]:
# Collect every column that contains the value -1 (counted here as a missing value)
# and report how many records are affected.
vars_with_missing = []
n_rows = train.shape[0]

for f in train.columns:
    missings = (train[f] == -1).sum()
    if missings <= 0:
        continue

    vars_with_missing.append(f)
    missings_perc = missings / n_rows
    print('Variable {} has {} records ({:.2%}) with missing values'.format(f, missings, missings_perc))

print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))

In [None]:
# Check whether this variable can be dropped (why is it OK to drop it..?):
# mean of `target` for each value of ps_car_05_cat.

train[['ps_car_05_cat','target']].groupby('ps_car_05_cat').mean()

- missing value 도 함부로 채우면 안됨! 

In [None]:
# Remove the two variables chosen for dropping.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a second
# run is a no-op instead of raising KeyError.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
train = train.drop(columns=vars_to_drop, errors='ignore')

In [None]:
# Record in the metadata that the dropped variables are no longer kept.
meta.loc[(vars_to_drop),'keep'] = False

In [None]:
# Imputers that treat -1 as the missing-value marker:
# one fills with the column mean, the other with the most frequent value.
mean_imp = SimpleImputer(missing_values=-1, strategy='mean')
mode_imp = SimpleImputer(missing_values=-1, strategy='most_frequent')


In [None]:
# Mean-impute these three columns one at a time (the imputer is refit for each column).
for mean_col in ('ps_reg_03', 'ps_car_12', 'ps_car_14'):
    train[mean_col] = mean_imp.fit_transform(train[[mean_col]]).ravel()

# ps_car_11 is filled with its most frequent value instead.
train['ps_car_11'] = mode_imp.fit_transform(train[['ps_car_11']]).ravel()

- category 만들기 

In [None]:
# Report how many distinct values each kept nominal variable takes.
v = meta[(meta.level == 'nominal') & (meta.keep)].index

for f in v:
    dist_values = train[f].nunique()
    print('Variable {} has {} distinct values'.format(f, dist_values))

In [None]:
# NOTE(review): stray expression; it only displays the built-in `sum` and has no effect on the analysis.
sum

In [None]:
# Inspect the noise values: one standard-normal draw per training row.
# `ft_trn_series` only exists inside target_encode(), so referencing it at cell level
# raises NameError. It has one entry per row of the training series, hence len(train).
np.random.randn(len(train))

In [None]:
# Script by https://www.kaggle.com/ogrellier
# Code: https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
def add_noise(series, noise_level):
    """Scale each value of `series` by (1 + noise_level * z), z drawn from np.random.randn."""
    jitter = np.random.randn(len(series))
    return series * (1 + noise_level * jitter)


def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """Replace a categorical feature by a smoothed mean of the target per category.

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series, same length as trn_series
    min_samples_leaf : category count at which category mean and prior weigh equally
    smoothing : controls how quickly the weight moves from the prior to the category mean
    noise_level : scale of the multiplicative Gaussian noise applied to both outputs

    Returns a (train, test) pair of Series named '<feature>_mean' that carry the
    index of the respective input series. Categories not seen in training get the prior.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    key = trn_series.name

    # Per-category mean and count of the target.
    stats = (pd.concat([trn_series, target], axis=1)
               .groupby(by=key)[target.name]
               .agg(["mean", "count"]))

    # Sigmoid weight: the more samples a category has, the closer it gets to 1.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))

    # Global target mean, used as the prior and as the fallback value.
    prior = target.mean()

    # Blend prior and category mean; large categories lean on their own mean.
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    lookup = (stats.drop(["mean", "count"], axis=1)
                   .reset_index()
                   .rename(columns={'index': target.name, target.name: 'average'}))

    def _encode(series):
        # Left join keeps every row of `series`; unmatched categories become NaN -> prior.
        joined = pd.merge(series.to_frame(series.name), lookup, on=series.name, how='left')
        encoded = joined['average'].rename(key + '_mean').fillna(prior)
        # pd.merge does not keep the index, so restore it.
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [None]:
# Target-encode 'ps_car_11_cat' on train and test, then replace the raw column
# with the encoded one.
# The guard keeps the cell re-runnable: once the raw column has been replaced
# there is nothing left to encode.
if 'ps_car_11_cat' in train.columns:
    train_encoded, test_encoded = target_encode(train['ps_car_11_cat'],
                                                test['ps_car_11_cat'],
                                                target=train.target,
                                                # keyword must match the signature (min_samples_leaf);
                                                # the misspelled form raises TypeError
                                                min_samples_leaf=100,
                                                smoothing=10,
                                                noise_level=0.01)

    train['ps_car_11_cat_te'] = train_encoded
    train.drop('ps_car_11_cat', axis=1, inplace=True)
    meta.loc['ps_car_11_cat','keep'] = False  # Updating the meta
    test['ps_car_11_cat_te'] = test_encoded
    test.drop('ps_car_11_cat', axis=1, inplace=True)

In [None]:
# Scratch copy of the merge used inside target_encode, kept for reference:
# pd.merge(trn_series.to_frame(trn_series.name),
#         averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
#                          on=trn_series.name,
#         how='left') # how='left': keep every row of the left frame while joining

**EDA**

In [None]:
sns.set(font_scale=2) # use one font scale for all plots

- 그래프 나오는 완전꿀 코딩임 ( 밑에 )

In [None]:
# For every kept nominal variable, plot the mean of `target` per category value.
v = meta[(meta.level == 'nominal') & (meta.keep)].index

for f in v:
    # plt.subplots() already creates the figure; an extra plt.figure() call
    # beforehand would leave an empty figure behind on every iteration.
    fig, ax = plt.subplots(figsize=(20,10))

    # Mean target per category value, highest first.
    cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)

    # order= makes the bars follow the sorted order of cat_perc[f].
    sns.barplot(ax=ax, x=f, y='target', data=cat_perc, order=cat_perc[f])
    ax.set_ylabel('% target')
    ax.set_xlabel(f)
    ax.tick_params(axis='both', which='major', labelsize=18)
    plt.show();

In [None]:
# Script by https://www.kaggle.com/ogrellier
# Code: https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
# NOTE(review): add_noise is already defined with the same body in an earlier cell
# of this notebook; this later definition replaces it.
def add_noise(series, noise_level):
    """Multiply each value of `series` by (1 + noise_level * z), z drawn from np.random.randn."""
    return series * (1 + noise_level * np.random.randn(len(series)))

# NOTE(review): target_encode is already defined in an earlier cell of this notebook
# (same code, without the docstring); this later definition replaces it.
def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    noise_level : scale of the multiplicative noise applied by add_noise to both outputs

    Returns a (train, test) tuple of Series named '<feature name>_mean' that carry the
    index of trn_series and tst_series respectively.
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing (note: rebinds the `smoothing` parameter to the per-category weight)
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    # (left join: categories without a match get NaN, which fillna replaces by the prior)
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [None]:
# Target-encode 'ps_car_11_cat' on train and test, then replace the raw column
# with the encoded one.
# The cell drops the raw column, so running it when the column is already gone
# would raise KeyError; the guard turns that case into a no-op.
if 'ps_car_11_cat' in train.columns:
    train_encoded, test_encoded = target_encode(train["ps_car_11_cat"],
                                                test["ps_car_11_cat"],
                                                target=train.target,
                                                min_samples_leaf=100,
                                                smoothing=10,
                                                noise_level=0.01)

    train['ps_car_11_cat_te'] = train_encoded
    train.drop('ps_car_11_cat', axis=1, inplace=True)
    meta.loc['ps_car_11_cat','keep'] = False  # Updating the meta
    test['ps_car_11_cat_te'] = test_encoded
    test.drop('ps_car_11_cat', axis=1, inplace=True)

**Exploratory Data Visualization**

In [None]:
# For every kept nominal variable, plot the mean of `target` per category value.
v = meta[(meta.level == 'nominal') & (meta.keep)].index

for f in v:
    # plt.subplots() already creates the figure; an extra plt.figure() call
    # beforehand would leave an empty figure behind on every iteration.
    fig, ax = plt.subplots(figsize=(20,10))
    # Calculate the percentage of target=1 per category value
    cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)
    # Bar plot
    # Order the bars descending on target mean
    sns.barplot(ax=ax, x=f, y='target', data=cat_perc, order=cat_perc[f])
    ax.set_ylabel('% target', fontsize=18)
    ax.set_xlabel(f, fontsize=18)
    ax.tick_params(axis='both', which='major', labelsize=18)
    plt.show();

In [None]:
def corr_heatmap(v):
    """Draw an annotated correlation heatmap for the columns `v` of the global `train` frame."""
    # Diverging colour map between two hues, centred on zero correlation.
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    corr_matrix = train[v].corr()

    fig, ax = plt.subplots(figsize=(10,10))
    sns.heatmap(
        corr_matrix,
        ax=ax,
        cmap=palette,
        vmax=1.0,
        center=0,
        fmt='.2f',
        square=True,
        linewidths=.5,
        annot=True,
        cbar_kws={"shrink": .75},
    )
    plt.show();
    
# Correlation heatmap of the kept interval-level variables.
v = meta[(meta.level == 'interval') & (meta.keep)].index
corr_heatmap(v)


In [None]:
# 10% sample of the training rows used for the scatter plots below.
# A fixed random_state makes the sample, and therefore the plots, reproducible across runs.
s = train.sample(frac=0.1, random_state=37)

In [None]:
# ps_reg_02 against ps_reg_03 on the sample `s`, with one regression fit per target class.
sns.lmplot(x='ps_reg_02', y='ps_reg_03', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()

In [None]:
# ps_car_12 against ps_car_13 on the sample `s`, with one regression fit per target class.
sns.lmplot(x='ps_car_12', y='ps_car_13', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()