In [1]:
# Label Encoding & Binary Encoding
# Bir değişkenin iki sınıfı varsa ve 0-1 olarak kodlanırsa buna binary encoding denir.
# Bir kategorik değişken label encoder'a sokulursa ve ikiden fazla sınıfı varsa bu durumda label encoding yapılmış olur.

In [47]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#!pip install missingno
#import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder,StandardScaler,RobustScaler

def grab_col_names(df,cat_th=10,car_th=20):
    cat_cols = [col for col in df.columns if df[col].dtypes == "O"]
    num_but_cat = [col for col in df.columns if df[col].nunique() < cat_th and df[col].dtypes != "O"]
    cat_but_car = [col for col in df.columns if df[col].nunique() > cat_th and df[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    
    #numumerik
    num_cols = [col for col in df.columns if df[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    
    print(f"Observations: {df.shape[0]}")
    print(f"Variables: {df.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}") # Kategorik
    print(f"num_cols: {len(num_cols)}") # Numerik
    print(f"cat_but_car: {len(cat_but_car)}") # Kategorik ama kardinal
    print(f"num_but_cat: {len(num_but_cat)}")
    return cat_cols, num_cols, cat_but_car

def load():
    data = pd.read_csv("titanic.csv")
    return data

In [4]:
df = load()
df["Sex"].head()

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

In [5]:
# Değerler dönüştürülüyor. Alfabetik şekilde ilk değere 0 verir.
le = LabelEncoder()
le.fit_transform(df["Sex"])[0:5]

array([1, 0, 0, 0, 1])

In [6]:
# Bu 0-1'ler nedir.
le.inverse_transform([0,1])

array(['female', 'male'], dtype=object)

In [7]:
def label_encoder(df,binary_col):
    labelencoder = LabelEncoder()
    df[binary_col] = labelencoder.fit_transform(df[binary_col])
    return df

In [8]:
df = load()

In [9]:
#Değişkenin tipi int ya da float değilse ve eşsiz 2 ise onu seç
binary_cols = [col for col in df.columns if df[col].dtype not in [int,float] and df[col].nunique() == 2] 

In [12]:
for col in binary_cols:
    label_encoder(df,col)

In [14]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [25]:
df = pd.read_csv("application_train.csv")
df.shape

(307511, 122)

In [26]:
binary_cols = [col for col in df.columns if df[col].dtype not in [int,float] and df[col].nunique() == 2]

In [28]:
df[binary_cols].head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,...,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
0,1,Cash loans,N,Y,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,Cash loans,N,N,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,Revolving loans,Y,Y,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,Cash loans,N,Y,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,Cash loans,N,Y,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
for col in binary_cols:
    label_encoder(df,col)

In [32]:
df = pd.read_csv("titanic.csv")

In [35]:
# n/a eşsiz değerleri alırken n/a yı da alıyor
df["Embarked"].unique()
#len(df["Embarked"].unique())

4

In [None]:
# ------------------------------------------------------------------------------ #

In [36]:
# One Hot Encoding
# Kategorik verileri sayısal verilere dönüştürmek için kullanılan bir kodlama tekniğidir.

In [38]:
df = load()
df["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [42]:
# İlk sınıftan(C) kurtularak 0 1 lere ayırdık.
pd.get_dummies(df,columns=["Embarked"], drop_first=True).head()
# Eksikleri bir sınıf gibi görmek istersek.
# pd.get_dummies(df,columns=["Embarked"], dummy_na=True).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,0,1


In [43]:
# Cinsiyeti de ayırdık.
pd.get_dummies(df,columns=["Sex"], drop_first=True).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,1


In [45]:
# get_dummies methodu kullanarak hem label encoding işlemini (2 sınıflı değişken) hem de one-hot encoding işlemini yapabiliriz.
# Eğer bir değişkenin sınıf sayısı 2 ise drop_first'ü True yaptığımda direkt zaten o değişken binary encode ediliyor.
pd.get_dummies(df,columns=["Sex","Embarked"], drop_first=True).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,1


In [46]:
def one_hot_encoder(df,categorical_cols,drop_first=True):
    df = pd.get_dummies(df,columns= categorical_cols, drop_first=drop_first)
    return df

In [51]:
# cat_cols, num_cols, cat_but_car = grab_col_names(df)

# Cinsiyet ve Survivedden kurtuldum, 2 sınıflılar zaten
ohe_cols = [col for col in df.columns if 10 >= df[col].nunique() > 2]
ohe_cols

['Pclass', 'SibSp', 'Parch', 'Embarked']

In [52]:
one_hot_encoder(df,ohe_cols).head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,Ticket,Fare,Cabin,Pclass_2,Pclass_3,...,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,,0,1,...,0,0,0,0,0,0,0,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,,0,1,...,0,0,0,0,0,0,0,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C123,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",male,35.0,373450,8.05,,0,1,...,0,0,0,0,0,0,0,0,0,1


In [53]:
# ------------------------------------------------------------------------------ #

In [None]:
# ------------------------------------------------------------------------------ #

In [54]:
# Rare Encoding -> Bonus, Orta-İleri
# Nadir görülen kategorik değerleri daha genel bir kategori altında gruplar.

In [55]:
# 1- Kategorik değişkenlerin azlık çokluk durumu analizi
df = pd.read_csv("application_train.csv")

In [56]:
df["NAME_EDUCATION_TYPE"].value_counts()

Secondary / secondary special    218391
Higher education                  74863
Incomplete higher                 10277
Lower secondary                    3816
Academic degree                     164
Name: NAME_EDUCATION_TYPE, dtype: int64

In [57]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 307511
Variables: 122
cat_cols: 53
num_cols: 67
cat_but_car: 2
num_but_cat: 39


In [59]:
# Kategorik değişkenlerin kendi içinde dağılımlarını gördük.
def cat_summary(df,col_name,plot=False):
    print(pd.DataFrame({col_name:df[col_name].value_counts(), # Değişkenin ismi ve oranları
                       "Ratio":100*df[col_name].value_counts() / len(df)}))
    
    print("#############################")
    
    if plot:
        sns.countplot(x=df[col_name],data=df)
        plt.show()

In [60]:
# Bütün kategorik değişkenler geldi.
for col in cat_cols:
    cat_summary(df,col)

                 NAME_CONTRACT_TYPE      Ratio
Cash loans                   278232  90.478715
Revolving loans               29279   9.521285
#############################
     CODE_GENDER      Ratio
F         202448  65.834393
M         105059  34.164306
XNA            4   0.001301
#############################
   FLAG_OWN_CAR      Ratio
N        202924  65.989184
Y        104587  34.010816
#############################
   FLAG_OWN_REALTY      Ratio
Y           213312  69.367275
N            94199  30.632725
#############################
                 NAME_TYPE_SUITE      Ratio
Unaccompanied             248526  80.818572
Family                     40149  13.056118
Spouse, partner            11370   3.697429
Children                    3267   1.062401
Other_B                     1770   0.575589
Other_A                      866   0.281616
Group of people              271   0.088127
#############################
                      NAME_INCOME_TYPE      Ratio
Working                 

In [62]:
# 2- Rare kategoriler ile bağımlı değişken arasındaki ilişkinin analizi

# Target tarafından bakıyoruz.
df.groupby("NAME_INCOME_TYPE")["TARGET"].mean()

NAME_INCOME_TYPE
Businessman             0.000000
Commercial associate    0.074843
Maternity leave         0.400000
Pensioner               0.053864
State servant           0.057550
Student                 0.000000
Unemployed              0.363636
Working                 0.095885
Name: TARGET, dtype: float64

In [68]:
# Yukarıdaki iki işlemi birleştirdik.
def rare_analyser(df,target,cat_cols):
    for col in cat_cols:
        print(col,":",len(df[col].value_counts())) # Kaç sınıfı var bilgisi
        
        print(pd.DataFrame({"COUNT": df[col].value_counts(), # sınıf frekansları
                            "RATIO": df[col].value_counts() / len(df), # Sınıf oranları
                            "TARGET_MEAN": df.groupby(col)[target].mean()}),end="\n\n\n") # Bağımlı değişkene göre grup işlemi

In [69]:
# Bütün kategorik sınıflar için rare işlemi gerçekleştirdim.
rare_analyser(df,"TARGET",cat_cols)

NAME_CONTRACT_TYPE : 2
                  COUNT     RATIO  TARGET_MEAN
Cash loans       278232  0.904787     0.083459
Revolving loans   29279  0.095213     0.054783


CODE_GENDER : 3
      COUNT     RATIO  TARGET_MEAN
F    202448  0.658344     0.069993
M    105059  0.341643     0.101419
XNA       4  0.000013     0.000000


FLAG_OWN_CAR : 2
    COUNT     RATIO  TARGET_MEAN
N  202924  0.659892     0.085002
Y  104587  0.340108     0.072437


FLAG_OWN_REALTY : 2
    COUNT     RATIO  TARGET_MEAN
N   94199  0.306327     0.083249
Y  213312  0.693673     0.079616


NAME_TYPE_SUITE : 7
                  COUNT     RATIO  TARGET_MEAN
Children           3267  0.010624     0.073768
Family            40149  0.130561     0.074946
Group of people     271  0.000881     0.084871
Other_A             866  0.002816     0.087760
Other_B            1770  0.005756     0.098305
Spouse, partner   11370  0.036974     0.078716
Unaccompanied    248526  0.808186     0.081830


NAME_INCOME_TYPE : 8
                  

In [70]:
# Rare Encoder Yazılması
# Düşük oranları bir araya getireceğiz.

In [71]:
def rare_encoder(df,rare_perc):
    temp_df = df.copy()
    
    # Eğer fonksiyona girilen rare oranından daha düşük sayıda herhnagi bir bu kategorik değişkenin oranı varsa 
    # ve aynı zamanda bir kategorik değişken ise bunları getir.
    rare_columns = [col for col in temp_df.columns if temp_df[col].dtypes == "O"
                    and (temp_df[col].value_counts() / len(temp_df) < rare_perc).any(axis=None)]
    
    for var in rare_columns:
        tmp = temp_df[var].value_counts() / len(temp_df) # Sınıf oranları hesaplanmış
        rare_labels = tmp[tmp<rare_perc].index # Eşik değerden düşük sınıfa sahip olan değerler bu indexler
        temp_df[var] = np.where(temp_df[var].isin(rare_labels), "Rare",temp_df[var])
        
    return temp_df    

In [72]:
new_df = rare_encoder(df,0.01)

In [74]:
rare_analyser(new_df,"TARGET",cat_cols)

NAME_CONTRACT_TYPE : 2
                  COUNT     RATIO  TARGET_MEAN
Cash loans       278232  0.904787     0.083459
Revolving loans   29279  0.095213     0.054783


CODE_GENDER : 3
       COUNT     RATIO  TARGET_MEAN
F     202448  0.658344     0.069993
M     105059  0.341643     0.101419
Rare       4  0.000013     0.000000


FLAG_OWN_CAR : 2
    COUNT     RATIO  TARGET_MEAN
N  202924  0.659892     0.085002
Y  104587  0.340108     0.072437


FLAG_OWN_REALTY : 2
    COUNT     RATIO  TARGET_MEAN
N   94199  0.306327     0.083249
Y  213312  0.693673     0.079616


NAME_TYPE_SUITE : 5
                  COUNT     RATIO  TARGET_MEAN
Children           3267  0.010624     0.073768
Family            40149  0.130561     0.074946
Rare               2907  0.009453     0.093911
Spouse, partner   11370  0.036974     0.078716
Unaccompanied    248526  0.808186     0.081830


NAME_INCOME_TYPE : 5
                       COUNT     RATIO  TARGET_MEAN
Commercial associate   71617  0.232892     0.074843
Pens