In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#!pip install missingno
#import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder,StandardScaler,RobustScaler

def grab_col_names(df, cat_th=10, car_th=20):
    """Split the columns of ``df`` into categorical, numeric and cardinal groups.

    A column is treated as:

    * categorical   - object dtype, or non-object dtype with fewer than
      ``cat_th`` distinct values ("numeric but categorical");
    * cardinal      - object dtype with more than ``car_th`` distinct values
      ("categorical but cardinal"); these are removed from the categorical list;
    * numeric       - non-object dtype that is not "numeric but categorical".

    A short summary of the counts is printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer distinct values than this is treated
        as categorical.
    car_th : int, default 20
        An object column with more distinct values than this is treated
        as cardinal.

    Returns
    -------
    tuple of (list, list, list)
        ``(cat_cols, num_cols, cat_but_car)`` as lists of column names.
        The "numeric but categorical" columns are included in ``cat_cols``;
        only their count is reported in the printed summary.
    """
    # Evaluate dtype and distinct-value count once per column instead of
    # once per list comprehension.
    is_object = {col: df[col].dtypes == "O" for col in df.columns}
    n_unique = {col: df[col].nunique() for col in df.columns}

    num_but_cat = [col for col in df.columns
                   if n_unique[col] < cat_th and not is_object[col]]
    # The cardinality test must use car_th. It previously compared against
    # cat_th, which left the car_th parameter without any effect.
    cat_but_car = [col for col in df.columns
                   if n_unique[col] > car_th and is_object[col]]

    cat_cols = [col for col in df.columns if is_object[col]] + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # Numeric columns: every non-object column not already counted as categorical.
    num_cols = [col for col in df.columns
                if not is_object[col] and col not in num_but_cat]

    print(f"Observations: {df.shape[0]}")
    print(f"Variables: {df.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")  # categorical
    print(f"num_cols: {len(num_cols)}")  # numeric
    print(f"cat_but_car: {len(cat_but_car)}")  # categorical but cardinal
    print(f"num_but_cat: {len(num_but_cat)}")  # numeric but categorical
    return cat_cols, num_cols, cat_but_car

def load(path="titanic.csv"):
    """Read a CSV file into a new DataFrame.

    Parameters
    ----------
    path : str or path-like, default "titanic.csv"
        Location of the CSV file. The default keeps the original behaviour
        of reading ``titanic.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file; every call reads the file again.
    """
    return pd.read_csv(path)

In [1]:
# 4 - Feature Scaling

In [2]:
# StandardScaler: classic standardization. Subtract the mean from every observation and divide by the standard deviation: z = (x - u) / s

In [5]:
# Read the Titanic CSV into the working frame used by the scaling examples.
df = load()

In [7]:
# Fit a StandardScaler on Age alone and keep the result next to the raw column
# so the two can be compared; the raw Age column itself is left untouched.
ss = StandardScaler()
# Double brackets pass a one-column DataFrame (2-D input) to the scaler.
df["Age_standart_scaler"] = ss.fit_transform(df[["Age"]])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_standart_scaler
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,-0.530377
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.571831
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,-0.254825
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.365167
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.365167


In [8]:
# RobustScaler: subtract the median and divide by the interquartile range (IQR)

In [9]:
# Fit a RobustScaler on Age alone and store the result in its own column.
rs = RobustScaler()
# Double brackets pass a one-column DataFrame (2-D input) to the scaler.
df["Age_robuts_scaler"] = rs.fit_transform(df[["Age"]])
# Transposed summary: one row per numeric variable, statistics as columns.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.3838384,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.69912,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.5230079,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.3815937,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.20421,49.693429,0.0,7.9104,14.4542,31.0,512.3292
Age_standart_scaler,714.0,2.388379e-16,1.000701,-2.016979,-0.659542,-0.117049,0.571831,3.465126
Age_robuts_scaler,714.0,0.09505553,0.812671,-1.542937,-0.440559,0.0,0.559441,2.909091


In [10]:
# MinMaxScaler: rescale the variable into a given range (0 to 1 by default)

In [11]:
# Fit a MinMaxScaler (default settings) on Age alone and store the result in
# its own column.
mms = MinMaxScaler()
# Double brackets pass a one-column DataFrame (2-D input) to the scaler.
df["Age_min_max_scaler"] = mms.fit_transform(df[["Age"]])
# Transposed summary: one row per numeric variable, statistics as columns.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.3838384,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.69912,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.5230079,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.3815937,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.20421,49.693429,0.0,7.9104,14.4542,31.0,512.3292
Age_standart_scaler,714.0,2.388379e-16,1.000701,-2.016979,-0.659542,-0.117049,0.571831,3.465126
Age_robuts_scaler,714.0,0.09505553,0.812671,-1.542937,-0.440559,0.0,0.559441,2.909091
Age_min_max_scaler,714.0,0.3679206,0.18254,0.0,0.247612,0.346569,0.472229,1.0


In [14]:
# Numeric to Categorical: converting numeric values into a categorical variable (binning)

In [16]:
# pd.qcut bins a variable by its quantiles: with 5 requested bins the bin edges
# are taken from the sorted Age values (quintiles), and each row is labelled
# with the interval it falls into.
df["Age_qcut"] = pd.qcut(df["Age"],5)

In [17]:
# Display the frame with every derived Age column; in the recorded output the
# row whose Age is missing has no value in any of the derived columns either.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_standart_scaler,Age_robuts_scaler,Age_min_max_scaler,Age_qcut
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,-0.530377,-0.335664,0.271174,"(19.0, 25.0]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.571831,0.559441,0.472229,"(31.8, 41.0]"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,-0.254825,-0.111888,0.321438,"(25.0, 31.8]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0.365167,0.391608,0.434531,"(31.8, 41.0]"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0.365167,0.391608,0.434531,"(31.8, 41.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,-0.185937,-0.055944,0.334004,"(25.0, 31.8]"
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,-0.737041,-0.503497,0.233476,"(0.419, 19.0]"
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,,,,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,-0.254825,-0.111888,0.321438,"(25.0, 31.8]"
