In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.proportion import proportions_ztest

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)

In [15]:
def load():
    data = pd.read_csv("Dataset/diabetes.csv")
    data["Insulin"] = data["Insulin"].replace(0, np.nan)
    data["Glucose"] = data["Glucose"].replace(0, np.nan)
    return data

In [16]:
df = load()
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35,,33.6,0.627,50,1
1,1,85.0,66,29,,26.6,0.351,31,0
2,8,183.0,64,0,,23.3,0.672,32,1
3,1,89.0,66,23,94.0,28.1,0.167,21,0
4,0,137.0,40,35,168.0,43.1,2.288,33,1


In [17]:
#############################################
# Feature Extraction (Özellik Çıkarımı)
#############################################

#############################################
# Binary Features: Flag, Bool, True-False
#############################################

df["New_Is_Insulin_Missing"] = df["Insulin"].notnull().astype('int')
print(df.groupby("New_Is_Insulin_Missing").agg({"Outcome": "mean"}))
print(df.head())

                        Outcome
New_Is_Insulin_Missing         
0                         0.369
1                         0.330
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin    BMI  DiabetesPedigreeFunction  Age  Outcome  New_Is_Insulin_Missing
0            6  148.000             72             35      NaN 33.600                     0.627   50        1                       0
1            1   85.000             66             29      NaN 26.600                     0.351   31        0                       0
2            8  183.000             64              0      NaN 23.300                     0.672   32        1                       0
3            1   89.000             66             23   94.000 28.100                     0.167   21        0                       1
4            0  137.000             40             35  168.000 43.100                     2.288   33        1                       1


In [18]:
test_stat, pvalue = proportions_ztest(count=[df.loc[df["New_Is_Insulin_Missing"] == 1, "Outcome"].sum(),
                                             df.loc[df["New_Is_Insulin_Missing"] == 0, "Outcome"].sum()], #Outcome olanların sayısı
                                      nobs=[df.loc[df["New_Is_Insulin_Missing"] == 1, "Outcome"].shape[0],
                                            df.loc[df["New_Is_Insulin_Missing"] == 0, "Outcome"].shape[0]])#Gözlem sayısı

print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = -1.1344, p-value = 0.2566


In [19]:
#############################################
# Aykırı Değer Problemini Çözme
#############################################


def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False


def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Veri setindeki kategorik, numerik ve kategorik fakat kardinal değişkenlerin isimlerini verir.
    Not: Kategorik değişkenlerin içerisine numerik görünümlü kategorik değişkenler de dahildir.

    Parameters
    ------
        dataframe: dataframe
                Değişken isimleri alınmak istenilen dataframe
        cat_th: int, optional
                numerik fakat kategorik olan değişkenler için sınıf eşik değeri
        car_th: int, optinal
                kategorik fakat kardinal değişkenler için sınıf eşik değeri

    Returns
    ------
        cat_cols: list
                Kategorik değişken listesi
        num_cols: list
                Numerik değişken listesi
        cat_but_car: list
                Kategorik görünümlü kardinal değişken listesi

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))


    Notes
    ------
        cat_cols + num_cols + cat_but_car = toplam değişken sayısı
        num_but_cat cat_cols'un içerisinde.
        Return olan 3 liste toplamı toplam değişken sayısına eşittir: cat_cols + num_cols + cat_but_car = değişken sayısı

    """
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"] #değişken tipi object ise
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

In [20]:

###################
# Baskılama Yöntemi (re-assignment with thresholds)
###################


def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

In [21]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

print(cat_cols)
print(num_cols)
print(cat_but_car)

Observations: 768
Variables: 10
cat_cols: 2
num_cols: 8
cat_but_car: 0
num_but_cat: 2
['Outcome', 'New_Is_Insulin_Missing']
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
[]


In [22]:
num_cols = [col for col in num_cols]
print(df.shape)

for col in num_cols:
    print(col, check_outlier(df, col))
    replace_with_thresholds(df, col)
    print(col, check_outlier(df, col))

(768, 10)
Pregnancies True
Pregnancies False
Glucose False
Glucose False
BloodPressure True
BloodPressure False
SkinThickness True
SkinThickness False
Insulin True
Insulin False
BMI True
BMI False
DiabetesPedigreeFunction True
DiabetesPedigreeFunction False
Age True
Age False


In [23]:
#############################################
# Missing Values (Eksik Değerler)
#############################################

# eksik gozlem var mı yok mu sorgusu
print(df.isnull().values.any())

# degiskenlerdeki eksik deger sayisi
print(df.isnull().sum().sort_values(ascending=False))

True
Insulin                     374
Glucose                       5
Pregnancies                   0
BloodPressure                 0
SkinThickness                 0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
New_Is_Insulin_Missing        0
dtype: int64


In [24]:
na_cols = [col for col in df.columns if df[col].isnull().sum() > 0]

print(na_cols)

['Glucose', 'Insulin']


In [25]:
#############################################
# Eksik Değer Problemini Çözme
#############################################

#############################################
# Medyan ile değiştirme
#############################################


def replace_with_mean(dataframe, variable):
    variable_mean = dataframe[variable].mean()
    dataframe[variable] = dataframe[variable].replace(np.nan, variable_mean)

In [26]:
for col in na_cols:
    replace_with_mean(df, col)

# eksik gozlem var mı yok mu sorgusu
print(df.isnull().values.any())

False


In [27]:
#############################################
# 3. Encoding (Label Encoding, One-Hot Encoding, Rare Encoding)
#############################################

#############################################
# One-Hot Encoding
#############################################


def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
    return dataframe

In [28]:
ohe_cols = [col for col in df.columns if 15 >= df[col].nunique() > 2]

print(ohe_cols)

print(one_hot_encoder(df, ohe_cols).head())

['Pregnancies']
   Glucose  BloodPressure  SkinThickness  Insulin    BMI  DiabetesPedigreeFunction    Age  Outcome  New_Is_Insulin_Missing  Pregnancies_1.0  Pregnancies_2.0  Pregnancies_3.0  Pregnancies_4.0  Pregnancies_5.0  Pregnancies_6.0  Pregnancies_7.0  Pregnancies_8.0  Pregnancies_9.0  Pregnancies_10.0  Pregnancies_11.0  Pregnancies_12.0  Pregnancies_13.0  Pregnancies_13.5
0  148.000             72             35  146.500 33.600                     0.627 50.000        1                       0            False            False            False            False            False             True            False            False            False             False             False             False             False             False
1   85.000             66             29  146.500 26.600                     0.351 31.000        0                       0             True            False            False            False            False            False            False           

In [29]:
#############################################
# Feature Scaling (Özellik Ölçeklendirme)
#############################################

###################
# StandardScaler: Klasik standartlaştırma. Ortalamayı çıkar, standart sapmaya böl. z = (x - u) / s
###################


def standart_scaler(dataframe, variable):
    ss = StandardScaler()
    dataframe[variable + '_standart_scaled'] = ss.fit_transform(dataframe[[variable]])

In [30]:
for col in num_cols:
    standart_scaler(df, col)

print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin    BMI  DiabetesPedigreeFunction    Age  Outcome  New_Is_Insulin_Missing  Pregnancies_standart_scaled  Glucose_standart_scaled  BloodPressure_standart_scaled  SkinThickness_standart_scaled  Insulin_standart_scaled  BMI_standart_scaled  DiabetesPedigreeFunction_standart_scaled  Age_standart_scaled
0        6.000  148.000             72             35  146.500 33.600                     0.627 50.000        1                       0                        0.647                    0.865                          0.093                          0.915                    0.000                0.209                                     0.589                1.446
1        1.000   85.000             66             29  146.500 26.600                     0.351 31.000        0                       0                       -0.849                   -1.206                         -0.330                          0.536                    0.000  

In [31]:
#############################################
# 8. Model
#############################################

y = df["Outcome"]
X = df.drop(["New_Is_Insulin_Missing", "Outcome"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

print(accuracy_score(y_pred, y_test))

0.7316017316017316


In [32]:
#############################################
# Hiç bir işlem yapılmadan elde edilecek skor?
#############################################

dff = load()
dff.dropna(inplace=True)
y = dff["Outcome"]
X = dff.drop(["Outcome"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(accuracy_score(y_pred, y_test))

0.7627118644067796
