### Kütüphanelerin Eklenmesi

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

#### Verilerin Okunması

In [None]:
# Load the training and test sets from the current working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Keep the label values separately for convenience. The column is selected by
# name, like everywhere else in this notebook, so a different column order in
# the CSV cannot silently pick the wrong column.
y = train_data["Survived"]

In [None]:
# Print the dtype and non-null count of every column to spot missing values.
train_data.info()

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# List the column names available in the training set.
train_data.columns

### Öznitelik Çıkarımı(Feature Extraction)

In [None]:
# Mean of "Survived" per "Sex" value (pivot_table aggregates with the mean by
# default), drawn as a bar chart.
sex_pivot = train_data.pivot_table(values="Survived", index="Sex")
sex_pivot.plot(kind="bar")
plt.show()

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the "Age" column.
train_data["Age"].describe()

In [None]:
# (rows, columns) of the training set.
train_data.shape

In [None]:
# Split the training rows by outcome and overlay their age histograms.
survived = train_data.loc[train_data["Survived"] == 1]
died = train_data.loc[train_data["Survived"] == 0]

# Plot order matters: the legend labels below are assigned in drawing order.
for group, colour in ((survived, 'red'), (died, 'blue')):
    group["Age"].plot.hist(alpha=0.5, color=colour, bins=50)
plt.legend(["Survived", "Died"])
plt.show()

In [None]:
# Bin edges and the label for each of the seven resulting age intervals.
cut_points = [-1, 0, 5, 12, 18, 35, 60, 80]
label_names = ["Missing", "Baby", "Child", "Teen", "Young_adult", "Adult", "Senior"]

# Missing ages are replaced by -0.5 so that they fall into the first interval,
# (-1, 0], which carries the "Missing" label.
# `labels` must be passed by keyword: the third positional parameter of
# pd.cut is `right`, so a positional `label_names` is never used as labels.
train_data["Age"] = train_data["Age"].fillna(-0.5)
train_data["Age_categories"] = pd.cut(train_data["Age"], cut_points, labels=label_names)

test_data["Age"] = test_data["Age"].fillna(-0.5)
test_data["Age_categories"] = pd.cut(test_data["Age"], cut_points, labels=label_names)

In [None]:
# Inspect the first ten assigned age categories.
train_data["Age_categories"][:10]

In [None]:
# Mean of "Survived" per age category (pivot_table's default aggregation).
age_cat_pivot = train_data.pivot_table(values="Survived", index="Age_categories")

# Put the age-category chart and the per-sex chart side by side in the two
# top cells of a 2x2 grid.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(2, 2, 1)
age_cat_pivot.plot.bar(ax=ax)

ax = fig.add_subplot(2, 2, 2)
sex_pivot.plot.bar(ax=ax)
plt.show()

### One-Hot Encoding

Sınırlı sayıda değer alan verilere kategorik veri denir. Örneğin insanların hangi marka arabaya sahip oldukları hakkında bir anket yapsanız sonuçlar kategorik olurdu (Toyota, Renault vs.). Çoğu makine öğrenmesi algoritmasını uygularken bu değerleri *"encode"* etmezsek hata ile karşılaşırız. One-Hot Encoding, kategorik verileri encode etmek için kullanılan yöntemlerden biridir.

<table style="width:20%; margin-top:10px; margin-left:0px;">
    <tr>
    <td> 
        <b>Toyota</b> :
    </td>
    <td>
        1-0-0-0
    </td>
    </tr>
    <tr>
    <td> 
        <b>Renault</b> :
    </td>
    <td>
        0-1-0-0
    </td>
    </tr>
        <tr>
    <td> 
        <b>Honda</b> :
    </td>
    <td>
        0-0-1-0
    </td>
    </tr>
    <tr>
    <td> 
        <b>BMW</b> :
    </td>
    <td>
        0-0-0-1
    </td>
       </tr>
</table>
Yukarıdaki tabloda gördüğünüz üzere ilgili sınıfın olduğu indekse 1, geri kalanlara 0 yazıyoruz. Daha sonra oluşturduğumuz bu veriyi veri setimize yeni bir sütun olarak ekliyoruz.
Biz de bu yazı boyunca bu yönteme başvuracağız. Bu yöntemi uygulamak içinse [pd.get_dummies()](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html) fonksiyonunu kullanacağız.

Daha fazla bilgi almak isterseniz [bu](https://www.kaggle.com/dansbecker/using-categorical-data-with-one-hot-encoding) yazıyı okuyabilirsiniz.

In [None]:
def create_dummies(df, column_name):
    """Return a copy of *df* extended with one-hot columns for *column_name*.

    The indicator columns are named ``<column_name>_<value>`` and appended
    to the right of the existing columns. The source column is kept and the
    input frame is not modified.
    """
    one_hot = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, one_hot], axis=1)

In [None]:
# One-hot encode the categorical features in both data sets, one feature at a
# time so that train and test gain their columns in the same order.
for column in ("Age_categories", "Sex", "Pclass"):
    train_data = create_dummies(train_data, column)
    test_data = create_dummies(test_data, column)

In [None]:
# Check which columns exist after the one-hot encoding above.
train_data.columns

In [None]:
# Summarise the features not handled yet; include='all' makes describe()
# report on the non-numeric columns as well as the numeric ones.
cols = ["SibSp", "Parch", "Fare", "Cabin", "Embarked"]
train_data[cols].describe(include='all')

In [None]:
# Fill missing embarkation ports with "S" (presumably the most frequent value
# according to the describe() output earlier in the notebook — confirm).
train_data["Embarked"] = train_data["Embarked"].fillna("S")
# The test column is filled from the test set itself. Assigning the training
# column here would overwrite the real test values through index alignment.
test_data["Embarked"] = test_data["Embarked"].fillna("S")

train_data = create_dummies(train_data, "Embarked")
test_data = create_dummies(test_data, "Embarked")

In [None]:
# Work on a copy so the exploratory "family_size" column never reaches the
# training frame.
explore_cols = ["SibSp", "Parch", "Survived"]
explore = train_data.loc[:, explore_cols].copy()

# Family size = siblings/spouses + parents/children, summed row-wise.
explore["family_size"] = explore.loc[:, ["SibSp", "Parch"]].sum(axis=1)

# Mean of "Survived" per family size, on a fixed 0..1 axis.
pivot = explore.pivot_table(values="Survived", index="family_size")
pivot.plot(kind="bar", ylim=(0,1), yticks=np.arange(0,1,.1))
plt.show()

In [None]:
def process_family(df):
    """Add an ``is_alone`` flag column to *df* and return the same frame.

    A row gets 1 when its "SibSp" and "Parch" values add up to 0, otherwise
    0. The frame is modified in place.
    """
    relatives = df[["SibSp", "Parch"]].sum(axis=1)
    df["is_alone"] = [1 if count == 0 else 0 for count in relatives]
    return df

In [None]:
# Add the is_alone flag to both data sets.
train_data = process_family(train_data)
test_data = process_family(test_data)

In [None]:
# One-hot encode the is_alone flag; create_dummies prefixes the new columns
# with the source column name.
train_data = create_dummies(train_data, "is_alone")
test_data = create_dummies(test_data, "is_alone")

In [None]:
# Re-split by outcome and overlay the fare histograms, restricted to fares
# between 0 and 200.
survived = train_data.loc[train_data["Survived"] == 1]
died = train_data.loc[train_data["Survived"] == 0]

hist_options = dict(alpha=0.5, range=[0, 200], bins=10)
survived["Fare"].plot.hist(color='red', **hist_options)
died["Fare"].plot.hist(color='blue', **hist_options)

# Labels follow the drawing order above.
plt.legend(["Survived", "Died"])
plt.show()

In [None]:
# Bin edges for discretising "Fare" and one label per resulting interval
# (five edges -> four bins). These rebind the names used earlier for the age bins.
cut_points = [0, 12, 50, 100, 1000]
label_names = ["0-12", "12-50", "50-100", "100+"]
def process_fare(df, cut_points, label_names):
    """Replace the numeric "Fare" column of *df* with labelled fare bands.

    Args:
        df: DataFrame with a numeric "Fare" column; modified in place.
        cut_points: Increasing bin edges passed to ``pd.cut``.
        label_names: One label per bin (``len(cut_points) - 1`` entries).

    Returns:
        The same DataFrame, with "Fare" now holding the band labels. Values
        outside the outer edges, and missing values, become NaN.
    """
    # include_lowest=True closes the first bin on the left. With the default
    # right-closed bins a fare equal to the first edge (e.g. 0) would fall
    # outside every interval and silently turn into NaN.
    df["Fare"] = pd.cut(df["Fare"], cut_points, labels=label_names,
                        include_lowest=True)
    return df

In [None]:
# Discretise "Fare" into bands, then one-hot encode the bands, for each set.
train_data = create_dummies(process_fare(train_data, cut_points, label_names), "Fare")
test_data = create_dummies(process_fare(test_data, cut_points, label_names), "Fare")

In [None]:
# Look at the first ten raw "Cabin" values.
train_data["Cabin"][:10]

In [None]:
# Use the first character of the "Cabin" string as a coarser "Cabin_type".
# NOTE(review): rows without a cabin presumably stay NaN here — confirm.
train_data["Cabin_type"] = train_data["Cabin"].str[0]
test_data["Cabin_type"] = test_data["Cabin"].str[0]

In [None]:
# Give rows without a cabin type an explicit "Unknown" category so they get
# their own indicator column.
for frame in (train_data, test_data):
    frame["Cabin_type"] = frame["Cabin_type"].fillna("Unknown")

train_data = create_dummies(train_data, "Cabin_type")
test_data = create_dummies(test_data, "Cabin_type")

In [None]:
# Look at the first five passenger names to see how titles are embedded.
train_data["Name"][:5]

In [None]:
# Match a word made of upper/lower-case letters that ends with a dot.
# The pattern is a raw string: in a normal string literal "\." is an invalid
# escape sequence, which newer Python versions warn about.
pattern = r'([A-Za-z]+)\.'
extracted_titles = train_data["Name"].str.extract(pattern, expand=False)
print(extracted_titles[:5])

# Collapse the raw titles into a handful of groups. A title that is not a key
# of this mapping becomes NaN after .map().
titles = {
    "Mr" :         "Mr",
    "Mme":         "Mrs",
    "Ms":          "Mrs",
    "Mrs" :        "Mrs",
    "Master" :     "Master",
    "Mlle":        "Miss",
    "Miss" :       "Miss",
    "Capt":        "Officer",
    "Col":         "Officer",
    "Major":       "Officer",
    "Dr":          "Officer",
    "Rev":         "Officer",
    "Jonkheer":    "Royalty",
    "Don":         "Royalty",
    "Sir" :        "Royalty",
    "Countess":    "Royalty",
    "Dona":        "Royalty",
    "Lady" :       "Royalty"
}
train_data["Title"] = extracted_titles.map(titles)

# Same extraction and mapping for the test set.
extracted_titles = test_data["Name"].str.extract(pattern, expand=False)
test_data["Title"] = extracted_titles.map(titles)

In [None]:
# One-hot encode the grouped titles in both data sets.
train_data = create_dummies(train_data, "Title")
test_data = create_dummies(test_data, "Title")

### Eşdoğrusallık(Collinearity)

In [None]:
def plot_correlation_heatmap(df):
    """Plot the lower triangle of the pairwise correlation matrix of *df*.

    Only numeric and boolean columns take part in the correlation; string
    and categorical columns are left out.

    Args:
        df: DataFrame whose columns should be compared.
    """
    # Select the numeric/boolean columns explicitly: recent pandas versions
    # raise in DataFrame.corr() when non-numeric columns are present instead
    # of silently skipping them.
    corr = df.select_dtypes(include=["number", "bool"]).corr()

    sns.set(style="white")
    # Mask the upper triangle (diagonal included), which mirrors the lower
    # one. The builtin `bool` replaces the `np.bool` alias, which was removed
    # from NumPy.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Create the figure; the heatmap below draws on its (current) axes.
    plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
    plt.show()

In [None]:
# Draw the correlation heatmap for the training frame built so far.
plot_correlation_heatmap(train_data)

In [None]:
# We will shortly submit our predictions to Kaggle. Keep the passenger ids of
# the test set before the column is dropped so they can be used there.
passenger_id = test_data["PassengerId"]

# Raw columns and intermediate helper columns that were replaced by their
# one-hot encoded versions.
drop_cols = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
    'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_categories', 'is_alone',
    'Cabin_type', 'Title',
]

test_data = test_data.drop(drop_cols, axis=1)
# The label is removed from the training features as well.
train_data = train_data.drop(drop_cols + ["Survived"], axis=1)


In [None]:
# Remove cabin-type indicator columns from the feature sets.
# NOTE(review): "Cabin_type_T" is dropped from the training set only —
# presumably no test row has a "T" cabin, so that column never exists in
# test_data; verify, since both frames must end up with the same columns.
train_data = train_data.drop(["Cabin_type_Unknown", "Cabin_type_T"], axis=1)
test_data = test_data.drop(["Cabin_type_Unknown"], axis=1)

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


# Recursive feature elimination with 10-fold cross-validation, using a random
# forest as the estimator.
# NOTE(review): no random_state is set on the forest, so the selected columns
# may differ between runs.
rf = RandomForestClassifier()
selector = RFECV(rf, cv=10)
selector.fit(train_data, y)

# `support_` is a boolean mask over the input columns; keep the selected names.
opt_cols = train_data.columns[selector.support_]

In [None]:
# Show the columns kept by the feature selection.
opt_cols

### Modellerin oluşturulması

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the labelled rows for evaluation, using only the selected
# columns; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(train_data[opt_cols], y, test_size=0.2, random_state=42)

In [None]:
# `predictions` collects [predictions DataFrame, short model label] pairs for
# the hold-out set; `models` collects the fitted estimators.
predictions = []
models = []

# Logistic regression with default settings (fit() returns the estimator).
lr = LogisticRegression().fit(x_train, y_train)
lr_pred = lr.predict(x_test)
models.append(lr)
predictions.append([pd.DataFrame(lr_pred), 'LR'])

In [None]:
# k-nearest neighbours with default settings (fit() returns the estimator).
knn = KNeighborsClassifier().fit(x_train, y_train)
knn_pred = knn.predict(x_test)
models.append(knn)
predictions.append([pd.DataFrame(knn_pred), 'KNN'])

In [None]:
# Random forest with default settings (fit() returns the estimator).
rfc = RandomForestClassifier().fit(x_train, y_train)
rfc_pred = rfc.predict(x_test)
models.append(rfc)
predictions.append([pd.DataFrame(rfc_pred), 'RFC'])

In [None]:
# Report the hold-out accuracy of every model collected so far.
for pred_frame, model_label in predictions:
    print(model_label, ' : ', accuracy_score(y_test, pred_frame))

In [None]:
def save_submission_file(models):
    """Write one submission CSV per model in *models*.

    Each model predicts on the selected columns of the global ``test_data``
    frame. The result is stored next to the global ``passenger_id`` values in
    ``submission<ModelClassName>.csv``, without the index column.
    """
    for model in models:
        submission = pd.DataFrame({
            "PassengerId": passenger_id,
            "Survived": model.predict(test_data[opt_cols]),
        })
        file_name = "submission" + type(model).__name__ + ".csv"
        submission.to_csv(file_name, index=False)
        
# Write a submission file for every fitted model collected in `models`.
save_submission_file(models)