In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.neighbors import KNeighborsClassifier


from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, KFold, train_test_split 
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve 
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Global seaborn look-and-feel applied to every figure in this notebook.
sns.set_style(style="whitegrid")
# NOTE(review): 'font_family' is not spelled like the matplotlib rcParam
# "font.family" -- confirm that this entry actually has an effect.
context_rc = {"lines.linewidth": 2, 'font_family': [u'times']}
sns.set_context(context="notebook", font_scale=1, rc=context_rc)

In [None]:
# Read the character table from the Kaggle input directory.
csv_path = "../input/game-of-thrones/character-predictions.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Count the missing (NaN) values in each column; the resulting Series is
# displayed as the cell output.
nans = df.isna().sum()
nans

The dataset contains data for 1946 characters. For each character it records whether they have died, along with a number of other variables such as gender and nobility.
The data was not clean, so I had to make some changes. There were 22108 NAs in total. It is always better to fill in the NA values that can be filled; sometimes using the mean value for this works. When I looked at the age column, I saw something strange. 


In [None]:
# Mean of the age column, computed before any cleaning of the ages.
print(df["age"].mean())

The mean age was a **negative** value, so I had to fix it. As you can see in Figure 2 below, some characters had a negative age, and they were distorting the mean value. 

In [None]:
# Show which characters have a negative age, followed by the ages themselves.
negative_age = df["age"] < 0
print(df["name"][negative_age])
print(df["age"][negative_age])

In [None]:
# Replace the two negative ages with values taken from the fandom wiki:
#   * Doreah is actually around 25 -- https://gameofthrones.fandom.com/wiki/Doreah
#   * Rhaego was never even born   -- https://gameofthrones.fandom.com/wiki/Rhaego
# NOTE(review): 1684 and 1868 are hard-coded row labels; presumably they are
# the rows reported by the negative-age check -- confirm against its output.
for row_label, fixed_age in ((1684, 25.0), (1868, 0.0)):
    df.loc[row_label, "age"] = fixed_age

In [None]:
# Re-check the mean age now that the negative ages have been replaced.
print(df["age"].mean())

In [None]:
# Fill the NaNs we can.
# The results are assigned back instead of calling fillna(..., inplace=True)
# on the selected column: that chained in-place form is deprecated in
# pandas 2.x and no longer updates `df` once copy-on-write is enabled.
df["age"] = df["age"].fillna(df["age"].mean())
# Every remaining NaN, in any column, becomes an empty string.
df = df.fillna("")

In [None]:
# Recount the missing values per column after filling, and display the result.
nans = df.isna().sum()
nans

There are many ways to deal with NaN values. If there are only a few of them, it may be better to remove those values from the data and continue the analysis. However, there were too many NaN values in the dataset we worked on, so filling in the values that can be filled is the better option in this case. 
I researched the characters and found that, according to https://gameofthrones.fandom.com/wiki/Doreah, Doreah is actually around 25 and, according to https://gameofthrones.fandom.com/wiki/Rhaego, Rhaego was never even born. I replaced the negative ages so that the mean was fixed. However, there were still nulls in the age column, which I replaced with the mean. The other null values in the data were replaced with an empty string. As a result of these steps, there are no NAs left in the data.  
Through exploratory data analysis, I will decide which variables to put in my model.
Violin plots are similar to box plots, but they also show the probability density of the data at different values. Whereas box plots show summary statistics such as the median and interquartile range, violin plots show the full distribution of the data.  


In [None]:
# Violin plot of isNoble against isPopular, split by isAlive.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is now `data`).
sns.violinplot(x="isPopular", y="isNoble", hue="isAlive", data=df, split=True).set_title('Noble and Popular vs Mortality')


In [None]:
# Violin plot of isMarried against isPopular, split by isAlive.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is now `data`).
sns.violinplot(x="isPopular", y="isMarried", hue="isAlive", data=df, split=True).set_title('Married and Popular vs Mortality')


In [None]:
# Violin plot of book1 against isPopular, split by isAlive.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is now `data`).
sns.violinplot(x="isPopular", y="book1", hue="isAlive", data=df, split=True).set_title('Book_1 and Popular vs Mortality')

According to the violin plots:
•	Noble and popular characters are most likely to be alive,
•	Popular characters that appear in book1 are most likely to be dead,
•	Non-popular characters that appear in book1 are most likely to be dead,
•	Single and non-popular characters are most likely to be alive,
•	Single and popular characters are most likely to be dead.
So, if you are a popular, married and noble character in the Game of Thrones universe, you are most likely to be alive. Also, you have to appear in book1. 
	The data preparation process is not over yet. When we look at the culture column, we see that the same culture group is spelled in several different ways. For instance, Westermen appears under three different values: westermen, westerman and westerlands. These naming inconsistencies have been corrected. 
After correcting these values, I wanted to find out the relationship between culture and survival.


In [None]:
# Collect the distinct values of the culture column, to spot spelling variants
# of the same culture.
set(df['culture'])

In [None]:
# Many cultures appear under several spellings, so map every known alias
# (lower-case) to a single canonical culture name.
cult = {
    'Summer Islands': ['summer islands', 'summer islander', 'summer isles'],
    'Ghiscari': ['ghiscari', 'ghiscaricari',  'ghis'],
    'Asshai': ["asshai'i", 'asshai'],
    'Lysene': ['lysene', 'lyseni'],
    'Andal': ['andal', 'andals'],
    'Braavosi': ['braavosi', 'braavos'],
    'Dornish': ['dornishmen', 'dorne', 'dornish'],
    'Myrish': ['myr', 'myrish', 'myrmen'],
    'Westermen': ['westermen', 'westerman', 'westerlands'],
    'Westerosi': ['westeros', 'westerosi'],
    'Stormlander': ['stormlands', 'stormlander'],
    'Norvoshi': ['norvos', 'norvoshi'],
    'Northmen': ['the north', 'northmen'],
    'Free Folk': ['wildling', 'first men', 'free folk'],
    'Qartheen': ['qartheen', 'qarth'],
    'Reach': ['the reach', 'reach', 'reachmen'],
    'Ironborn': ['ironborn', 'ironmen'],
    'Mereen': ['meereen', 'meereenese'],
    'RiverLands': ['riverlands', 'rivermen'],
    'Vale': ['vale', 'valemen', 'vale mountain clans']
}

# Reverse index (alias -> canonical name), built once so that each lookup is a
# single dict access instead of a scan over every alias list. setdefault keeps
# the first group that claims an alias, matching a top-to-bottom scan of `cult`.
_cult_by_alias = {}
for _canonical, _aliases in cult.items():
    for _alias in _aliases:
        _cult_by_alias.setdefault(_alias, _canonical)


def get_cult(value):
    """Return the canonical culture name for a raw ``culture`` value.

    Matching is case-insensitive. A value with no known alias is returned
    title-cased. Non-string input (e.g. NaN or None for a missing culture)
    yields an empty string instead of raising AttributeError.
    """
    if not isinstance(value, str):
        return ""
    key = value.lower()
    return _cult_by_alias.get(key, key.title())
# Rewrite the culture column with the canonical names returned by get_cult.
df.loc[:, "culture"] = list(map(get_cult, df["culture"]))


In [None]:
# How does culture relate to survival?
# Normalise the culture names once more, treating missing values as "".
df.loc[:, "culture"] = list(map(get_cult, df.culture.fillna("")))
# Number of characters per (culture, isAlive) pair, one column per isAlive value.
counts = df.groupby(["culture", "isAlive"]).count()["S.No"]
data = counts.unstack().copy(deep = True)
data.loc[:, "total"] = data.sum(axis = 1)
# Leave out the blank culture, order by group size and draw stacked bars.
ranked = data[data.index != ""].sort_values("total")
p = ranked[[0, 1]].plot.barh(stacked = True, rot = 0, figsize = (14, 12))
_ = (
    p.set(xlabel = "No. of Characters", ylabel = "Culture"),
    p.legend(["Dead", "Alive"], loc = "lower right"),
)

We can see that Northmen and Ironborn are among the most numerous cultures. Some cultures, such as Summer Islands and Crannogmen, have no deaths at all.


In [None]:
# Keep an independent snapshot of the frame as it is at this point
# (DataFrame.copy makes a deep copy by default).
df2 = df.copy()

In [None]:
# Display the frame as it currently stands.
df

In [None]:
# Columns that are left out of the model.
drop = [
    "S.No", "plod", "title", "dateOfBirth", "DateoFdeath",
    "mother", "father", "heir", "house", "spouse",
    "book2", "book3", "book4", "book5",
    "isAliveMother", "isAliveFather", "isAliveHeir", "isAliveSpouse",
    "popularity", "name",
]
df = df.drop(columns=drop)

In [None]:
# Turn the remaining categorical columns into one-hot encoded (dummy) columns;
# the encoded frame replaces df.
df = pd.get_dummies(df)

In [None]:
# Separate the response (isAlive) from the explanatory variables (all other columns).
# NOTE(review): confirm that no remaining column is derived from isAlive
# (target leakage) before trusting the model score.
y = df["isAlive"]  # response
X = df.drop(columns=["isAlive"])  # explanatory variables

For my model, I therefore omitted several variables that were not needed to estimate a character's chance of survival: S.No, plod, title, dateOfBirth, DateoFdeath, mother, father, heir, house, spouse, book2, book3, book4, book5, isAliveMother, isAliveFather, isAliveHeir, isAliveSpouse, popularity and name. I also separated the isAlive variable from the data so that I could use it as the response variable. 
	When I put the remaining variables into the model, its score came to 0.776, which is actually pretty good. Next, I made sure that I could feed the character I wanted to predict into the model. I wanted to estimate the survival probability of Arya Stark. She was my favorite character while I was watching the series, and I knew that she survives at the end of the series.


In [None]:
# Create the logistic regression classifier with scikit-learn's default settings.
lr = LogisticRegression()

In [None]:
# Fit the classifier on all rows of X and y (no hold-out split is made).
lr.fit(X,y)

In [None]:
# Predicted classes for the same rows the model was fitted on.
lr.predict(X)

In [None]:
# Mean accuracy on X, y -- the same data the model was fitted on, so this is
# training accuracy and an optimistic estimate of real predictive performance.
lr.score(X,y)

In [None]:
# Look up Arya Stark's row label in the snapshot that still has the name column.
df2.index[df2["name"] == "Arya Stark"]

In [None]:
# Feature values of the row at position 1466.
# NOTE(review): presumably Arya Stark's row -- confirm against the index lookup.
X.iloc[1466]

In [None]:
# Predicted class probabilities for the row at position 1466.
# The row is selected as a one-row DataFrame (double brackets) so that the
# column names reach the model; wrapping a Series in a list drops them and
# makes scikit-learn warn about missing feature names.
# NOTE(review): 1466 is presumably Arya Stark's position -- confirm.
lr.predict_proba(X.iloc[[1466]])

According to my first model, Arya's chance of survival is 99%. 
I then checked for multicollinearity by looking at the variance inflation factors (VIFs).


In [None]:
# Predicted class for the row at position 1466.
# The row is selected as a one-row DataFrame (double brackets) so that the
# column names reach the model; wrapping a Series in a list drops them and
# makes scikit-learn warn about missing feature names.
# NOTE(review): 1466 is presumably Arya Stark's position -- confirm.
lr.predict(X.iloc[[1466]])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Empty frame that will hold the variance inflation factor of each feature.
vif = pd.DataFrame()

In [None]:
# One variance inflation factor per column of X.
# The feature matrix is converted to a float array once: one-hot columns can
# be boolean, which would make X.values an object array, and the conversion
# does not need to be repeated for every column.
X_float = X.values.astype(float)
vif["VIF Factor"] = [variance_inflation_factor(X_float, i) for i in range(X_float.shape[1])]

In [None]:
# Label each VIF value with the name of the corresponding column of X.
vif["features"] = X.columns

In [None]:
# Display the VIF table.
vif

Using the important variables I found through EDA, the model gives Arya a 0.992 chance of surviving. My model also has no multicollinearity problem. 