In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [3]:
# Load the Titanic training CSV into `df` and show its column names.
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df.columns

In [4]:
# Work on a copy of the raw frame with the identifier column removed.
data = df.copy()
del data["PassengerId"]
data.head()

In [5]:
import seaborn as sns

#### Exploratory Data Analysis

In [6]:
# Number of non-null "Survived" entries per passenger class.
data.groupby('Pclass')['Survived'].count().reset_index()

In [7]:
# Fare distribution for each passenger class.
sns.catplot(data=data, x="Pclass", y="Fare", kind="boxen");

In [8]:
# Fare distribution split by survival outcome.
sns.catplot(data=data, x="Survived", y="Fare", kind="boxen");

#### Cleaning the "Cabin" column

In [9]:
# Most frequent raw "Cabin" value (the first one when several are tied).
cabin_modes = data["Cabin"].mode()
mode = cabin_modes[0]
mode

In [10]:
# First 13 entries of the "Cabin" column.
data["Cabin"].iloc[:13]

In [11]:
def get_cabin(v):
    """Shorten every string entry of a Series to its first character.

    Non-string entries (for example missing values) are left untouched.
    The Series is modified in place and also returned, so callers can either
    rely on the mutation or assign the result back to a frame column.

    Parameters
    ----------
    v : pandas.Series
        Series of cabin labels; may contain missing values.

    Returns
    -------
    pandas.Series
        The same object ``v`` with its string entries shortened.
    """
    # Select by a boolean mask rather than a running position counter, so the
    # assignment is correct whatever index the Series carries.
    is_text = v.map(lambda value: isinstance(value, str)).astype(bool)
    # Guard: a Series without any string has no `.str` accessor to call.
    if is_text.any():
        v.loc[is_text] = v.loc[is_text].str[:1]
    return v

In [12]:
# Shorten each cabin label to its first character. The result is assigned back
# explicitly instead of relying on the column being mutated through a view.
data["Cabin"] = get_cabin(data["Cabin"])
# The number of people in each Cabin
print(data.groupby('Cabin')['Survived'].count().reset_index())
# The number of people that survived in each Cabin
# ("Survived" is selected first so the text columns are never summed.)
print(data.groupby('Cabin')['Survived'].sum().reset_index())

### Replace Missing Values

In [13]:
# Print the number of missing entries per column, then show the frame's shape.
for column_name in data.columns:
    n_missing = sum(data[column_name].isnull())
    print(column_name, n_missing)
data.shape

In [14]:
def replace_mean(data, column):
    """Fill the missing entries of ``data[column]`` with the column mean.

    The mean skips missing values and is rounded to two decimals before it
    is written. ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column is filled.
    column : label
        Column to fill (the notebook also passes a list of labels).

    Returns
    -------
    The updated ``data[column]``.
    """
    column_mean = data[column].mean(axis=0, skipna=True)
    fill_value = round(column_mean, 2)
    filled = data[column].replace(np.nan, fill_value)
    data[column] = filled
    return data[column]

def replace_mode(data, column):
    """Fill the missing entries of ``data[column]`` with the column's mode.

    When several values are tied, the first one returned by ``mode()`` is
    used. ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column is filled.
    column : label
        Column to fill.

    Returns
    -------
    pandas.Series
        The updated ``data[column]``.
    """
    most_common = data[column].mode()[0]
    filled = data[column].replace(np.nan, most_common)
    data[column] = filled
    return data[column]

In [15]:
# Fill the missing "Age" values with the rounded column mean.
replace_mean(data=data, column="Age")


# Fill the missing "Cabin" values with the column's most frequent value.
# NOTE(review): the original note said the gaps become a new letter "H"
# because so much of the column is missing, but replace_mode writes the
# column's mode - confirm which was intended.
replace_mode(data=data, column="Cabin")


# Fill the missing "Embarked" values with the column's mode (the author notes
# this is "S" and that only 2 values are missing).
replace_mode(data=data, column="Embarked")

In [16]:
# Print the number of missing entries per column after the fill step.
for column_name in data.columns:
    n_missing = sum(data[column_name].isnull())
    print(column_name, n_missing)

#### Survival of Women

In [17]:
# Head-count and survivor count for female passengers. "Survived" is selected
# before aggregating so the text columns of the raw frame are never summed
# (a frame-wide sum over mixed text / missing values can raise TypeError).
women_survived = df.loc[df['Sex'] == 'female', 'Survived']
print("%d women were onboard"% women_survived.count())
print("%d women survived"% women_survived.sum())

In [18]:
# The rows that contain only "female"
cc = df[df['Sex'] == 'female']
# Number of women who survived per "Pclass". "Survived" is selected before
# summing so the text columns are not aggregated as well.
cc.groupby('Pclass')['Survived'].sum().reset_index()

#### Survival of Men

In [19]:
# Head-count and survivor count for male passengers. "Survived" is selected
# before aggregating so the text columns of the raw frame are never summed
# (a frame-wide sum over mixed text / missing values can raise TypeError).
men_survived = df.loc[df['Sex'] == 'male', 'Survived']
print("%d men were onboard"% men_survived.count())
print("%d men survived"% men_survived.sum())

In [20]:
# The rows that contain only "male"
cd = df[df['Sex'] == 'male']
# Number of men who survived per "Pclass". "Survived" is selected before
# summing so the text columns are not aggregated as well.
cd.groupby('Pclass')['Survived'].sum().reset_index()

#### Change Name to length of Name

In [21]:
# Replace each name by its character count. The lengths are read from the raw
# frame `df` (same rows as `data`), so re-running this cell gives the same
# result instead of failing on the already-converted integer column.
data["Name"] = [len(full_name) for full_name in df["Name"]]
data["Name"]

*This shows that the cabin that a passenger is in affects whether they survive or not*

### **The survival function**

In [22]:
def c_e_survival(data, c_e, c_e_survival_rate, embarked_survival_rate):
    """Add a column holding the survival rate looked up for each row's category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that receives the new column (modified in place).
    c_e : label
        Existing column whose values are used as lookup keys.
    c_e_survival_rate : label
        Name of the column to create or overwrite.
    embarked_survival_rate : mapping or pandas.Series
        Lookup from category value to rate; despite the parameter name it is
        used for every category column, not only "Embarked".

    Returns
    -------
    pandas.Series
        The newly written ``data[c_e_survival_rate]``.

    Raises
    ------
    KeyError
        If a value of ``data[c_e]`` is missing from the lookup.
    """
    looked_up = []
    for category in data[c_e]:
        looked_up.append(embarked_survival_rate[category])
    data[c_e_survival_rate] = looked_up
    return data[c_e_survival_rate]

#### Cabin Survival Rate

In [23]:
# The number of people in each Cabin
# (grouping by the column name and selecting "Survived" first avoids
# aggregating every other column of the frame).
print(data.groupby("Cabin", dropna=False)["Survived"].count())
# The number of people that survived in each Cabin
print(data.groupby("Cabin", dropna=False)["Survived"].sum())

In [24]:
# Mean of "Survived" per cabin value. The column is selected before averaging:
# a frame-wide mean() would also try to average the text columns, which raises
# TypeError on pandas >= 2.0.
Cabin_survival_rate = data.groupby("Cabin")["Survived"].mean()
Cabin_survival_rate

In [25]:
# Write each row's cabin survival rate into a new "Cabin_survival_rate" column.
c_e_survival(
    data,
    c_e="Cabin",
    c_e_survival_rate="Cabin_survival_rate",
    embarked_survival_rate=Cabin_survival_rate,
)

#### "Embarked" Column

In [26]:
# First 13 entries of the "Embarked" column.
data["Embarked"].iloc[:13]

#### Calculate the "Embarked" survival rate

In [27]:
# The number of people from each location
# (grouping by the column name and selecting "Survived" first avoids
# aggregating every other column of the frame).
print(data.groupby("Embarked", dropna=False)["Survived"].count())
# The number of people that survived in each location
print(data.groupby("Embarked", dropna=False)["Survived"].sum())

As you can see, the port of embarkation plays a large role in a passenger's survival.

In [28]:
# Mean of "Survived" per embarkation value. The column is selected before
# averaging: a frame-wide mean() would also try to average the text columns,
# which raises TypeError on pandas >= 2.0.
embarked_survival_rate = data.groupby("Embarked")["Survived"].mean()
embarked_survival_rate

In [29]:
# Write each row's embarkation survival rate into "Embarked_survival_rate".
c_e_survival(
    data,
    c_e="Embarked",
    c_e_survival_rate="Embarked_survival_rate",
    embarked_survival_rate=embarked_survival_rate,
)

In [30]:
# Evaluated but not rendered: only a cell's last expression is displayed.
data.head()
# Per-ticket non-null counts of every column, restricted to survivors.
survivors = data.loc[data["Survived"] == 1]
survivors.groupby("Ticket").count()

#### Calculate the "Age" survival rate

In [31]:
# List the distinct values present in the "Age" column.
data.loc[:, "Age"].unique()

In [32]:
# Survival outcome plotted against age on a large canvas.
sns.catplot(data=data, x="Age", y="Survived", height=11, aspect=2)

In [33]:
# Draft list of age brackets; it is only displayed, not assigned to a name.
[(-np.inf, 1),(2, 5),(6, 16), (17, 27), (28, 49), (50, 69), (70, np.inf)]

In [34]:
# List the distinct values present in the "Age" column.
data.loc[:, "Age"].unique()

In [35]:
# Age brackets used to group passengers before computing survival rates.
# Consecutive edges form the intervals (-inf, 1], (1, 5], ... (69, inf].
age_edges = [-np.inf, 1, 5, 16, 27, 39, 49, 69, np.inf]
bins = pd.IntervalIndex.from_tuples(list(zip(age_edges[:-1], age_edges[1:])))
bins

In [36]:
# Assign every passenger to the age bracket that contains their age.
data['age_bracket'] = pd.cut(x=data['Age'], bins=bins)
data['age_bracket']

In [37]:
# Mean of "Survived" per age bracket. The column is selected before averaging:
# a frame-wide mean() would also try to average the text columns, which raises
# TypeError on pandas >= 2.0. observed=False keeps every bracket in the result,
# including any bracket with no passengers.
age_survival_rate = data.groupby("age_bracket", observed=False)["Survived"].mean()
age_survival_rate

In [38]:
# Write each row's age-bracket survival rate into "age_survival_rate".
c_e_survival(
    data,
    c_e="age_bracket",
    c_e_survival_rate='age_survival_rate',
    embarked_survival_rate=age_survival_rate,
)

#### Calculate the "Parch" survival rate

In [39]:
# Non-null counts of every column for each "Parch" value.
data.groupby(by="Parch").count()

In [40]:
# Mean of "Survived" per "Parch" value. The column is selected before
# averaging: a frame-wide mean() would also try to average the text columns,
# which raises TypeError on pandas >= 2.0.
parch_survival_rate = data.groupby("Parch")["Survived"].mean()
parch_survival_rate

In [41]:
# Write each row's "Parch" survival rate into "parch_survival_rate".
c_e_survival(
    data,
    c_e="Parch",
    c_e_survival_rate='parch_survival_rate',
    embarked_survival_rate=parch_survival_rate,
)

#### Calculate the "Pclass" survival rate

In [42]:
# List the distinct values present in the "Pclass" column.
data.loc[:, "Pclass"].unique()

In [43]:
# Mean of "Survived" per passenger class. The column is selected before
# averaging: a frame-wide mean() would also try to average the text columns,
# which raises TypeError on pandas >= 2.0.
pclass_survival_rate = data.groupby("Pclass")["Survived"].mean()
pclass_survival_rate

In [44]:
# Write each row's passenger-class survival rate into "pclass_survival_rate".
c_e_survival(
    data,
    c_e="Pclass",
    c_e_survival_rate='pclass_survival_rate',
    embarked_survival_rate=pclass_survival_rate,
)

#### Calculate the "SibSp" survival rate

In [45]:
# List the distinct values present in the "SibSp" column.
data.loc[:, "SibSp"].unique()

In [46]:
# Mean of "Survived" per "SibSp" value. The column is selected before
# averaging: a frame-wide mean() would also try to average the text columns,
# which raises TypeError on pandas >= 2.0.
sibsp_survival_rate = data.groupby("SibSp")["Survived"].mean()
sibsp_survival_rate

In [47]:
# Write each row's "SibSp" survival rate into "sibsp_survival_rate".
c_e_survival(
    data,
    c_e="SibSp",
    c_e_survival_rate='sibsp_survival_rate',
    embarked_survival_rate=sibsp_survival_rate,
)

#### Calculate the "Sex" survival rate

In [48]:
# Mean of "Survived" per "Sex" value. The column is selected before averaging:
# a frame-wide mean() would also try to average the text columns, which raises
# TypeError on pandas >= 2.0.
sex_survival_rate = data.groupby("Sex")["Survived"].mean()
sex_survival_rate

In [49]:
# Write each row's "Sex" survival rate into "sex_survival_rate".
c_e_survival(
    data,
    c_e="Sex",
    c_e_survival_rate='sex_survival_rate',
    embarked_survival_rate=sex_survival_rate,
)

In [50]:
# Print the number of missing entries per column, new rate columns included.
for column_name in data.columns:
    n_missing = sum(data[column_name].isnull())
    print(column_name, n_missing)

In [51]:
# Keep the last space-separated token of every ticket, so tickets with one or
# several spaces contribute only their trailing number; a ticket without any
# space is kept whole (splitting it yields the ticket itself).
g = [ticket.split(" ")[-1] for ticket in data["Ticket"]]
len(g)

In [52]:
# Store the extracted ticket tokens (list `g`) as a new "Ticket_num" column.
data["Ticket_num"] = g

In [53]:
# Summarise "Ticket_num" to see how many distinct ticket numbers there are.
data["Ticket_num"].describe()

There are far too many unique ticket numbers for this column to be useful to the model.

In [54]:
# Replace the non-numeric ticket token "LINE" with "1601".
# The result is assigned back to the column: an in-place replace on the
# selection `data["Ticket_num"]` is a chained assignment that does not update
# the frame when pandas copy-on-write is active.
# NOTE(review): the reason for choosing "1601" is not documented here.
data["Ticket_num"] = data["Ticket_num"].replace("LINE", "1601")

In [55]:
# Show any rows whose ticket number is still "LINE".
still_line = data["Ticket_num"] == "LINE"
data.loc[still_line]

In [56]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
pip install sklearn_evaluation

In [58]:
# Columns fed to the model.
features = [
    'Pclass', 'Name', 'Age', 'Fare',
    'Cabin_survival_rate', 'Embarked_survival_rate', 'age_survival_rate',
    'pclass_survival_rate', 'sibsp_survival_rate', 'sex_survival_rate',
]

# Target vector and feature matrix, both taken as copies so later changes to
# them cannot touch the engineered training frame.
y = data["Survived"].copy()
X = data[features].copy()



In [59]:
from sklearn_evaluation import plot

In [60]:
# Fixed seed so the split and the fitted forest are the same on every run.
SEED = 42

# NOTE(review): the *_survival_rate features were computed from the whole
# training frame before this split, so the held-out rows contributed to their
# own features and the scores measured on X_test are likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)

# plot all features
ax = plot.feature_importances(model)

In [61]:
# Numbered list (starting at 1) of the model's input columns.
for position, feature_name in enumerate(X.columns, start=1):
    print(position, feature_name)

In [62]:
from sklearn.metrics import confusion_matrix, classification_report


# Predict the held-out rows with the fitted model.
y_pred = model.predict(X=X_test)

# Confusion matrix, then the per-class classification report.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))



In [63]:
# F1 score of the model on each of 3 cross-validation folds.
f1_scores = cross_val_score(estimator=model, X=X, y=y, cv=3, scoring="f1")
print(f1_scores)

In [64]:
# Load the Kaggle test split; `test` stays raw and `test_data` is the working copy.
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test_data = test.copy()
# Untouched copy that supplies the PassengerId column of the submission file.
Id = test.copy()
# Preview placed last so it is actually rendered as the cell output.
test_data.head()

In [65]:
# Shorten each test cabin label to its first character. The result is assigned
# back explicitly instead of relying on the column being mutated through a view.
test_data["Cabin"] = get_cabin(test_data["Cabin"])
test_data["Cabin"]

In [66]:
# Fill the missing "Age" and "Fare" values with each column's rounded mean.
replace_mean(data=test_data, column=["Age", "Fare"])

# Fill the missing "Cabin" values with the column's most frequent value.
replace_mode(data=test_data, column="Cabin")

In [67]:
# Print the number of missing entries per test column, then show the shape.
for column_name in test_data.columns:
    n_missing = sum(test_data[column_name].isnull())
    print(column_name, n_missing)
test_data.shape

#### Change "Name" to length of "Name"

In [68]:
# Replace each name by its character count. The lengths are read from the raw
# frame `test` (same rows as `test_data`), so re-running this cell gives the
# same result instead of failing on the already-converted integer column.
test_data["Name"] = [len(full_name) for full_name in test["Name"]]
test_data["Name"]

#### Cabin Survival Rate

In [69]:
# Look up the training-set cabin survival rate for every test row.
c_e_survival(
    test_data,
    c_e="Cabin",
    c_e_survival_rate="Cabin_survival_rate",
    embarked_survival_rate=Cabin_survival_rate,
)

#### Embarked Survival Rate

In [70]:
# Look up the training-set embarkation survival rate for every test row.
c_e_survival(
    test_data,
    c_e="Embarked",
    c_e_survival_rate="Embarked_survival_rate",
    embarked_survival_rate=embarked_survival_rate,
)

#### Age Survival Rate

In [71]:
# Age brackets for the test rows (same edges as for the training rows).
# Consecutive edges form the intervals (-inf, 1], (1, 5], ... (69, inf].
age_edges = [-np.inf, 1, 5, 16, 27, 39, 49, 69, np.inf]
bins = pd.IntervalIndex.from_tuples(list(zip(age_edges[:-1], age_edges[1:])))
bins

In [72]:
# Bracket the test ages, then look up the training-set rate of each bracket.
test_data['age_bracket'] = pd.cut(x=test_data['Age'], bins=bins)
c_e_survival(
    test_data,
    c_e="age_bracket",
    c_e_survival_rate='age_survival_rate',
    embarked_survival_rate=age_survival_rate,
)

#### Pclass Survival Rate

In [73]:
# Look up the training-set passenger-class survival rate for every test row.
c_e_survival(
    test_data,
    c_e="Pclass",
    c_e_survival_rate='pclass_survival_rate',
    embarked_survival_rate=pclass_survival_rate,
)

#### SibSp survival rate

In [74]:
# Look up the training-set "SibSp" survival rate for every test row.
c_e_survival(
    test_data,
    c_e="SibSp",
    c_e_survival_rate='sibsp_survival_rate',
    embarked_survival_rate=sibsp_survival_rate,
)

#### Sex survival rate

In [75]:
# Look up the training-set "Sex" survival rate for every test row.
c_e_survival(
    test_data,
    c_e="Sex",
    c_e_survival_rate='sex_survival_rate',
    embarked_survival_rate=sex_survival_rate,
)

In [76]:
# Print the number of missing entries per test column, new rate columns included.
for column_name in test_data.columns:
    n_missing = sum(test_data[column_name].isnull())
    print(column_name, n_missing)

In [77]:
# Predict survival for the test rows using the same feature columns as training.
test_features = test_data[features]
predictions = model.predict(test_features)

In [78]:
# Pair every test PassengerId with its prediction and write the submission CSV.
submission_columns = {'PassengerId': Id.PassengerId, 'Survived': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")