# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# inline plotting
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning (we'll talk more about these later!)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


# Loading data


In [None]:
# Read the Titanic training and test CSV files into pandas DataFrames.
# The paths are relative to the notebook's working directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Preview the data: head() returns the first five rows, and as the last
# expression of the cell it is what the notebook displays.
train_df.head()
# test_df.head()

# Aggregate Exploration

In [None]:
# List the column labels of the training set.
# Nomenclature: the input columns are called "features"; the value we want
# to predict is called the "class".
train_df.columns

In [None]:
# Quick aggregate view: describe() reports count, mean, std, min, quartiles
# and max. With default arguments on a mixed-type frame only the numeric
# columns are summarized.
train_df.describe()
#test_df.info()


In [None]:
# Check whether the training set has missing values.
# isnull() marks every missing cell as True; sum() then adds those booleans
# up column by column, so the result is the number of missing values per column.
train_df.isnull().sum()

In [None]:
# Same check for the test data: number of missing values in each column.
test_df.isnull().sum()


# Feature Analysis


## Embarked

In [None]:
# How many passengers have each Embarked value, most frequent first.
# value_counts() leaves missing values out by default.
train_df['Embarked'].value_counts()
#train_df['Embarked'].mode()

In [None]:
# Plots make the per-port breakdown easier to read than raw tables.
# Rows with a missing Embarked value are left out of every group below.

# 1) Where did people board? Count the non-null tickets in each Embarked group.
embarked_count = train_df["Ticket"].groupby(train_df["Embarked"]).count()
ax1 = embarked_count.plot(kind="bar", fontsize=12, figsize=(5, 3))
ax1.set_ylabel("Count", fontsize=12)
ax1.set_xlabel("Embarked", fontsize=12)
plt.show()

# 2) Where did the survivors board? Survived is summed within each group,
#    which gives the number of survivors if it is a 0/1 flag.
embarked_survival = train_df.groupby(["Embarked"])[["Survived"]].sum()
ax2 = embarked_survival.plot(kind="bar", fontsize=12, figsize=(5, 3))
ax2.set_ylabel("Survived", fontsize=12)
ax2.set_xlabel("Embarked", fontsize=12)
plt.show()

# 3) What were your chances of surviving depending on where you boarded?
#    The mean of Survived within each group is that group's survival rate.
embarked_perc = train_df.groupby(["Embarked"])[["Survived"]].mean()
ax3 = embarked_perc.plot(kind="bar", fontsize=12, figsize=(5, 3))
ax3.set_ylabel("Mean Survival Rate", fontsize=12)
ax3.set_xlabel("Embarked", fontsize=12)
plt.show()

In [None]:
# Deal with the null Embarked values found during the aggregate exploration:
# replace every missing entry with "S" in both data sets.
# NOTE(review): "S" is presumably the most frequent value - confirm against
# the value_counts() output.
train_df["Embarked"] = train_df["Embarked"].fillna("S")
test_df["Embarked"] = test_df["Embarked"].fillna("S")

# Sanity check: remaining missing Embarked values as (train, test).
(train_df["Embarked"].isnull().sum(), test_df["Embarked"].isnull().sum())

## Age

In [None]:
# Moving on to the "Age" feature.

# Overall distribution: histogram of every known age (missing ages dropped).
ax1 = train_df["Age"].dropna().hist()
ax1.set_ylabel("Count", fontsize=12)
ax1.set_xlabel("Age", fontsize=12)
plt.show()

# Survivors per age: keep rows where both columns are present, then sum
# Survived for each distinct age value.
age_survived = (
    train_df[["Age", "Survived"]]
    .dropna()
    .groupby(["Age"], as_index=False)
    .sum()
)
ax2 = age_survived.plot(x="Age", y="Survived", kind="bar", fontsize=12, figsize=(20, 5))
ax2.set_ylabel("Survived", fontsize=12)
ax2.set_xlabel("Age", fontsize=12)
plt.show()

# Finally, your chances of surviving depending on your age: the mean of
# Survived for each distinct age value.
age_perct = (
    train_df[["Age", "Survived"]]
    .dropna()
    .groupby(["Age"], as_index=False)
    .mean()
)
ax3 = age_perct.plot(x="Age", y="Survived", kind="bar", fontsize=12, figsize=(20, 5))
ax3.set_ylabel("Mean Survival", fontsize=12)
ax3.set_xlabel("Age", fontsize=12)
plt.show()

In [None]:
# Let's fill in the null values here as well. This can be tricky: there is no
# single right answer. Here every missing age is replaced by a random draw
# from a normal distribution fitted to the known ages of the same data set.
# No random seed is set, so the filled-in values differ from run to run.

# Mean, standard deviation and number of NaN's in train_df
# (mean() and std() skip missing values).
average_age_train = train_df["Age"].mean()
std_age_train = train_df["Age"].std()
count_nan_age_train = train_df["Age"].isnull().sum()

# Mean, standard deviation and number of NaN's in test_df
average_age_test = test_df["Age"].mean()
std_age_test = test_df["Age"].std()
count_nan_age_test = test_df["Age"].isnull().sum()

# Draw one value per missing age. A normal distribution is unbounded, so the
# draws are clipped at 0 to rule out impossible negative ages.
train_random = np.clip(
    np.random.normal(average_age_train, std_age_train, count_nan_age_train),
    0,
    None,
)
test_random = np.clip(
    np.random.normal(average_age_test, std_age_test, count_nan_age_test),
    0,
    None,
)

# Write the draws into the missing slots with a single .loc assignment.
# Chained indexing (df["Age"][mask] = values) assigns through an intermediate
# object: it triggers SettingWithCopyWarning and leaves the DataFrame
# unchanged when pandas Copy-on-Write is enabled.
train_df.loc[train_df["Age"].isnull(), "Age"] = train_random
test_df.loc[test_df["Age"].isnull(), "Age"] = test_random

# Convert from float to int so that ages are whole years
# (astype(int) drops the fractional part).
train_df["Age"] = train_df["Age"].astype(int)
test_df["Age"] = test_df["Age"].astype(int)

# How do we look now? Both Age null counts should be 0.
print(train_df["Age"].isnull().sum(), test_df["Age"].isnull().sum())

# Let's now look at what the age distribution looks like after the fill
ax1 = train_df["Age"].dropna().hist()
ax1.set_xlabel("Age", fontsize=12)
ax1.set_ylabel("Count", fontsize=12)
plt.show()


In [None]:
# Cabin doesn't seem to contribute to survival and has a lot of missing values.
# Count them: isnull() yields one boolean per row and sum() counts the True
# ones. (count() would return the number of rows instead, because the boolean
# mask itself never contains missing values.)
train_df["Cabin"].isnull().sum()


In [None]:
# Fill any missing Embarked value in the training set with "S", then show how
# many missing values remain. The Embarked section above already applies the
# same fill, so re-running it here changes nothing.
train_df["Embarked"] = train_df["Embarked"].fillna("S")
train_df["Embarked"].isnull().sum()

In [None]:
# pclass
# embarked_survival = train_df[['Embarked','Survived']].groupby(["Embarked"]).sum()
# ax1 = embarked_survival.plot(kind='bar', figsize=(5, 3), fontsize=12)
# ax1.set_xlabel("Embarked", fontsize=12)
# ax1.set_ylabel("Survival", fontsize=12)
# ax1.plot()
# plt.show()
# embarked_count = train_df.groupby(["Embarked"])["Ticket"].count()
# ax2 = embarked_count.plot(kind='bar', figsize=(5, 3), fontsize=12)
# ax2.set_xlabel("Embarked", fontsize=12)
# ax2.set_ylabel("Count", fontsize=12)
# ax2.plot()
# plt.show()
