Linear classification on the Titanic data set with TensorFlow

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess

from pandas import Series, DataFrame
from pandas.tools.plotting import scatter_matrix
from patsy import dmatrices

import tensorflow as tf
from sklearn import datasets, svm
#for heat map
import seaborn as sns

import tempfile

In [None]:
# Load the training CSV; the exploratory cells below all work on `data_file`.
data_file = pd.read_csv("../input/train.csv")

In [None]:


# Clean the data set: remove the Ticket and Cabin columns (the author notes
# they are mostly NaN), then discard every row that still has a NaN in any
# remaining column.
data_file = data_file.drop(['Ticket', 'Cabin'], axis=1).dropna()
# Preview the first five cleaned rows.
data_file.head(5)

In [None]:
# Overview figure: eight panels on a 4x2 grid, filled row by row.
#   1. Number of people who survived / did not survive
#   2. Age density (KDE) of survivors
#   3. Age density (KDE) of non-survivors
#   4. Passenger count per class
#   5. Passenger count per class, survivors only
#   6. Age density (KDE) within each class
#   7. Passenger count per Embarked value
#   8. Fare density (KDE) for non-survivors and survivors
# Every plot call names its target axes explicitly instead of relying on
# pyplot's "current axes".
fig = plt.figure(figsize=(30, 25), dpi=100)
grid = (4, 2)

# Row selectors reused by several panels.
lived = data_file.Survived == 1
died = data_file.Survived == 0

# Panel 1: survival counts.
subplot1 = plt.subplot2grid(grid, (0, 0))
data_file.Survived.value_counts().plot(kind='bar', alpha=0.4, ax=subplot1)
subplot1.set_xlim(-2, 2)
subplot1.set_title("Number of people survived, not survived")

# Panel 2: age density of survivors.
subplot2 = plt.subplot2grid(grid, (0, 1))
data_file.Age[lived].plot(kind='kde', ax=subplot2)
subplot2.set_title("Age distribution of survivers")

# Panel 3: age density of non-survivors.
subplot3 = plt.subplot2grid(grid, (1, 0))
data_file.Age[died].plot(kind='kde', ax=subplot3)
subplot3.set_title("Age distribution of non-survivers")

# Panel 4: passengers per class; the x-range leaves a margin around the bars.
subplot4 = plt.subplot2grid(grid, (1, 1))
class_counts = data_file.Pclass.value_counts()
class_counts.plot(kind="bar", alpha=0.4, ax=subplot4)
subplot4.set_xlim(-1, len(class_counts))
subplot4.set_title("Class distribution")

# Panel 5: passengers per class, survivors only.
subplot5 = plt.subplot2grid(grid, (2, 0))
survivor_class_counts = data_file.Pclass[lived].value_counts()
survivor_class_counts.plot(kind="bar", alpha=0.4, ax=subplot5)
subplot5.set_xlim(-1, len(survivor_class_counts))
subplot5.set_title("Class distribution for survivers")

# Panel 6: one age density curve per class, drawn in class order so the
# legend entries line up with the curves.
subplot6 = plt.subplot2grid(grid, (2, 1))
for passenger_class in (1, 2, 3):
    data_file.Age[data_file.Pclass == passenger_class].plot(kind='kde', ax=subplot6)
subplot6.set_title("Age distribution with each class")
subplot6.legend(('First class','Second class','Third class'), loc='best')

# Panel 7: horizontal bars, one per Embarked value.
subplot7 = plt.subplot2grid(grid, (3, 0))
embarked_counts = data_file.Embarked.value_counts()
embarked_counts.plot(kind='barh', alpha=0.4, ax=subplot7)
subplot7.set_ylim(-1, len(embarked_counts))
subplot7.set_title("Embark count")

# Panel 8: fare density, non-survivors first to match the legend order.
subplot8 = plt.subplot2grid(grid, (3, 1))
data_file.Fare[died].plot(kind='kde', ax=subplot8)
data_file.Fare[lived].plot(kind='kde', ax=subplot8)
subplot8.set_title("Fare distribution among survivors and non-survivers")
subplot8.legend(('Non-survivors','Survivors'), loc='best')

In [None]:
# Scatter-plot matrix of data_file with a KDE on the diagonal, as a visual
# check for correlation between columns. Author's note: because most columns
# hold integer values, the scatter panels do not reveal much.
pd.set_option('display.mpl_style', 'default')

scatter_matrix(data_file, alpha=0.2, figsize=(15, 15), diagonal='kde')

In [None]:
# The correlations can also be shown as a table. For that, work on an adjusted
# version of data_file without the PassengerId and Name columns.
new_data_file = data_file.drop(['PassengerId','Name'], axis=1)
def to_numeric_str(var):
    """Encode a Sex value as a digit string.

    Returns '0' when ``var`` equals 'male' and '1' for every other value.
    """
    return '0' if var == 'male' else '1'
# Recode the Sex column numerically: each value is first mapped to its digit
# string by to_numeric_str ('male' -> '0', anything else -> '1'), and the
# strings are then parsed into numbers.
sex_codes = new_data_file['Sex'].map(to_numeric_str)
new_data_file['Sex'] = pd.to_numeric(sex_codes)
# Preview the first five rows of the adjusted frame.
new_data_file.head(5)

In [None]:
# Pairwise correlation of the columns of new_data_file, drawn as a heat map
# whose ticks on both axes are labelled with the column names.
corr = new_data_file.corr()
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

In [None]:
# Survival counts split by sex: two bar charts side by side, sharing the same
# axis limits so the bar heights can be compared directly.
# Author's observation from these panels: relative to each other, more men
# died and more women survived.
fig = plt.figure(figsize=(18, 12), dpi=100)

# Left panel: outcomes for male passengers.
subplot9 = fig.add_subplot(4, 4, 1)
male_outcomes = data_file.Survived[data_file.Sex == 'male'].value_counts()
male_outcomes.plot(kind='bar', label='Male', ax=subplot9)
subplot9.set_xlim(-2, 3)
subplot9.set_ylim(0, 400)
subplot9.set_title("Male who died and survived")
subplot9.legend(loc='best')

# Right panel: outcomes for female passengers.
subplot10 = fig.add_subplot(4, 4, 2)
female_outcomes = data_file.Survived[data_file.Sex == 'female'].value_counts()
female_outcomes.plot(kind='bar', label='Female', ax=subplot10)
subplot10.set_xlim(-2, 3)
subplot10.set_ylim(0, 400)
subplot10.set_title("Female who survived and died")
subplot10.legend(loc='best')

In [None]:
# Survival counts split by sex and passenger class.
# "High class" means Pclass 1 or 2 (Pclass != 3); "low class" means Pclass 3.
# Note that Pclass == 1 is first class, so it must not be used to select the
# low-class panels: that would show a subset of the high-class passengers.
# Author's conclusion from these panels: high-class passengers were saved
# first, then low class, and within each group females fared better.
def _plot_survival_counts(fig, position, mask, title, y_max):
    """Draw a bar chart of Survived counts for the rows selected by ``mask``.

    The chart is placed in slot ``position`` of a 4x4 grid on ``fig`` and its
    y-axis runs from 0 to ``y_max``. Returns the axes that was created.
    """
    ax = fig.add_subplot(4, 4, position)
    data_file.Survived[mask].value_counts().plot(kind='bar', ax=ax)
    ax.set_ylim(0, y_max)
    ax.set_title(title)
    return ax


fig = plt.figure(figsize=(18, 20), dpi=100)

# Combine the conditions into one aligned boolean mask per panel instead of
# chaining two separate boolean selections.
is_female = data_file.Sex == 'female'
is_male = data_file.Sex == 'male'
is_high_class = data_file.Pclass != 3
is_low_class = data_file.Pclass == 3

high_class_females = is_female & is_high_class
low_class_females = is_female & is_low_class
high_class_males = is_male & is_high_class
low_class_males = is_male & is_low_class

# All four panels share one y-scale so they can be compared. Keep the 0-200
# range unless some group has a larger count (the third-class groups may),
# in which case grow the range so that no bar is clipped.
largest_count = max(
    data_file.Survived[mask].value_counts().max()
    for mask in (high_class_females, low_class_females,
                 high_class_males, low_class_males))
y_max = max(200, 1.1 * largest_count)

subplot11 = _plot_survival_counts(fig, 1, high_class_females, "High class females", y_max)
subplot12 = _plot_survival_counts(fig, 2, low_class_females, "Low class females", y_max)
subplot13 = _plot_survival_counts(fig, 5, high_class_males, "High class Males", y_max)
subplot14 = _plot_survival_counts(fig, 6, low_class_males, "Low class Males", y_max)

In [None]:
# With the exploratory analysis done, prepare the data for the TensorFlow
# linear model. The frames built here are converted to tensors by the input
# functions before the graph is constructed and trained.

data_columns = ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']

# Number of labelled rows kept out of training and used only for evaluation.
N_EVAL_ROWS = 20

# test.csv has no Survived column (author's note), so it cannot be used to
# score the model. Evaluation rows are therefore taken from train.csv.
labelled_rows = pd.read_csv("../input/train.csv")
labelled_rows = labelled_rows.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
labelled_rows = labelled_rows.dropna()

# Hold the first N_EVAL_ROWS rows out of the training set. Evaluating on rows
# the model was also fitted on would report an optimistic accuracy.
df_test = labelled_rows.iloc[:N_EVAL_ROWS]
df_train = labelled_rows.iloc[N_EVAL_ROWS:]

# Name of the label column, and the feature names grouped by how they are fed
# to the model. Pclass, SibSp and Parch are treated as numeric here rather
# than as categories.
label_val = 'Survived'
catagorical = ['Sex','Embarked']
continous = ['Pclass','SibSp','Parch','Age','Fare']

In [None]:
#Define funtions
def input_fn(df):
  continuous_cols = {k: tf.constant(df[k].values) for k in continous}

  categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      shape=[df[k].size, 1])
                      for k in catagorical}

  feature_cols = dict(continuous_cols.items() + categorical_cols.items())

  label = tf.constant(df[label_val].values)
  return feature_cols, label

def train_input_fn():
  return input_fn(df_train)

def eval_input_fn():
  return input_fn(df_test)

def test_input(df):
  continuous_cols = {k: tf.constant(df[k].values) for k in continous}

  categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      shape=[df[k].size, 1])
                      for k in catagorical}

  feature_cols = dict(continuous_cols.items() + categorical_cols.items())
  return feature_cols

In [None]:
# Feature columns for the linear model. This first version keeps the feature
# engineering simple: no crossed columns and no bucketing.

# String-valued features: sparse columns with an explicit list of keys.
embarked = tf.contrib.layers.sparse_column_with_keys(
    column_name='Embarked', keys=['C','Q','S'])
sex = tf.contrib.layers.sparse_column_with_keys(
    column_name='Sex', keys=['male','female'])

# Numeric features: real-valued columns. Pclass, SibSp and Parch are fed as
# plain numbers here rather than as sparse categorical columns.
_real_valued_column = tf.contrib.layers.real_valued_column
age = _real_valued_column('Age')
fare = _real_valued_column('Fare')
pclass = _real_valued_column('Pclass')
sibsp = _real_valued_column('SibSp')
parch = _real_valued_column('Parch')

In [None]:
# Build the linear classifier. It is optimised with FTRL using both L1 and L2
# regularisation, and model_dir points at a freshly created temporary
# directory.
model_dir = tempfile.mkdtemp()

ftrl_optimizer = tf.train.FtrlOptimizer(
    learning_rate=0.1,
    l1_regularization_strength=1.0,
    l2_regularization_strength=1.0)

model_features = [pclass, sex, sibsp, parch, embarked, age, fare]

m = tf.contrib.learn.LinearClassifier(
    feature_columns=model_features,
    optimizer=ftrl_optimizer,
    model_dir=model_dir)

In [None]:
# Fit the model for 100 steps on the training input, run a single evaluation
# step on the evaluation input, then print every reported metric in key order.
m.fit(input_fn=train_input_fn, steps=100)
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for metric_name, metric_value in sorted(results.items()):
    print(metric_name, metric_value)

In [None]:
# Predict survival for every row of the input CSV.
# raw_data keeps the untouched frame (its PassengerId column is needed for the
# output file); input_data is the feature frame handed to the model.
# NOTE(review): this reads train.csv; a Kaggle submission needs predictions
# for test.csv - confirm which file is intended here.
raw_data = pd.read_csv("../input/train.csv")
input_data = raw_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

# Rows cannot be dropped here: every passenger needs a prediction so that
# `ans` lines up row-for-row with raw_data. Missing values are filled from the
# training frame instead (median for numeric features, most frequent value
# for categorical ones) so that no NaN reaches the model.
for column in continous:
    input_data[column] = input_data[column].fillna(df_train[column].median())
for column in catagorical:
    input_data[column] = input_data[column].fillna(df_train[column].mode()[0])


def predict_input():
    """Feature tensors (no labels) for the rows to predict."""
    return test_input(input_data)


# list() accepts both an array and a lazy iterable of predictions, so the
# len() call below works whichever form predict() returns.
ans = list(m.predict(input_fn=predict_input))
print(len(ans))

In [None]:
# Build the output file: one row per passenger holding PassengerId and the
# predicted Survived value. Selecting the id column (rather than dropping a
# fixed list of other columns) lets this cell be re-run without error.
raw_data = raw_data[['PassengerId']].copy()
raw_data['Survived'] = ans
# index=False keeps the pandas row index out of the file, so it contains
# exactly the two columns PassengerId and Survived.
raw_data.to_csv('out.csv', index=False)
# Accuracy achieved on Kaggle (as reported by the author) = 76.077%