[![AnalyticsDojo](https://github.com/rpi-techfundamentals/fall2018-materials/blob/master/fig/final-logo.png?raw=1)](http://rpi.analyticsdojo.com)
<center><h1>Pytorch - Revisiting Titanic</h1></center>
<center><h3><a href = 'http://rpi.analyticsdojo.com'>rpi.analyticsdojo.com</a></h3></center>


In [0]:
!pip install torch torchvision

In [0]:
import numpy as np
import pandas as pd
import pandas as pd

# Read the training and test passenger tables from the course repository.
train, test = (
    pd.read_csv(url)
    for url in (
        'https://raw.githubusercontent.com/rpi-techfundamentals/fall2018-materials/master/input/train.csv',
        'https://raw.githubusercontent.com/rpi-techfundamentals/fall2018-materials/master/input/test.csv',
    )
)



In [0]:
# Preview the first five rows of the training data; as the last expression
# in the cell, the frame is rendered directly below it.
train.head()

In [0]:
# Fix the missing-value problem in Age and Fare: each gap is filled with the
# median of that column, computed separately for the train and test frames.
for frame in (train, test):
    for column in ("Age", "Fare"):
        frame[column] = frame[column].fillna(frame[column].median())

In [0]:
# Passengers with no recorded port of embarkation are assigned "S".
for frame in (train, test):
    frame["Embarked"] = frame["Embarked"].fillna("S")


In [0]:
# New feature: the number of characters in each passenger's name.
for frame in (train, test):
    frame['NameLength'] = frame['Name'].map(len)

In [0]:
# Extract the title from each name: the text between the first comma and the
# period that follows it, with surrounding whitespace removed.
dataset_title = [i.split(',')[1].split('.')[0].strip() for i in train['Name']]
# Assign the list directly so values are matched to rows by position.
# Wrapping it in pd.Series would align on index labels instead, which yields
# NaN/misplaced titles whenever `train` does not carry a default 0..n-1 index.
train['Title'] = dataset_title
train['Title'].value_counts()


In [0]:
# Collapse the infrequent titles into a single 'Rare' class.
rare_titles = [
    'Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
    'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle',
]
train['Title'] = train['Title'].replace(rare_titles, 'Rare')
train['Title'].value_counts()


In [0]:
# Apply the same title extraction and grouping to the test set.
dataset_title = [i.split(',')[1].split('.')[0].strip() for i in test['Name']]
# Assign the list directly so values are matched to rows by position;
# a pd.Series would be aligned on index labels and could introduce NaN
# if `test` does not carry a default 0..n-1 index.
test['Title'] = dataset_title
test['Title'] = test['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')
test['Title'].value_counts()

In [0]:
# Family count: siblings/spouses plus parents/children plus the passenger.
for frame in (train, test):
    frame['FamilyS'] = frame['SibSp'] + frame['Parch'] + 1

## Update

In [0]:
# Turn the numeric family count into a categorical label.
def family(x):
    """Map a family size to a category name.

    Returns 'Single' for sizes below 2, 'Couple' for exactly 2,
    'InterM' for sizes above 2 up to and including 4, and 'Large' otherwise.
    """
    if x < 2:
        return 'Single'
    if x == 2:
        return 'Couple'
    return 'InterM' if x <= 4 else 'Large'
    
# Overwrite the numeric family count with its categorical label in both frames.
for frame in (train, test):
    frame['FamilyS'] = frame['FamilyS'].apply(family)

In [0]:
# Drop the columns that won't be modeled; the same list is used for both frames.
unused_columns = ['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket']
train_min = train.drop(columns=unused_columns)
test_min = test.drop(columns=unused_columns)
train_min


In [0]:
# Convert the frames to NumPy arrays. In train_min the first column is the
# target and the next eight are features; test_min has only the eight features.
Y_train = train_min.iloc[:, 0].to_numpy()
X_train = train_min.iloc[:, 1:9].to_numpy()
X_test = test_min.iloc[:, 0:8].to_numpy()

print(X_test[1], X_train[1])

In [0]:
# Previously we used get dummies (part of pandas).
# Here we first transform string labels to numeric categories.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Positions of the string-valued columns in X_train / X_test.
label_encoded_columns = {1: 'Sex', 4: 'Embarked', 6: 'Title', 7: 'FamilyS'}

# One encoder per column, fitted on the training data only and then reused
# for the test data, so a given string maps to the same integer in both
# arrays. Re-fitting on the test set could assign different codes whenever
# the two sets do not contain exactly the same categories.
# transform() raises ValueError if the test set holds a label never seen
# in training, which surfaces the mismatch instead of hiding it.
label_encoders = {}
for column in label_encoded_columns:
    encoder = LabelEncoder()
    X_train[:, column] = encoder.fit_transform(X_train[:, column])
    X_test[:, column] = encoder.transform(X_test[:, column])
    label_encoders[column] = encoder

X_test.shape

In [0]:
# Count missing values per column in both feature matrices
# (the training matrix first, then the test matrix).
print(pd.DataFrame(X_train).isna().sum())
print(pd.DataFrame(X_test).isna().sum())

In [0]:
# Converting numeric category values to a one-hot representation.
# OneHotEncoder no longer accepts `categorical_features`; ColumnTransformer
# is the supported way to encode a subset of columns.
from sklearn.compose import ColumnTransformer

# Positions of the label-encoded columns that get expanded.
one_hot_columns = [1, 4, 6, 7]
one_hot_encoder = ColumnTransformer(
    transformers=[("one_hot", OneHotEncoder(), one_hot_columns)],
    remainder="passthrough",  # remaining columns are appended after the one-hot block
    sparse_threshold=0,       # always return a dense array
)
# Fit on the training data only and reuse the fitted encoder for the test
# data so both matrices share the same column layout. The stacked result can
# come back with object dtype, so it is cast to float explicitly.
X_train = np.asarray(one_hot_encoder.fit_transform(X_train), dtype=np.float64)
X_test = np.asarray(one_hot_encoder.transform(X_test), dtype=np.float64)

In [0]:
# Sanity check: np.isnan only works on numeric arrays, so this both counts
# missing values and confirms that no non-numeric entries remain.
missing_train = np.isnan(X_train).sum()
missing_test = np.isnan(X_test).sum()
print(missing_train, missing_test)
print(X_train[0])

In [0]:
#Split the data
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation. The fixed random_state
# makes the split (and everything computed from it) repeatable across runs.
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 42)
x_train.shape



In [0]:
# Shape of the validation labels produced by the train/validation split.
y_val.shape

In [0]:
#Define the model 
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Two-layer fully connected classifier that outputs one score per class.

    forward() returns raw, unnormalized scores (logits) of shape (batch, 2).
    nn.CrossEntropyLoss applies log-softmax itself, so no activation is
    applied to the last layer; the predicted class is the argmax of the scores.
    """

    def __init__(self):
        super(Net, self).__init__()
        # 18 must equal the number of columns in the input matrix.
        self.fc1 = nn.Linear(18, 270)
        # 270 is arbitrary, but needs to be consistent. 2 is the number of
        # classes in the output (died/survived).
        self.fc2 = nn.Linear(270, 2)

    def forward(self, x):
        """Compute class scores for a float tensor of shape (batch, 18)."""
        x = self.fc1(x)
        # F.dropout defaults to training=True; tie it to the module's mode so
        # dropout is switched off after net.eval().
        x = F.dropout(x, p=0.1, training=self.training)
        x = F.relu(x)
        return self.fc2(x)

net = Net()

In [0]:
#Define training
batch_size = 50
num_epochs = 100
learning_rate = 0.01
# Number of mini-batches per epoch, rounded up so that the trailing partial
# batch is included. Floor division would skip those rows every epoch and
# yield zero batches when there are fewer rows than batch_size.
batch_no = (len(x_train) + batch_size - 1) // batch_size

In [0]:
# Define the loss function and the optimizer.
# Cross-entropy is the loss for the two-class output; Adam is a variant of
# gradient descent with per-parameter adaptive step sizes.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

In [0]:
from sklearn.utils import shuffle
from torch.autograd import Variable  # no longer needed here; kept so other cells that reference it still run

# Make sure layers that behave differently during training are in training mode.
net.train()

running_loss = 0.0
for epoch in range(num_epochs):
    # Reshuffle every epoch so the mini-batches differ between epochs.
    x_train, y_train = shuffle(x_train, y_train)
    # Mini batch learning
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        # Plain tensors are sufficient; the Variable wrapper is deprecated
        # and simply returns the tensor it is given.
        inputs = torch.FloatTensor(x_train[start:end])
        labels = torch.LongTensor(y_train[start:end])

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate the per-batch loss for this epoch
        running_loss += loss.item()

    # The reported number is the sum of the batch losses over the epoch.
    print('Epoch {}'.format(epoch+1), "loss: ",running_loss)
    running_loss = 0.0

        
        

In [0]:
# Helper that runs the global `net` on a feature matrix and either reports
# accuracy (when labels are given) or returns the predicted classes.
def calculate_accuracy(x, y=None):
    """Predict classes for `x` with the global `net`.

    Args:
        x: array-like of feature rows, convertible to a float tensor.
        y: optional array-like of true class labels, one per row of `x`.

    Returns:
        None when non-empty labels are given (the accuracy is printed);
        otherwise a NumPy array holding the predicted class index per row.
    """
    features = torch.as_tensor(x, dtype=torch.float32)
    # Run the forward pass in eval mode, then restore whatever mode the
    # model was in so a later training run is not affected.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            result = net(features)  # one score per class for every row
    finally:
        net.train(was_training)
    # The predicted class is the index of the largest score in each row.
    values, labels = torch.max(result, 1)
    predicted = labels.numpy()
    if y is not None and len(y) != 0:
        num_right = np.sum(predicted == y)
        print('Accuracy {:.2f}'.format(num_right / len(y)), "for a total of ", len(y), "records")
    else:
        print("returning predictions")
        return predicted
 
  


In [0]:
# Report accuracy on the training and validation splits, then predict the
# unlabelled test set and show the predicted classes.
for features, targets in ((x_train, y_train), (x_val, y_val)):
    calculate_accuracy(features, targets)
predictions = calculate_accuracy(X_test)
predictions

In [0]:
# Writing to file: pair each test PassengerId with its predicted class and
# save the two columns, without the index, as submission.csv.
submission = test[['PassengerId']].assign(Survived=predictions)

submission.to_csv('submission.csv', index=False)


 

In [0]:
# Offer submission.csv as a browser download when running on Google Colab.
# The google.colab package only exists there, so the import is guarded to
# keep the notebook runnable end to end in other environments.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping the browser download of submission.csv")
else:
    files.download('submission.csv')

In [0]:
# Evaluate the model on the validation split.
# No gradients are needed for evaluation, so a plain tensor is used instead
# of the deprecated Variable(..., requires_grad=True) wrapper.
test_var = torch.FloatTensor(x_val)
# Run the forward pass in eval mode and restore the previous mode afterwards.
was_training = net.training
net.eval()
try:
    with torch.no_grad():
        result = net(test_var)
finally:
    net.train(was_training)
# The predicted class is the index of the largest score in each row.
values, labels = torch.max(result, 1)
num_right = np.sum(labels.numpy() == y_val)
print('Accuracy {:.2f}'.format(num_right / len(y_val)))