In [96]:
import pandas as pd
# Import the linear regression class
from sklearn.linear_model import LinearRegression, LogisticRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn import cross_validation


import numpy as np


# Load the Titanic training set from the working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")

# First impression of the dataset. describe() summarises only the columns
# holding numerical values, so text columns are absent from the table.
print(data.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [97]:
# Impute missing values: every passenger without a recorded Age receives
# the median of the ages that are present (median() skips missing entries).
median_age = data["Age"].median()
data["Age"] = data["Age"].fillna(median_age)

In [98]:
# Inspect the distinct values of "Sex" before encoding them; the printed
# result shows the column holds only 'male' and 'female'.
print(data["Sex"].unique())

# Swap each text label for a numeric code: male -> 0, female -> 1.
# Rows are rewritten in place, one label at a time, in that order.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    data.loc[data["Sex"] == sex_label, "Sex"] = sex_code

['male' 'female']


In [99]:
# Inspect the distinct values of "Embarked"; the printed result shows the
# ports S, C and Q plus missing entries (nan).
print(data["Embarked"].unique())

# Missing ports are treated as 'S'.
data["Embarked"] = data["Embarked"].fillna("S")

# Swap each port letter for a numeric code: S -> 0, C -> 1, Q -> 2.
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    data.loc[data["Embarked"] == port_label, "Embarked"] = port_code

['S' 'C' 'Q' nan]


In [100]:
# MACHINE LEARNING ALGO 1 - Linear regression
# The columns we'll use to predict the target.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class.
alg = LinearRegression()

# Generate cross-validation folds for the Titanic dataset. Iterating over
# the object yields the row indices of the train and test part of each fold.
# KFold is reached through the imported `cross_validation` module: the bare
# name `KFold` is never imported in this notebook, so using it unqualified
# raises NameError on a fresh kernel.
# random_state is passed so that the splits are the same on every run.
kf = cross_validation.KFold(data.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # Predictors used to train the algorithm: only the rows of the train folds.
    train_predictors = data[predictors].iloc[train, :]
    # Target values for those same rows.
    train_target = data["Survived"].iloc[train]
    # Train the algorithm on the train folds.
    alg.fit(train_predictors, train_target)
    # Predict the held-out fold and keep the result.
    test_predictions = alg.predict(data[predictors].iloc[test, :])
    predictions.append(test_predictions)

# METHOD 1
# The predictions are in three separate numpy arrays (one per fold).
# Concatenate them into one; they only have one axis, so axis 0 is used.
# NOTE(review): the comparison below is positional, so it assumes the folds
# come back in row order (no shuffling is requested) -- confirm if the fold
# generator is ever changed.
predictions = np.concatenate(predictions, axis=0)

# Map the continuous regression outputs to the two possible outcomes.
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

# Accuracy is the share of rows whose predicted class equals the true label.
# The element-wise comparison gives one boolean per passenger and its mean is
# that share. The previous formula summed the *values* of the matching
# predictions, so correctly predicted 0s contributed nothing to the total.
# .mean() also avoids an integer division under Python 2.
accuracy = (predictions == data["Survived"].values).mean()
print("Linear regreesion accuracy " + str(accuracy))

# MACHINE LEARNING ALGO 2 - Logistic regression
# Start from a fresh classifier with a fixed random_state.
alg = LogisticRegression(random_state=1)

# cross_val_score runs the whole fold loop for us (cv=3 folds) and returns
# one score per fold -- much shorter than the manual loop used for the
# linear regression.
features = data[predictors]
target = data["Survived"]
scores = cross_validation.cross_val_score(alg, features, target, cv=3)

# Report the average of the per-fold scores.
print("Logistic Regression accuracy " + str(scores.mean()))

Linear regreesion accuracy 0.783389450056
Logistic Regression accuracy 0.787878787879




In [101]:
# TEST SET
# Apply to the test set the same preprocessing the training set received.
titanic_test = pd.read_csv("test.csv")

# Fill missing ages with the median age of the training set. It is computed
# from `data` instead of being hard-coded as 28.0, so it cannot silently go
# stale when train.csv changes (it evaluates to 28.0 for the current data).
titanic_test["Age"] = titanic_test["Age"].fillna(data["Age"].median())

# Encode sex with the same codes as the training set: male -> 0, female -> 1.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic_test.loc[titanic_test["Sex"] == sex_label, "Sex"] = sex_code

# Missing ports are treated as 'S', then encoded as S -> 0, C -> 1, Q -> 2.
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic_test.loc[titanic_test["Embarked"] == port_label, "Embarked"] = port_code

# Fill missing fares with the median fare of the test set itself.
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())

In [102]:
# Final model: a fresh logistic regression trained on every training row
# (no folds held out this time).
alg = LogisticRegression(random_state=1)
train_features = data[predictors]
train_labels = data["Survived"]
alg.fit(train_features, train_labels)

# Predict survival for the passengers of the test set.
test_features = titanic_test[predictors]
predictions = alg.predict(test_features)

# Assemble the two columns Kaggle wants: the passenger id and the prediction.
submission_columns = {
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions,
}
submission = pd.DataFrame(submission_columns)

In [103]:
# Write the submission to disk; index=False keeps the DataFrame index out of
# the file so that only the PassengerId and Survived columns are written.
submission.to_csv(path_or_buf="linearregreesion_logisticregression.csv", index=False)