In [96]:
#Loading Datasets
import pandas as pd
titanic = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

In [97]:
#Because the cleanup is used on a lot of csv's
def csv_cleanup(csv):
    #Filling missing data with the most common one in that dataset
    csv.Age = csv.Age.fillna(csv.Age.median())
    csv.Fare = csv.Fare.fillna(csv.Fare.median())
    #Setting survey results like male/female/s/c/q with numbers for better calculation
    csv.loc[csv.Sex == "female", "Sex"] = 1
    csv.loc[csv.Sex == "male", "Sex"] = 0
    csv.Embarked = csv.Embarked.fillna("S")
    csv.loc[csv.Embarked == "S", "Embarked"] = 0
    csv.loc[csv.Embarked == "C", "Embarked"] = 1
    csv.loc[csv.Embarked == "Q", "Embarked"] = 2
    return csv
    
titanic = csv_cleanup(titanic)
titanic_test = csv_cleanup(titanic_test)

In the next step, I will be using Linear Regression to train the dataset.

In [98]:
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class
alg = LinearRegression()
# Generate cross validation folds for the titanic dataset.  It return the row indices corresponding to train and test.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using the train the algorithm.  Note how we only take the rows in the train folds.
    train_predictors = (titanic[predictors].iloc[train,:])
    # The target we're using to train the algorithm.
    train_target = titanic["Survived"].iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)
    

Now, I will be using Logistic Regression where the values are mapped between 0 and 1

In [99]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

0.787878787879


The exported file needs to submitted to the competition.

In [100]:
# Train the algorithm using all the training data
alg.fit(titanic[predictors], titanic["Survived"])

# Make predictions using the test set.
predictions = alg.predict(titanic_test[predictors])

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle1.csv", index=False)

Kaggle Score of 0.75120

#### Improvement 1

I am thinking about combining Siblings/Spouses on board with Parents/Children on board. And then multiplying that "Family" column with the average Fare to calculate the total money spent by the family (although I am not sure if using mean Fare is the best way to go). But let's just try it.

In [101]:
def Imp1(data):
    data["Family"] = data["SibSp"]+data["Parch"]
    data["Improve1"] = data["Family"]*data["Fare"]

#Run the above func for train & testing data
Imp1(titanic)
Imp1(titanic_test)

In [102]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

predictors = ["Pclass", "Sex", "Age", "Improve1", "Embarked"]

alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

0.791245791246


In [103]:
# Train the algorithm using all the training data
alg.fit(titanic[predictors], titanic["Survived"])

# Make predictions using the test set.
predictions = alg.predict(titanic_test[predictors])

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle2.csv", index=False)

Kaggle Score: 0.75120
Not an improvement. I wonder if I think more about how each variable effects the outcome. Maybe using Fare/Family?
Let's try

#### Improvement 2

In [104]:
def Imp2(data):
    data["Family"] = data["SibSp"]+data["Parch"]
    data["Improve2"] = data["Family"]/data["Fare"]
    #there are some families with 0 as its value for parch/sibsp
    data.Improve2 = data.Improve2.fillna(data.Improve2.median())

#Run the above func for train & testing data
Imp2(titanic)
Imp2(titanic_test)

In [105]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

predictors = ["Pclass", "Sex", "Age", "Improve2", "Embarked"]

alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

0.786756453423


In [106]:
# Train the algorithm using all the training data
alg.fit(titanic[predictors], titanic["Survived"])

# Make predictions using the test set.
predictions = alg.predict(titanic_test[predictors])

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle3.csv", index=False)

Same Kaggle score as before. Is that even possible?? 

#### Improvement 3

I wonder if I include a column of women in higher Pclass effect the outcome. Women_H are women form Class 1 and Class 2 respectively (giving them the value 1) whereas women from Class 3 are given value 0. Convert all the women in higher class to 1 and lower to 0.

In [107]:
def Imp3(data):
    data.loc[data["Pclass"] == 1, "Pclass"] = 0 
    data.loc[data["Pclass"] == 2, "Pclass"] = 1
    data.loc[data["Pclass"] == 3, "Pclass"] = 1 
    #
    #Now create a column which basically is 1 for higher class women
    data["Improve3"] = data["Pclass"]*data["Sex"]
    
#Run the above func for train & testing data
Imp3(titanic)
Imp3(titanic_test)

In [108]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

predictors = ["Pclass", "Sex", "Age", "Improve3", "Embarked"]

alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

0.773288439955


In [109]:
# Train the algorithm using all the training data
alg.fit(titanic[predictors], titanic["Survived"])

# Make predictions using the test set.
predictions = alg.predict(titanic_test[predictors])

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

submission.to_csv("kaggle4.csv", index=False)

It will be really funny if I get the same score.

Kaggle Score 0.76555 (Not the same score)
Improved it by 0.01435