# Logistic Regression 
### We will be using Titanic Dataset from kaggle, classifying whether passenger survived or not

In [1]:
#importing required dependencies
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression



In [2]:
titanic_train = pd.read_csv('train.csv')

In [3]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# check coloumn data types
titanic_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
# we will use only some coloumns for our classification
#Lets begin with converting Cabin column to string, and take first letter and assign as cabin

char_cabin = titanic_train["Cabin"].astype(str)     

new_Cabin = np.array([cabin[0] for cabin in char_cabin]) 

titanic_train["Cabin"] = pd.Categorical(new_Cabin)  

titanic_train.head()     #see now cabin column became some sort of categorical variable

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,n,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,n,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,n,S


In [6]:
# We also deal with age coloumn,so check whether age coloumn values are null, if null replace with medium age say 27 or 28 


new_age_var = np.where(titanic_train["Age"].isnull(), 
                       28,                       
                       titanic_train["Age"])     

titanic_train["Age"] = new_age_var 

Now we are ready to use logistic regression model to predict survival. The scikit-learn library has a logistic regression function in linear_model subfolder.  
First we will make a model using only sex variable

In [7]:
# Conver sex varaible to categorical variable

label_encoder = LabelEncoder()

sex_encoded = label_encoder.fit_transform(titanic_train['Sex'])

In [13]:
# Initialize logistic regression model
log_model = LogisticRegression()

# Train the model
log_model.fit(X = pd.DataFrame(sex_encoded), 
              y = titanic_train["Survived"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# lets do prediction on test

preds = log_model.predict_proba(X= pd.DataFrame(sex_encoded))     #predict_proba to get predicted class probabilities
preds = pd.DataFrame(preds)
preds.columns = ["Death_prob", "Survival_prob"]

# Generate table of predictions vs Sex
pd.crosstab(titanic_train["Sex"], preds.ix[:, "Survival_prob"])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Survival_prob,0.19312542897248655,0.7311133823315542
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0,314
male,577,0


#### The table shows that for male survival chance is 19% and for female it is 73%.....Now let us include some more features and build the model again

In [15]:
# we will include Pclass,cabin and also age along with sex as our new features. 
#As Pclass and cabin are categorical we need to change them to numerical
encoded_class = label_encoder.fit_transform(titanic_train["Pclass"])
encoded_cabin = label_encoder.fit_transform(titanic_train["Cabin"])

#overall training_features
train_features = pd.DataFrame([encoded_class,
                              encoded_cabin,
                              sex_encoded,
                              titanic_train["Age"]]).T

# Initialize logistic regression model
log_model = LogisticRegression()

# Train the model
log_model.fit(X = train_features ,
              y = titanic_train["Survived"])


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
# Make predictions
preds = log_model.predict(X= train_features)

# Generate table of predictions vs actual
pd.crosstab(preds,titanic_train["Survived"])

Survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,463,98
1,86,244



#### The table above shows the classes our model predicted vs. true values of the Survived variable. This table of predicted vs. actual values is known as a confusion matrix.

We can calculate the overall prediction accuracy from the matrix by adding the total number of correct predictions and dividing by the total number of predictions. In the case of our model, the prediction accuracy is

In [17]:
(467+237)/889


0.7919010123734533

Often accuracy is not best metric for assessing a model it is a good idea to consider accuracy as well as sensitivity and precision when assessing model performance. 

In [18]:
#Confusion matrix
confusion_matrix(y_true=titanic_train['Survived'],y_pred=preds)

array([[463,  86],
       [ 98, 244]])

In [19]:
# summary of common classification metrics
print(classification_report(y_true=titanic_train["Survived"],
                              y_pred=preds) )

             precision    recall  f1-score   support

          0       0.83      0.84      0.83       549
          1       0.74      0.71      0.73       342

avg / total       0.79      0.79      0.79       891



### Now let's implement our model on test data

In [20]:
titanic_test = pd.read_csv("test.csv")

char_cabin = titanic_test["Cabin"].astype(str)     

new_Cabin = np.array([cabin[0] for cabin in char_cabin]) 

titanic_test["Cabin"] = pd.Categorical(new_Cabin)  

new_age_var = np.where(titanic_test["Age"].isnull(), 
                       28,                       
                       titanic_test["Age"])      

titanic_test["Age"] = new_age_var 

In [21]:
# Convert test variables to match model features
encoded_sex = label_encoder.fit_transform(titanic_test["Sex"])
encoded_class = label_encoder.fit_transform(titanic_test["Pclass"])
encoded_cabin = label_encoder.fit_transform(titanic_test["Cabin"])

test_features = pd.DataFrame([encoded_class,
                              encoded_cabin,
                              encoded_sex,
                              titanic_test["Age"]]).T

In [22]:
test_preds = log_model.predict(X=test_features)


result = pd.DataFrame({"PassengerId":titanic_test["PassengerId"],
                           "Survived":test_preds})

In [23]:
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


#### Create the model including other features also and see the accuracy and results