### Logistic Regression

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #for plotting the data 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn import metrics



%matplotlib inline

In [None]:
# Read the Titanic passenger records from the CSV at `titanic_path`
# (a path relative to the notebook's working directory) into `train_data`.
titanic_path = '../input/testtitanic/titanic_data.csv'
train_data = pd.read_csv(titanic_path)
# NOTE(review): no separate test file is loaded here; the hold-out set is
# carved out of `train_data` with train_test_split later in the notebook.

In [None]:
train_data.head()  # Preview the first rows to check column names and sample values

In [None]:
print(train_data.shape) # (number of rows, number of columns) of the loaded frame

In [None]:
train_data.describe() # Summary statistics (count, mean, std, quartiles) for the numeric columns

In [None]:
train_data.isnull().sum()  # Number of missing (NaN) values in each column

## Missing Value Treatment

In [None]:
# Share of rows with no recorded "Age", expressed as a percentage.
age_missing_count = train_data['Age'].isnull().sum()
age_missing_pct = (age_missing_count / train_data.shape[0]) * 100
print('Percent of missing "Age" records is %.2f%%' % age_missing_pct)

~20% of entries for passenger age are missing. Let's see what the 'Age' variable looks like in general.

In [None]:
# Distribution of passenger age: a density-normalised histogram followed by
# a kernel-density curve, both in teal.
age_values = train_data["Age"]
ax = age_values.hist(bins=15, density=True, stacked=True, color='teal', alpha=0.6)
age_values.plot(kind='density', color='teal')
ax.set_xlabel('Age')
plt.xlim(-10, 85)
plt.show()

Since "Age" is (right) skewed, using the mean might give us biased results by filling in ages that are older than desired. To deal with this, we'll use the median to impute the missing values.

In [None]:
# Compare the two candidate imputation values for "Age"; missing entries are skipped.
age_mean = train_data["Age"].mean(skipna=True)
age_median = train_data["Age"].median(skipna=True)
print('The mean of "Age" is %.2f' % age_mean)
print('The median of "Age" is %.2f' % age_median)

In [None]:
# Share of rows with no recorded "Cabin", expressed as a percentage.
cabin_missing_count = train_data['Cabin'].isnull().sum()
cabin_missing_pct = (cabin_missing_count / train_data.shape[0]) * 100
print('Percent of missing "Cabin" records is %.2f%%' % cabin_missing_pct)

77% of records are missing, which means that imputing information and using this variable for prediction is probably not wise.

We'll ignore this variable in our model.

In [None]:
# Passenger counts per port of embarkation: first as a table, then as a count plot.
embarked_counts = train_data['Embarked'].value_counts()
print('Boarded passengers grouped by port of embarkation (C = Cherbourg, Q = Queenstown,S = Southampton):')
print(embarked_counts)
sns.countplot(
    x='Embarked',
    data=train_data,
    palette='Set2',
)
plt.show()

There are only 2 (0.22%) missing values for "Embarked", so we can just impute with the port where most people boarded.

In [None]:
# Impute the missing values in one non-chained step.
# The previous `train_data.<col>.fillna(..., inplace=True)` form mutates an
# intermediate Series: recent pandas emits a chained-assignment warning for it
# and, with Copy-on-Write enabled, the parent DataFrame is left unchanged.
# Re-binding `train_data` to the filled frame works on every pandas version and
# keeps the cell safe to re-run.
fill_values = {
    'Cabin': 'U',                       # placeholder label for a missing cabin
    'Embarked': 'S',                    # Southampton, the port the notebook imputes with
    'Age': train_data['Age'].median(),  # median rather than mean, as argued in the text
}
train_data = train_data.fillna(value=fill_values)

In [None]:
train_data.isnull().sum()  # Re-check after imputation: Age, Cabin and Embarked should now report 0

## Data Exploration

### 1. Sex versus Survival

In [None]:
# Passenger counts by sex: overall, among survivors, and among those who died.
survived_flag = train_data['Survived']
total = train_data['Sex'].value_counts()
survived_sex = train_data.loc[survived_flag == 1, 'Sex'].value_counts()
died_sex = train_data.loc[survived_flag == 0, 'Sex'].value_counts()

# One row per group, one column per sex.
df = pd.DataFrame([total, survived_sex, died_sex])
df.index = ['Total', 'Survived', 'Died']
print(df)
df.plot(kind='bar')

### 2. Age versus Survival

In [None]:
# Age histograms for survivors (green) and non-survivors (red) on shared bins.
ages_survived = train_data.loc[train_data['Survived'] == 1, 'Age']
ages_died = train_data.loc[train_data['Survived'] == 0, 'Age']

figure = plt.figure(figsize=(15, 8))
plt.hist(
    [ages_survived, ages_died],
    color=['g', 'r'],
    bins=10,
    label=['Survived', 'Dead'],
)
plt.xlabel('Age')
plt.ylabel('Number of passengers')
plt.legend()

### 3. Class versus Survival

In [None]:
# Survival counts (0 = died, 1 = survived) for each passenger class.
survived_1 = train_data[train_data['Pclass']==1]['Survived'].value_counts()
survived_2 = train_data[train_data['Pclass']==2]['Survived'].value_counts()
survived_3 = train_data[train_data['Pclass']==3]['Survived'].value_counts()

# One row per class; the columns are still the raw 0/1 outcome codes here.
df = pd.DataFrame([survived_1,survived_2,survived_3])
df['total']=df[0]+df[1]
df.index = ['1st class','2nd class','3rd class']
# rename() returns a new frame. The result used to be discarded, so the table
# and the legend kept showing the bare 0/1 codes; keep the renamed frame.
df = df.rename(columns={1:'Survived',0:'Died'})
print (df)
# Legend entries are taken from the (now readable) column names, so no explicit
# `label` argument is passed to plot().
df.plot(kind='bar')

### 4. Fare versus Survival

In [None]:
# Fare histograms for survivors and non-survivors on shared bins.
fares_survived = train_data.loc[train_data['Survived'] == 1, 'Fare']
fares_died = train_data.loc[train_data['Survived'] == 0, 'Fare']

figure = plt.figure(figsize=(15, 8))
plt.hist(
    [fares_survived, fares_died],
    bins=10,
    label=['Survived', 'Died'],
)
plt.xlabel('Fare')
plt.ylabel('No. of People')
plt.legend()

### 5. Fare and Age versus Survival

In [None]:
# Scatter of age against fare, with each point coloured by the 0/1 "Survived" value.
train_data.plot.scatter(
    x='Age',
    y='Fare',
    c='Survived',
    colormap='jet',
    alpha=0.8,
    figsize=(15, 8),
)

### 6. Port of Embarkation versus Survival

In [None]:
# Counts per embarkation port, split by outcome, drawn as stacked bars.
outcome = train_data['Survived']
survived_embarkment = train_data.loc[outcome == 1, 'Embarked'].value_counts()
died_embarkment = train_data.loc[outcome == 0, 'Embarked'].value_counts()

# One row per outcome, one column per port.
df = pd.DataFrame([survived_embarkment, died_embarkment])
df.index = ['survived', 'died']
df.plot(kind='bar', stacked=True)

## Encoding the value for Logistic Regression

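Logistic regression needs numeric inputs, so the categorical predictors are one-hot encoded (each category becomes its own 0/1 indicator column).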

In [None]:
# Predictor columns for the model.
# "Cabin" is intentionally left out: the missing-value analysis earlier in the
# notebook concludes it should be ignored, and one-hot encoding the raw cabin
# strings would add one sparse indicator column per distinct cabin value.
data_set = train_data[['Pclass','Sex','Age','Fare','SibSp']]
# get_dummies replaces each non-numeric column (presumably only "Sex" here —
# confirm against the dtypes) with 0/1 indicator columns and passes the
# numeric columns through unchanged.
one_hot_encoded_training_predictors = pd.get_dummies(data_set)

In [None]:
one_hot_encoded_training_predictors.head()  # Preview the encoded predictors and their indicator columns

In [None]:
# Feature matrix (encoded predictors) and target vector (survival outcome).
X, y = one_hot_encoded_training_predictors, train_data['Survived']

In [None]:
# Split the predictors and target into training and hold-out sets.
# NOTE(review): test_size=0.65 holds out 65% of the rows, so the model is
# trained on only 35% of the data — confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.65,
    random_state=1,  # fixed seed so the split is reproducible
)

## Building Model

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised above the library default because the features are not
# scaled, which can keep the solver from converging within the default number
# of iterations.
logreg = LogisticRegression(max_iter=1000)
# The stray trailing comma after fit() was removed: it wrapped the result in a
# one-element tuple, so the cell displayed `(LogisticRegression(),)`.
logreg.fit(X_train, y_train)

In [None]:
# Predict on the hold-out set once and score those predictions.
# Previously `y_pred` was computed but never used, while `logreg.score` ran a
# second prediction internally; accuracy_score on `y_pred` gives the same
# mean accuracy from a single prediction pass.
y_pred = logreg.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))