In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
titanic = pd.read_csv("titanic_train.csv")

In [31]:
embark = pd.get_dummies(titanic['Embarked'],drop_first=False,columns=['Q','S',"C"])

In [32]:
embark.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [33]:
titanic['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [34]:
titanic['Name'].value_counts()

Bailey, Mr. Percy Andrew             1
Goodwin, Miss. Lillian Amy           1
Isham, Miss. Ann Elizabeth           1
Peter, Miss. Anna                    1
O'Brien, Mr. Thomas                  1
                                    ..
Sage, Miss. Dorothy Edith "Dolly"    1
Hamalainen, Master. Viljo            1
Hickman, Mr. Lewis                   1
Pengelly, Mr. Frederick William      1
Smiljanic, Mr. Mile                  1
Name: Name, Length: 891, dtype: int64

In [35]:
def new_name(name):
    start = 0
    end=0
    for x in range(len(name)):
        if name[x]==',':
            start=x
        if name[x]=='.':
            end=x
    return name[start+2:end]

titanic['Name']=titanic['Name'].apply(lambda x: new_name(x))

In [36]:
titanic['Name'].value_counts()

Mr                          517
Miss                        182
Mrs                         124
Master                       40
Dr                            7
Rev                           6
Major                         2
Mlle                          2
Col                           2
Mrs. Martin (Elizabeth L      1
Lady                          1
Don                           1
Mme                           1
the Countess                  1
Jonkheer                      1
Sir                           1
Ms                            1
Capt                          1
Name: Name, dtype: int64

In [37]:
greetings = pd.get_dummies(titanic['Name'])
greetings.head()


Unnamed: 0,Capt,Col,Don,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Mrs. Martin (Elizabeth L,Ms,Rev,Sir,the Countess
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [38]:
titanic.drop('Name',axis=1,inplace=True)

In [39]:
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):

        if Pclass == 1:
            return 37

        elif Pclass == 2:
            return 29

        else:
            return 24

    else:
        return Age

In [40]:
titanic['Age'] = titanic[['Age','Pclass']].apply(impute_age,axis=1)

In [41]:
titanic.drop('Cabin',axis=1,inplace=True)

In [42]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,female,35.0,1,0,113803,53.1,S
4,5,0,3,male,35.0,0,0,373450,8.05,S


In [43]:
sex = pd.get_dummies(titanic['Sex'],drop_first=True)
embark = pd.get_dummies(titanic['Embarked'],drop_first=False)


In [44]:
titanic.drop(['Sex','Embarked'],axis=1,inplace=True)

In [45]:
titanic = pd.concat([titanic,sex,embark,greetings],axis=1)

In [46]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,male,C,...,Miss,Mlle,Mme,Mr,Mrs,Mrs. Martin (Elizabeth L,Ms,Rev,Sir,the Countess
0,1,0,3,22.0,1,0,A/5 21171,7.25,1,0,...,0,0,0,1,0,0,0,0,0,0
1,2,1,1,38.0,1,0,PC 17599,71.2833,0,1,...,0,0,0,0,1,0,0,0,0,0
2,3,1,3,26.0,0,0,STON/O2. 3101282,7.925,0,0,...,1,0,0,0,0,0,0,0,0,0
3,4,1,1,35.0,1,0,113803,53.1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,0,3,35.0,0,0,373450,8.05,1,0,...,0,0,0,1,0,0,0,0,0,0


In [47]:
titanic.drop(['Ticket'],axis=1,inplace=True)

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
X_train, X_test, y_train, y_test = train_test_split(titanic.drop('Survived',axis=1), 
                                                    titanic['Survived'], test_size=0.30, 
                                                    random_state=101)

In [50]:
from sklearn.linear_model import LogisticRegression

In [56]:
logmodel = LogisticRegression(max_iter=2000)
logmodel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
predictions = logmodel.predict(X_test)

In [58]:
from sklearn.metrics import classification_report

In [59]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       154
           1       0.85      0.73      0.78       114

    accuracy                           0.83       268
   macro avg       0.83      0.82      0.82       268
weighted avg       0.83      0.83      0.83       268

