# Logistic Regression

### Titanic dataset
### Predicting whether survived or not


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [None]:
# Read the Titanic training data into a DataFrame and preview the first rows.
train = pd.read_csv(filepath_or_buffer="../input/titanicdataset-traincsv/train.csv")
train.head()

In [None]:
train.info()

In [None]:
train.describe().T

In [None]:
train.shape

In [None]:
# Plot the null mask of the frame as a heatmap to see which columns
# contain missing values.
sns.heatmap(
    train.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

#### We can conclude here that the Age and Cabin columns have null values, which need to be taken care of before feeding the data to the model.

In [None]:
# Count survivors vs. non-survivors, with the bars split by Sex.
sns.countplot(data=train, x='Survived', hue='Sex', palette="Set1")

#### From the above figure, it is clear that most of the people who survived (1) are female and the majority of the people who did not survive (0) are male.

In [None]:
# Count survivors vs. non-survivors, with the bars split by passenger class.
sns.countplot(data=train, x='Survived', hue='Pclass', palette="Set1")

#### From the above figure, it is clear that the people who survived (1) are from the 1st passenger class and the people who did not survive are mainly from the 3rd passenger class.

In [None]:
# Histogram of the known ages (missing values dropped first), in 30 bins.
# histplot replaces distplot, which seaborn deprecated and later removed;
# kde=False keeps the plot a plain count histogram as before.
sns.histplot(train['Age'].dropna(), kde=False, bins=30)

#### It is clear from the above figure that there are a lot of children aged 0 to 10 on the ship and, apart from this, the average age is around 25-30.

In [None]:
# Number of passengers for each sibling/spouse count.
sns.countplot(data=train, x='SibSp')

#### As seen in the above figure, most of the people onboard are alone (i.e. 0 siblings/spouses), and the others are mostly those who have 1 sibling/spouse; most probably they are couples.


In [None]:
# Age distribution per passenger class, drawn on a 10x7 figure.
plt.figure(figsize=(10, 7))
sns.boxplot(data=train, x='Pclass', y='Age')

In [None]:
# Mean age of first-class passengers, rounded to a whole number.
avg_age_first_class = np.round(train.loc[train['Pclass'] == 1, 'Age'].mean())
avg_age_first_class

In [None]:
# Mean age of second-class passengers, rounded to a whole number.
avg_age_second_class = np.round(train.loc[train['Pclass'] == 2, 'Age'].mean())
avg_age_second_class

In [None]:
# Mean age of third-class passengers, rounded to a whole number.
avg_age_third_class = np.round(train.loc[train['Pclass'] == 3, 'Age'].mean())
avg_age_third_class

#### Since we have many empty values in the Age column, we will replace every null value in Age with the average age for that passenger's Pclass.

#### Also, from the above boxplot we can see that 1st class passengers have an average age of around 38, 2nd class passengers around 29, and 3rd class passengers around 25. This is because people need to have accumulated enough wealth to afford 1st class tickets, so the average age there is higher.

In [None]:
def impute_age(cols):
    """Return a passenger's age, filling a missing value by passenger class.

    Parameters
    ----------
    cols : pandas.Series
        One row holding the age in the first position and the passenger
        class in the second, e.g. a row of ``train[['Age', 'Pclass']]``
        passed in by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        The original age when it is present. Otherwise the rounded mean
        age of the passenger's class, read from the module-level
        ``avg_age_*_class`` values; any class other than 1 or 2 gets the
        third-class mean.
    """
    # Explicit positional access: plain ``cols[0]`` on a label-indexed row
    # relies on a positional fallback that pandas 2.x deprecates.
    age = cols.iloc[0]
    pclass = cols.iloc[1]

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return avg_age_first_class
    if pclass == 2:
        return avg_age_second_class
    return avg_age_third_class

In [None]:
# Run impute_age on every row of the (Age, Pclass) pair and store the
# result back into the Age column.
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis='columns')

In [None]:
# Plot the null mask again now that Age has been imputed.
sns.heatmap(
    train.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

#### We have filled the null values in the Age column using our impute_age function. The Cabin column has a large number of null values, so it is better to drop it.

In [None]:
# Remove the Cabin column from the frame in place.
train.drop(columns='Cabin', inplace=True)

In [None]:
train.info()

In [None]:
# Plot the null mask once more after dropping the Cabin column.
sns.heatmap(
    train.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# One-hot encode Sex; drop_first removes the first category's column so the
# remaining indicator is not redundant.
sex = pd.get_dummies(data=train['Sex'], drop_first=True)

In [None]:
# One-hot encode Embarked, again dropping the first category's column.
embark = pd.get_dummies(data=train['Embarked'], drop_first=True)

In [None]:
# Append the Sex and Embarked indicator columns to the right of the frame.
train = pd.concat([train, sex, embark], axis='columns')

In [None]:
train.head()

In [None]:
# Drop the raw Sex and Embarked columns (now represented by the indicator
# columns) together with the Name and Ticket columns.
train.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'], inplace=True)

In [None]:
train.head()

In [None]:
train.tail()

In [None]:
# Drop the PassengerId column in place.
train.drop(columns=['PassengerId'], inplace=True)

In [None]:
train.head()

In [None]:
# Separate the target (Survived) from the predictor columns.
y = train['Survived']
X = train.drop(columns='Survived')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# The features are never scaled in this notebook, so the default cap of 100
# solver iterations can stop the optimiser before it converges. Raise the
# cap; all other settings stay at their defaults.
logmodel = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split.
logmodel.fit(X=X_train, y=y_train)

In [None]:
# Predict Survived for the held-out rows.
predictions = logmodel.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Print the per-class metrics comparing the held-out labels with the
# model's predictions.
print(classification_report(y_true=y_test, y_pred=predictions))