In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the training and test CSV files from the ../input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the numeric columns. `describe` has to be *called*;
# the bare attribute only displays the bound-method repr.
train.describe()

In [None]:
# Dimensions (rows, columns) of the training frame.
train.shape

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Heatmap of the boolean missing-value mask of the training frame.
sns.heatmap(train.isnull(), cbar = False, cmap = 'viridis')

In [None]:
# Heatmap of the boolean missing-value mask of the test frame.
sns.heatmap(test.isnull(), cbar = False, cmap = 'viridis')  # test set

In [None]:
# Percentage of training rows whose Age is missing.
train['Age'].isnull().sum() / len(train) * 100

In [None]:
# Age distribution: normalised histogram with a kernel-density curve drawn on top.
hist_kwargs = dict(bins=30, density=True, stacked=True, color='magenta', alpha=0.7, figsize=(16, 5))
ax = train['Age'].hist(**hist_kwargs)
train['Age'].plot(kind='density')
ax.set_xlabel('Age')
plt.show()

In [None]:
# How age affected survival, shown separately for women and men.
# Plain Axes.hist replaces the deprecated sns.distplot(..., kde=False), and a
# loop replaces the four copy-pasted plotting calls.

Survived = 'Survived'            # column name, also used as the legend label
not_survived = 'did not survive'

fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 4))
women = train[train['Sex'] == 'female']
men = train[train['Sex'] == 'male']

for ax, group, title in zip(axes, (women, men), ('Female Survival', 'Male Survival')):
    # (value of the Survived column, number of bins, legend label);
    # the bin counts are the ones the original calls used.
    for outcome, n_bins, label in ((1, 18, Survived), (0, 40, not_survived)):
        ages = group.loc[group[Survived] == outcome, 'Age'].dropna()
        # alpha keeps the two overlaid histograms readable.
        ax.hist(ages, bins = n_bins, alpha = 0.4, label = label)
    ax.set_xlabel('Age')
    ax.set_ylabel('Passengers')
    ax.legend()
    ax.set_title(title)

In [None]:
# Number of training rows for each value of Sex.
train.Sex.value_counts()

In [None]:
# Box plot of Age for each passenger class (training set).
sns.catplot(x = 'Pclass', y = 'Age', data = train, kind = 'box')

In [None]:
# Box plot of Fare for each passenger class (training set).
sns.catplot(x = 'Pclass', y = 'Fare', data = train, kind = 'box')

In [None]:
train[train['Pclass'] == 1]['Age'].mean()  # mean Age of 1st-class passengers (training set)

In [None]:
train[train['Pclass'] == 2]['Age'].mean()  # mean Age of 2nd-class passengers (training set)

In [None]:
train[train['Pclass'] == 3]['Age'].mean()  # mean Age of 3rd-class passengers (training set)

In [None]:
# Mean Age per passenger class in the test set, printed for classes 1, 2 and 3 in turn.
for pclass in (1, 2, 3):
    print(test[test['Pclass'] == pclass]['Age'].mean())

In [None]:
# Fill missing ages with the mean Age of the passenger's Pclass.

def impute_Age(cols):
    """Return the age for one row, imputing a missing value from the class mean.

    Reads the notebook-level ``train`` frame at call time to compute the mean.

    Parameters
    ----------
    cols : sequence-like
        The pair ``(Age, Pclass)`` for one row, e.g. a row of
        ``df[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        ``Age`` unchanged when it is not null; otherwise the mean ``Age`` of
        the rows in ``train`` whose ``Pclass`` equals the row's ``Pclass``
        (NaN when ``train`` holds no ages for that class).
    """
    # Unpack by position without `cols[0]`: integer-key lookup on a labelled
    # Series is deprecated in pandas.
    Age, Pclass = list(cols)[:2]

    if not pd.isnull(Age):
        return Age

    # Use the mean of the row's *own* class. The previous if/elif chain
    # returned the class-2 mean for class 3 and None for any other class.
    return train.loc[train['Pclass'] == Pclass, 'Age'].mean()

In [None]:
# Apply impute_Age to every (Age, Pclass) row of the training frame and store the result as Age.
train['Age'] = train[['Age', 'Pclass']].apply(impute_Age, axis = 1)

In [None]:
# Re-draw the missing-value heatmap of the training frame.
sns.heatmap(train.isnull(), cbar = False, cmap = 'viridis')

In [None]:
# Apply impute_Age to every (Age, Pclass) row of the test frame and store the result as Age.
test['Age'] = test[['Age', 'Pclass']].apply(impute_Age, axis = 1)

In [None]:
# Re-draw the missing-value heatmap of the test frame.
sns.heatmap(test.isnull(), cbar = False, cmap = 'viridis')

In [None]:
# Survival rate by passenger class and sex, one facet row per embarkation port.
f = sns.FacetGrid(train, row = 'Embarked', height = 2.5, aspect = 3)
# Fix the category orders explicitly. Each facet is drawn from its own subset
# of the data, so leaving `order` / `hue_order` as None lets facets place
# classes and sexes inconsistently.
f.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', order = [1, 2, 3], hue_order = ['male', 'female'])
f.add_legend()

In [None]:
# Number of missing Embarked values in the training frame.
train.Embarked.isnull().sum()

In [None]:
# Number of training rows for each Embarked value.
train['Embarked'].value_counts()

In [None]:
# Value used to fill missing Embarked entries.
common_value = 'S'
# Assign the filled column back. fillna(inplace=True) on a column selection
# acts on an intermediate object and is not guaranteed to update `train`
# (pandas warns about it, and it is a no-op under Copy-on-Write).
train['Embarked'] = train['Embarked'].fillna(common_value)

In [None]:
# Re-count missing Embarked values in the training frame.
train.Embarked.isnull().sum()

In [None]:
# Number of missing Fare values in the test frame.
test.Fare.isnull().sum()

In [None]:
# Mean Fare of the test frame, kept as the fill value for missing fares.
fill_fare = test.Fare.mean()

In [None]:
# Assign the filled column back. fillna(inplace=True) on a column selection
# acts on an intermediate object and is not guaranteed to update `test`
# (pandas warns about it, and it is a no-op under Copy-on-Write).
test['Fare'] = test['Fare'].fillna(fill_fare)

In [None]:
# Re-count missing Fare values in the test frame.
test.Fare.isnull().sum()

In [None]:
# Remove the columns that are not used as features: Cabin, Ticket and Name
# from both frames, plus PassengerId from train only (test keeps it).

train.drop(columns = ['Cabin', 'Ticket', 'Name', 'PassengerId'], inplace = True)
test.drop(columns = ['Cabin', 'Name', 'Ticket'], inplace = True)

In [None]:
# Missing-value heatmap of the training frame with its current columns.
sns.heatmap(train.isnull(), cbar = False, cmap = 'viridis')

In [None]:
# Missing-value heatmap of the test frame with its current columns.
sns.heatmap(test.isnull(), cbar = False, cmap = 'viridis')

In [None]:
# Feature transformation: convert categorical values into integers.

# Preview the training frame first.
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Cast Fare and Age to integers in the training frame, then re-check the dtypes.
for column in ('Fare', 'Age'):
    train[column] = train[column].astype('int')
train.info()

In [None]:
# Cast Fare and Age to integers in the *test* frame.
# The source has to be the test columns themselves: assigning train's columns
# here replaces the test features with index-aligned training rows.
test['Fare'] = test['Fare'].astype('int')
test['Age'] = test['Age'].astype('int')
test.info()

In [None]:
# Encode the categorical Sex and Embarked columns of the training frame as integers.
Gender = {'male': 0, 'female': 1}
Port = {'S': 0, 'C': 1, 'Q': 2}

train['Sex'] = train['Sex'].map(Gender)
train['Embarked'] = train['Embarked'].map(Port)
train.head()

In [None]:
# Encode the categorical Sex and Embarked columns of the test frame with the same integer codes.
Gender = {'male': 0, 'female': 1}
Port = {'S': 0, 'C': 1, 'Q': 2}

test['Sex'] = test['Sex'].map(Gender)
test['Embarked'] = test['Embarked'].map(Port)
test.head()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

**Logistic Regression Model**

In [None]:
# Separate the target from the features, then hold out 33% of the rows for evaluation.
y = train['Survived']
X = train.drop(columns = 'Survived')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.33, random_state = 42
)
X_train.shape

In [None]:
# Fit a logistic-regression classifier (lbfgs solver, max_iter raised to 400) on the training split.
model = LogisticRegression(solver = 'lbfgs', max_iter = 400)
model.fit(X_train, y_train)
# Predicted labels for the held-out split.
y_predict = model.predict(X_test)

In [None]:
# Score the fitted model on the held-out split.
model.score(X_test, y_test)

**Predictions on the Test Set**

In [None]:
# Preview the test frame before predicting.
test.head()

In [None]:
# Feature columns of the training split; used to select the same columns from the test frame.
wanted_test_columns = X_train.columns
wanted_test_columns

In [None]:
# Predict on the test frame, restricted to the columns the model was fitted on.
predictions = model.predict(test[wanted_test_columns])

In [None]:
# Build the submission table: one row per test passenger with the predicted label.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
})
submission

In [None]:
# Write the submission to the parent directory, without the index column.
submission.to_csv('../logisticregression_submission.csv', index=False)

In [None]:
# Read the written file back and display it.
test_submission = pd.read_csv('../logisticregression_submission.csv')
test_submission