In [1]:
import numpy as np                                                 # Implemennts milti-dimensional array and matrices
import pandas as pd                                                # For data manipulation and analysis
import pandas_profiling
import matplotlib.pyplot as plt                                    # Plotting library for Python programming language and it's numerical mathematics extension NumPy
import seaborn as sns                                              # Provides a high level interface for drawing attractive and informative statistical graphics
%matplotlib inline
sns.set()

from subprocess import check_output
from collections import Counter
import os

In [1]:
# Print the full path of every file found under /kaggle/input
# (used to discover where the competition data lives on Kaggle).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [1]:
# Directory prefix for the CSV files; the trailing slash matters because
# file names are appended with plain string concatenation below.
base_path='/kaggle/input/titanic/'

In [1]:
# Load the labelled training set and the unlabelled test set from base_path.
train = pd.read_csv(os.path.join(base_path, 'train.csv'))
test = pd.read_csv(os.path.join(base_path, 'test.csv'))

In [1]:
# (rows, columns) of the raw frames as loaded from CSV.
print(train.shape)
print(test.shape)

In [1]:
# Keep the test ids aside: PassengerId is dropped from the feature frame
# later on, but it is needed again to build the submission file.
test_PassengerId = test["PassengerId"]

# Outlier treatment

In [1]:
def detect_outliers(df,n,features):
    """
    Return the index labels of rows that are outliers in more than n features.

    A value is an outlier for a column (Tukey method) when it lies below
    Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR, where Q1/Q3 are the 25th/75th
    percentiles of that column and IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is returned only if it is an outlier in strictly more than
        n of the inspected columns.
    features : iterable
        Column labels of df to inspect.

    Returns
    -------
    list
        Index labels of the selected rows, in order of first detection.
    """
    outlier_counts = Counter()

    # iterate over features(columns)
    for col in features:
        values = df[col]

        # Quartiles must ignore missing values: np.percentile yields NaN as
        # soon as the column contains a NaN, which makes both comparisons
        # below False and silently hides every outlier of that column.
        Q1, Q3 = np.nanpercentile(values, [25, 75])

        # outlier step: 1.5 times the interquartile range
        outlier_step = 1.5 * (Q3 - Q1)

        # NaN cells compare False on both sides, so they are never flagged
        is_outlier = (values < Q1 - outlier_step) | (values > Q3 + outlier_step)

        # count one hit per (row, column) pair
        outlier_counts.update(values[is_outlier].index)

    # select observations that are outliers in more than n columns
    multiple_outliers = [idx for idx, hits in outlier_counts.items() if hits > n]

    return multiple_outliers

In [1]:
# Index labels of training rows that are Tukey outliers in more than 2 of
# the four listed numeric columns.
Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])

In [1]:
# Summary statistics of the columns that were checked for outliers.
train[["Age","SibSp","Parch","Fare"]].describe()

In [1]:
# Display the rows flagged as outliers before they are removed.
train.loc[Outliers_to_drop] # Show the outliers rows

In [1]:
# Remove the flagged rows and renumber the index from 0 so that positions
# and labels coincide again.
train = train.drop(index=Outliers_to_drop).reset_index(drop=True)

# Joining train and test

In [1]:
# Shapes after outlier removal (only train was filtered).
print(train.shape)
print(test.shape)

In [1]:
## Stack train on top of test so that every transformation (and in
## particular the categorical encoding) yields the same columns for both.
## train_len remembers where the training rows end so the frame can be split again.
train_len = train.shape[0]
dataset = pd.concat([train, test], axis=0, ignore_index=True)

# Missing Values and NA treatment

In [1]:
# dataset = dataset.fillna(np.nan)

In [1]:
# Per-column missing-value report: absolute count and percentage of rows,
# displayed with the most incomplete columns first.
na_counts = dataset.isna().sum()

miss = na_counts.sort_values().reset_index()
miss.columns = ['column', 'total_miss']

percent = ((na_counts / dataset.shape[0]) * 100).sort_values().reset_index()
percent.columns = ['column', 'percent_miss']

miss = miss.merge(percent, on='column')
miss.sort_values(by='percent_miss', ascending=False)

In [1]:
# Replace missing fares with the median fare of the combined train+test frame.
fare_median = dataset["Fare"].median()
dataset["Fare"] = dataset["Fare"].fillna(fare_median)

In [1]:
# Replace missing ports of embarkation with the most frequent value
# (mode() may return several values on ties; the first one is used).
most_common_port = dataset['Embarked'].mode()[0]
dataset["Embarked"] = dataset["Embarked"].fillna(most_common_port)


In [1]:
# Impute each missing Age with the median Age of the rows sharing the same
# SibSp, Parch and Pclass; fall back to the overall median Age when that
# group has no known age.
index_NaN_age = list(dataset.index[dataset["Age"].isnull()])

for i in index_NaN_age :
    row = dataset.loc[i]
    same_group = (
        (dataset['SibSp'] == row["SibSp"])
        & (dataset['Parch'] == row["Parch"])
        & (dataset['Pclass'] == row["Pclass"])
    )
    age_pred = dataset.loc[same_group, "Age"].median()
    if np.isnan(age_pred) :
        # no known age in the group: use the overall median (computed on the
        # frame as filled so far, as the original loop did)
        age_pred = dataset["Age"].median()
    # Single .loc assignment writes into `dataset` itself. The previous
    # chained form dataset['Age'].iloc[i] = ... assigns to a temporary
    # Series: it raises SettingWithCopyWarning and does nothing at all when
    # pandas Copy-on-Write is enabled.
    dataset.loc[i, 'Age'] = age_pred


# Feature Engineering

In [1]:
# Family size = siblings/spouses + parents/children + the passenger.
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']+1

In [1]:
# Three-way label: 'child' for passengers younger than 15, otherwise the
# value of Sex.
dataset['GenderClass'] = dataset['Sex'].mask(dataset['Age'] < 15, 'child')

In [1]:
# Extract the title from Name: take the text between the first and second
# comma, keep what precedes its first period, and strip whitespace.
dataset_title = []
for full_name in dataset["Name"]:
    after_surname = full_name.split(",")[1]
    dataset_title.append(after_surname.split(".")[0].strip())

dataset["Title"] = pd.Series(dataset_title)
dataset["Title"].head()

In [1]:
# Collapse infrequent titles into 'Rare', then encode titles as integers:
# 0 = Master, 1 = Miss/Ms/Mme/Mlle/Mrs, 2 = Mr, 3 = Rare.
# A title missing from the mapping becomes NaN and makes astype(int) fail.
rare_titles = ['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_codes = {"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3}

dataset["Title"] = (
    dataset["Title"]
    .replace(rare_titles, 'Rare')
    .map(title_codes)
    .astype(int)
)

In [1]:
# One-hot encode GenderClass and Embarked (indicator columns are named
# <column>_<value>, e.g. GenderClass_female, Embarked_Q).
dataset = pd.get_dummies(dataset, columns=['GenderClass','Embarked'])

In [1]:
# Treat Pclass as a category and one-hot encode it into Pc_<class> columns.
dataset = pd.get_dummies(
    dataset.astype({"Pclass": "category"}),
    columns=["Pclass"],
    prefix="Pc",
)

In [1]:
# dataset["Cabin"][dataset["Cabin"].notnull()].head()

In [1]:
# dataset["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin'] ])

In [1]:
# dataset = pd.get_dummies(dataset, columns = ["Cabin"],prefix="Cabin")

In [1]:
# Ticket = []
# for i in list(dataset.Ticket):
#     if not i.isdigit() :
#         Ticket.append(i.replace(".","").replace("/","").strip().split(' ')[0]) #Take prefix
#     else:
#         Ticket.append("X")
        
# dataset["Ticket"] = Ticket
# dataset["Ticket"].head()

In [1]:
# dataset = pd.get_dummies(dataset, columns = ["Ticket"], prefix="T")

In [1]:
# Drop the raw columns that have been replaced by engineered features
# (Name, Sex, SibSp, Parch) or that are not used as features
# (PassengerId, Cabin, Ticket).
unused_columns = ['Name','Sex','SibSp','Parch', 'PassengerId', 'Cabin','Ticket']
dataset = dataset.drop(columns=unused_columns)


In [1]:
# Preview the fully engineered feature frame.
dataset.head(5)

In [1]:
# Split the combined frame back into its two parts using the row count
# saved before concatenation. Both results are independent frames (explicit
# copy / non-inplace drop) rather than slices of `dataset`, so later column
# assignments neither warn (SettingWithCopyWarning) nor depend on
# view-vs-copy behavior.
train = dataset.iloc[:train_len].copy()
# Survived is the target, not a feature, so it is removed from the test part.
test = dataset.iloc[train_len:].drop(columns=['Survived'])

In [1]:
# Cast the target to int64. assign() builds a new frame instead of setting
# an attribute on `train`, which is a slice of `dataset` and would trigger
# SettingWithCopyWarning; the column keeps its position.
train = train.assign(Survived=train['Survived'].astype('int64'))

In [1]:
# Shapes after feature engineering; test has one column fewer (no Survived).
print(train.shape)
print(test.shape)

In [1]:
# Pairwise scatter/distribution plots of Fare and Age, coloured by Survived.
pair_grid = sns.pairplot(train[["Fare","Age","Survived"]],vars = ["Fare","Age"],hue="Survived", dropna=True,markers=["o", "s"])
# pairplot draws a grid of axes; plt.title would only label the last one,
# so the title is set on the figure and lifted slightly above the grid.
pair_grid.fig.suptitle('Pair Plot', y=1.02);

In [1]:
# Correlation of every column of train (rows) with a selected subset of
# columns (columns of the result).
corr = train.corr()[['Age', 'Fare', 'FamilySize', 'Title', 'GenderClass_female', 'Embarked_Q', 'Pc_1', 'Survived']]

In [1]:
# Annotated heatmap of the correlation table.
plt.figure(figsize=(10,10))
ax = sns.heatmap(corr,vmax=.8,linewidth=.01, square = True, annot = True,cmap='YlGnBu',linecolor ='black')
ax.set_title('Correlation between features')
# Explicit y-limits (presumably a workaround for matplotlib versions that
# clip the first/last heatmap rows). The upper bound must follow the number
# of rows: corr has one row per column of train, and a hard-coded 10 hides
# every row after the tenth.
ax.set_ylim(len(corr), 0);

In [1]:
# Bar chart of each column's correlation with Survived, sorted ascending.
corr['Survived'].sort_values().plot(kind='bar')

In [1]:
# Class balance of the target in the training rows.
sns.countplot(x='Survived',data=train)

# Modeling

In [1]:
# Feature matrix: every column of train except the target.
X = train.drop(columns='Survived')
X.head()

In [1]:
# Target vector.
y = train.Survived 

In [1]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the labelled rows for validation; random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [1]:
# Sanity check: features and labels of the training split have the same row count.
print(X_train.shape)
print(y_train.shape)

In [1]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an explicit iteration cap of 300 for the solver,
# fitted on the training split only.
logreg = LogisticRegression(max_iter=300)
logreg.fit(X_train,y_train)

In [1]:
# Predicted labels for the rows the model was fitted on.
y_pred_train = logreg.predict(X_train)  

In [1]:
# Predicted labels for the held-out validation split (not the Kaggle test set).
y_pred_test = logreg.predict(X_test)                                                           # make predictions on the testing set

# Model Evaluation

In [1]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
print('Accuracy score for test data is:', accuracy_score(y_test,y_pred_test))

In [1]:
# Import the function under an alias: the result below keeps the name
# `confusion_matrix` (later cells relabel it), and without the alias that
# assignment would overwrite the function, so re-running this cell would
# fail with "'DataFrame' object is not callable".
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

# Rows are actual classes, columns are predicted classes. labels=[0, 1]
# pins the matrix to 2x2 even if a class is absent from the held-out split.
confusion_matrix = pd.DataFrame(compute_confusion_matrix(y_test, y_pred_test, labels=[0, 1]))

print(confusion_matrix)

In [1]:
# Give the 2x2 table readable labels: rows are the actual outcome,
# columns the predicted outcome.
confusion_matrix.index = ['Actual Died','Actual Survived']
confusion_matrix.columns = ['Predicted Died','Predicted Survived']
print(confusion_matrix)

In [1]:
# Stricter decision rule: predict survival only when the estimated
# probability of class 1 exceeds 0.75.
survival_proba = logreg.predict_proba(X_test)[:, 1]
preds1 = (survival_proba > 0.75).astype(int)
print('Accuracy score for test data is:', accuracy_score(y_test, preds1))

In [1]:
# Looser decision rule: predict survival as soon as the estimated
# probability of class 1 exceeds 0.25.
survival_proba = logreg.predict_proba(X_test)[:, 1]
preds2 = (survival_proba > 0.25).astype(int)
print('Accuracy score for test data is:', accuracy_score(y_test, preds2))

# Submission

In [1]:
# Shape of the Kaggle test features that will be scored.
test.shape

In [1]:
# Column dtypes and non-null counts of the Kaggle test features.
test.info()

In [1]:
# Predict labels for the Kaggle test features with the fitted model.
final_pred = logreg.predict(test)                                                           # make predictions on the testing set

In [1]:
# Display the predicted labels.
final_pred

In [1]:
# Build the submission table (one row per test passenger: id and predicted
# label) and write it to submission.csv without the index column.
sub = pd.DataFrame({
    'PassengerId': test_PassengerId,
    'Survived': final_pred,
})
sub.to_csv('submission.csv', index=False)

In [1]:
# Shape of the submission table.
sub.shape

In [1]:
# Preview the first rows of the submission table.
sub.head(5)