In [None]:
!pip install -U dataprep

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dataprep.eda import plot ,plot_diff, plot_missing,plot_correlation,create_report
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

In [None]:
# Load the training set, the test set and the sample submission, in that order.
csv_paths = (
    "/kaggle/input/titanic/train.csv",
    "/kaggle/input/titanic/test.csv",
    "/kaggle/input/titanic/gender_submission.csv",
)
train_df, test_df, gs = [pd.read_csv(csv_path) for csv_path in csv_paths]

# SIMPLE EDA

In [None]:
# Preview the leading rows of the training frame.
train_df.head()

In [None]:
# Preview the leading rows of the test frame.
test_df.head()

In [None]:
# Summary statistics for the training frame.
train_df.describe()

In [None]:
# Structural overview (columns, dtypes, non-null counts) of the training frame.
train_df.info()

In [None]:
# Structural overview (columns, dtypes, non-null counts) of the test frame.
test_df.info()

In [None]:
# dataprep.eda.plot called on the whole training frame (automated overview).
plot(train_df)

In [None]:
# dataprep.eda.plot_correlation called on the whole training frame.
plot_correlation(train_df)

In [None]:
# Compare train against test with dataprep's plot_diff; the target column is
# removed from a copy first, leaving train_df itself untouched.
pom = train_df.drop(columns=["Survived"])
frames_to_compare = [pom, test_df]
plot_diff(frames_to_compare)

In [None]:
# dataprep.eda.plot_missing called on the training frame.
plot_missing(train_df)

In [None]:
# dataprep.eda.plot_missing called on the test frame.
plot_missing(test_df)

In [None]:
# Build a dataprep report object for the training frame.
train_data_report = create_report(train_df)

# Show the report via the report object's own show() method.
train_data_report.show()

# Save the report under the base name 'train_data_report'
# (output location/extension are decided by dataprep — TODO confirm).
train_data_report.save(filename='train_data_report')

In [None]:
# Build a dataprep report object for the test frame.
test_data_report = create_report(test_df)

# Show the report via the report object's own show() method.
test_data_report.show()

# Save the report under the base name 'test_data_report'
# (output location/extension are decided by dataprep — TODO confirm).
test_data_report.save(filename='test_data_report')

In [None]:
# Split the raw training columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
raw_column_dtypes = train_df.dtypes
categorical_cols = [
    column for column, column_dtype in raw_column_dtypes.items()
    if column_dtype == "object"
]
numerical_cols = [
    column for column, column_dtype in raw_column_dtypes.items()
    if column_dtype in ['int64', 'float64']
]

categorical_cols

In [None]:
# Display the list of numerical column names.
numerical_cols

In [None]:
# Count missing values per column in the training frame.
train_df.isnull().sum()

In [None]:
# Count missing values per column in the test frame.
test_df.isnull().sum()

# SIMPLE FEATURE ENGINEERING

# Feature engineering adapted from the notebook by Chris Solomou (Kaggle username: christodoulos)

In [None]:
def _add_engineered_features(frame):
    """Return a copy of ``frame`` with the three engineered columns appended.

    Columns added, in this order:

    * ``title``       - the part of ``Name`` between the first comma and the
                        next period (e.g. ``" Mr"``). The leading space left
                        by the split is kept, so the category values are the
                        same as those produced by a plain
                        ``name.split(',')[1].split('.')[0]``.
    * ``cabin_class`` - first character of ``Cabin``; stays missing where
                        ``Cabin`` is missing.
    * ``alone``       - 1.0 when ``SibSp + Parch == 0``, otherwise 0.0
                        (float, so the column is picked up as float64).

    The work is done with vectorised column operations and a single
    ``assign`` call instead of per-row ``frame[col][i] = value`` writes.
    Chained-index writes like that emit SettingWithCopy warnings and, under
    pandas Copy-on-Write, do not modify ``frame`` at all. This version also
    does not depend on the index being 0..n-1 and no longer needs ``np.NaN``
    placeholders (that alias was removed in NumPy 2.0).
    """
    after_comma = frame['Name'].str.split(',').str[1]
    relatives_aboard = frame['SibSp'] + frame['Parch']
    return frame.assign(
        title=after_comma.str.split('.').str[0],
        cabin_class=frame['Cabin'].str[:1],
        alone=relatives_aboard.eq(0).astype('float64'),
    )


# Apply the identical transformation to both frames so that train and test
# end up with the same engineered columns.
train_df = _add_engineered_features(train_df)
test_df = _add_engineered_features(test_df)


In [None]:
# Keep the test passenger ids (needed for the submission file) and the training
# labels before their columns are removed from the feature frames.
test_ids = test_df.PassengerId
classes = train_df.Survived

# Columns that are either already summarised by engineered features or are
# not used as model inputs.
unused_columns = ['Cabin', 'Name', 'Ticket', 'PassengerId']

# Drop in place, as before: the training frame additionally loses its label.
train_df.drop(columns=unused_columns + ['Survived'], inplace=True)
test_df.drop(columns=unused_columns, inplace=True)


In [None]:
# Preview the training frame after feature engineering and column drops.
train_df.head()

In [None]:
# Preview the test frame after feature engineering and column drops.
test_df.head()

# MODELING

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Recompute the dtype-based column groups on the current training frame:
# object columns are categorical, int64/float64 columns are numerical.
feature_dtypes = train_df.dtypes
categorical_cols = [
    column for column, column_dtype in feature_dtypes.items()
    if column_dtype == "object"
]
numerical_cols = [
    column for column, column_dtype in feature_dtypes.items()
    if column_dtype in ['int64', 'float64']
]

In [None]:
# Display the numerical feature names that will be passed to the preprocessor.
numerical_cols

In [None]:
# Display the categorical feature names that will be passed to the preprocessor.
categorical_cols

In [None]:
# Numerical features: constant imputation. No fill_value is passed, so the
# imputer falls back to its own default constant.
numerical_transformer = SimpleImputer(strategy="constant")

# Categorical features: constant imputation followed by one-hot encoding, with
# handle_unknown="ignore" set on the encoder.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Hold out 20% of the labelled data for validation; the seed is fixed so the
# split is the same on every run.
split_options = {"train_size": 0.8, "test_size": 0.2, "random_state": 123}
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df, classes, **split_options
)

In [None]:
# The three candidate classifiers share one seed for reproducibility.
model_seed = 123

lr_model = LogisticRegression(
    C=0.175,
    max_iter=1000,
    random_state=model_seed,
)
rf_model = RandomForestClassifier(
    n_estimators=1000,
    random_state=model_seed,
)
xgb_model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    random_state=model_seed,
)

In [None]:
from sklearn.base import clone


def _build_pipeline(estimator):
    """Wrap ``estimator`` in a pipeline with its own copy of the preprocessor.

    Pipeline does not copy its steps, so passing the same ``preprocessor``
    object to several pipelines makes them share one mutable transformer:
    fitting any one pipeline refits the transformer used by all the others.
    Cloning gives every pipeline an independent, unfitted preprocessor with
    identical parameters.
    """
    return Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('model', estimator)])


lr_pipe = _build_pipeline(lr_model)
rf_pipe = _build_pipeline(rf_model)
xgb_pipe = _build_pipeline(xgb_model)

In [None]:
# Fit each pipeline (preprocessing + model) on the training split.
lr_pipe.fit(X_train,y_train)
rf_pipe.fit(X_train,y_train)
xgb_pipe.fit(X_train,y_train)

In [None]:
# Predict the held-out validation rows with every fitted pipeline.
fitted_pipes = (lr_pipe, rf_pipe, xgb_pipe)
lr_pred, rf_pred, xgb_pred = [
    fitted_pipe.predict(X_valid) for fitted_pipe in fitted_pipes
]

In [None]:
# Score each model's validation predictions, then report them.
lr_accuracy = accuracy_score(y_valid, lr_pred)
rf_accuracy = accuracy_score(y_valid, rf_pred)
xgb_accuracy = accuracy_score(y_valid, xgb_pred)

print(f"Logistic regression accuracy: {lr_accuracy}")
print(f"Random forest accuracy: {rf_accuracy}")
print(f"XGB accuracy: {xgb_accuracy}")

In [None]:
# Predict test-set labels with the logistic-regression pipeline.
# NOTE(review): lr_pipe was fitted on X_train (the 80% split) only, not on all
# of the labelled data.
final_predictions = lr_pipe.predict(test_df)


In [None]:
# Pair each test passenger id with its predicted label and write the result
# to submission.csv without the index column.
submission_columns = {
    'PassengerId': test_ids,
    'Survived': final_predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

# FINAL NOTE:

This notebook is intended to be simple, so I used automated EDA tools and did only minimal modeling.
To improve the model score, we should evaluate with cross-validation, tune hyperparameters, try more models, etc.

Hope this notebook helps you!

Also check out christodoulos, whose feature-engineering code and logistic regression hyperparameters I adapted for this notebook.