In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test sets from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
df_test = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')

# Explore the training dataset: first rows, summary statistics, schema and size.
print(df_train.head())
print(df_train.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df_train.info()
print(df_train.shape)

In [None]:
# Show every training row that has a missing value in at least one column.
df_train.loc[df_train.isna().any(axis='columns')]

In [None]:
# Discard columns that likely won't be a factor, or that would be difficult to
# convert to numeric values.
df_train = df_train.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

# Turn the two categorical columns into integer codes and discard the original
# string columns.
# Note: `.cat.codes` encodes a missing value as -1, so a row whose Sex or
# Embarked is missing keeps a -1 code and is NOT removed by the dropna() below.
df_train = df_train.astype({'Sex': 'category', 'Embarked': 'category'})
df_train = df_train.assign(
    Sex_Cat=df_train['Sex'].cat.codes,
    Embarked_Cat=df_train['Embarked'].cat.codes,
).drop(columns=['Sex', 'Embarked'])

# Remove every row that still has a missing value in any remaining column.
df_train = df_train.dropna()

# Split the cleaned frame into the target vector y and the feature matrix X.
y = df_train['Survived'].values
X = df_train.drop(columns='Survived').values


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the pairwise correlations of the remaining columns as a heatmap to get a
# first impression of which ones are related to 'Survived'.
sns.heatmap(data=df_train.corr(), cmap='RdYlGn', square=True)
plt.show()

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

# Fit a Lasso regression on a 70/30 split and plot its coefficients per feature
# to see which features it treats as most important.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Feature names, in the same column order as X.
features = df_train.drop(columns='Survived').columns

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_coef = lasso.coef_

# One x position per feature, labelled with the feature's name.
tick_positions = range(len(features))
plt.plot(tick_positions, lasso_coef)
plt.xticks(tick_positions, features, rotation=60)
plt.ylabel('Coefficients')
plt.show()

In [None]:
# Remove Age and Fare, as they seem to have very little effect, and redefine X and y.
df_train = df_train.drop(columns=['Age', 'Fare'])
y = df_train['Survived'].values
X = df_train.drop(columns='Survived').values

In [None]:
from sklearn.linear_model import LogisticRegression

# Hold out 30% of the rows for evaluation, stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit a logistic regression with default settings (fit() returns the estimator
# itself) and predict class labels for the held-out rows.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [None]:
from sklearn.metrics import roc_curve

# Measure the ROC curve to determine the performance of the model.
# Column 1 of predict_proba holds the score for the second entry of logreg.classes_.
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

# False-positive rates, true-positive rates and the thresholds they were computed at.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot the model's curve against the diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label="Logistic Regression")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
# The label passed to plot() is only rendered once a legend is drawn.
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the held-out predictions (shown as the cell output).
roc_auc_score(y_true=y_test, y_score=y_pred_prob)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated ROC-AUC over X and y: print the per-fold scores,
# then their mean.
cv_scores = cross_val_score(estimator=logreg, X=X, y=y, scoring='roc_auc', cv=10)

print(cv_scores)
print(cv_scores.mean())

In [None]:
# Now let's try it for the Test dataset

# Remove the same columns that were removed from the training data.
df_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId', 'Age', 'Fare'])

# Turn the two categorical columns into integer codes and discard the original
# string columns (`.cat.codes` encodes a missing value as -1).
# NOTE(review): the categories are inferred from the test data on its own, so
# the codes only match the training codes if both files contain the same set
# of Sex / Embarked values - confirm.
df_test = df_test.astype({'Sex': 'category', 'Embarked': 'category'})
df_test = df_test.assign(
    Sex_Cat=df_test['Sex'].cat.codes,
    Embarked_Cat=df_test['Embarked'].cat.codes,
).drop(columns=['Sex', 'Embarked'])

# Replace any remaining missing values with 0.
df_test = df_test.fillna(0)

# Feature matrix for the test rows.
# Note: this rebinds X, which earlier cells use for the training features.
X = df_test.values

In [None]:
# Read the sample submission, set its 'Survived' column to the model's
# predictions for X, and save the result as submission.csv without the index.
# NOTE(review): assumes the sample submission rows are in the same order as
# the rows of test.csv - confirm.
df_submission = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv')
df_submission = df_submission.assign(Survived=logreg.predict(X))
df_submission.to_csv('submission.csv', index=False)