In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSVs (in that order) and summarise the training
# frame's columns, dtypes and non-null counts.
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-apr-2021/train.csv",
        "/kaggle/input/tabular-playground-series-apr-2021/test.csv",
    )
)
data.info()

In [None]:
# (rows, columns) of the train and test frames, then a preview of the first
# training rows.
for frame in (data, test_data):
    print(frame.shape)
data.head()

In [None]:
# How many rows carry each value of the Survived column.
data.Survived.value_counts()

In [None]:
# Tally the Survived values separately for each value of Sex.
survived_by_sex = data.groupby("Sex")["Survived"]
sex_ratio = survived_by_sex.value_counts()
print(sex_ratio)


In [None]:
# Tally the Survived values separately for each embarkation port.
emb_ratio = data.groupby("Embarked")["Survived"].value_counts()
print(emb_ratio)

# Survival rate per port = share of rows with Survived == 1, i.e. the mean of
# the flag (assumes Survived is coded 0/1 -- confirm against data.info()).
# Positional indexing into the value_counts() result would instead divide one
# count by the other and depend on the within-group sort order.
emb_survival_pct = data.groupby("Embarked")["Survived"].mean().mul(100).round(1)
print("\nSurvival rate across Embarking Location", emb_survival_pct.to_dict())

In [None]:
# Box plot of Fare, one box per Embarked value.
sns.boxplot(x="Embarked", y="Fare", data=data)


In [None]:
# Reduced frame: PassengerId, Name, Ticket and Cabin are left out of modelling.
dropped_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
red_data = data.drop(columns=dropped_columns)

In [None]:
# Visualise where values are missing: one heatmap column per feature, row
# labels suppressed.
sns.heatmap(red_data.isna(), yticklabels=False)

In [None]:
# Boolean mask of missing cells, then the missing/present tally for the two
# columns inspected here.
nulls = red_data.isna()
print(nulls["Age"].value_counts())
print(nulls["Embarked"].value_counts())
# The mean of a boolean mask is the fraction of True values, so the reported
# percentage always reflects the loaded data instead of a hand-typed figure.
age_na_pct = nulls["Age"].mean() * 100
print(f"\n{age_na_pct:.1f}% of Age is NA. We need to fill this later on appropriately")

In [None]:
# Encode Sex numerically (female -> 0, male -> 1). The mapping reads from the
# untouched `data` frame and the assignment aligns on the row index, so
# re-running this cell reproduces the same column instead of mapping the
# already-encoded 0/1 values to NaN.
red_data["Sex"] = data["Sex"].map({"female":0, "male":1})

In [None]:
# One-hot encode Embarked; drop_first=True omits the first category's column.
# The dummy dtype is pinned to uint8: newer pandas versions default to bool
# dummies, and a frame mixing bool with numeric columns converts to an
# object-dtype array, which array-based consumers of the frame cannot use.
# NOTE: rows whose Embarked is missing get 0 in every dummy column, because
# dummy_na defaults to False.
embarked_dummies = pd.get_dummies(red_data["Embarked"], drop_first=True, dtype=np.uint8)
red_data = pd.concat([red_data.drop("Embarked", axis=1), embarked_dummies], axis=1)
red_data.head()

In [None]:
# Discard every row that still contains a missing value.
red_data = red_data.dropna()

In [None]:
# Size and column labels of the cleaned frame, one per line.
print(red_data.shape, red_data.columns, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the features from the Survived target and hold out 30% of the rows
# for evaluation. random_state is fixed so that a Restart & Run All produces
# the same partition, and therefore comparable scores, on every run.
X = red_data.drop(columns="Survived")
y = red_data["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with the iteration cap raised to 1000; fit() returns the
# estimator itself, so construction and fitting can be chained.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
log_model

In [None]:
# Accuracy in percent: training split first, held-out split second.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(log_model.score(features, labels)*100)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Confusion matrix on the held-out split; the diagonal holds the correctly
# classified rows, so diagonal / total is the accuracy (printed in percent).
test_predictions = log_model.predict(X_test)
cfm = confusion_matrix(y_test, test_predictions)
print(cfm)
print(np.diag(cfm).sum()/cfm.sum()*100)

In [None]:
# Text report of the classification metrics on the held-out split.
report = classification_report(y_test, log_model.predict(X_test))
print(report)

In [None]:
# Missing-value count per column of the test frame. Uses the DataFrame's own
# column-wise sum instead of routing the reduction through np.sum.
test_data.isna().sum()

In [None]:
# Fill the gaps in the test frame: numeric columns with their own mean,
# Embarked with the label "Q". The third fill targets Embarked (filling Age
# with "Q" after the mean fill would be a no-op and leave Embarked with NaN).
# Columns are reassigned rather than filled in place through a column
# selection, so the frame is updated regardless of pandas' copy semantics.
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mean())
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].mean())
test_data["Embarked"] = test_data["Embarked"].fillna("Q")

In [None]:
# Prepare the test frame for prediction: drop Name, Ticket and Cabin, encode
# Sex as 0/1 and one-hot encode Embarked (first category's column omitted).
# PassengerId is kept so it can be attached to the predictions.
# The dummy dtype is pinned to uint8: newer pandas versions default to bool
# dummies, and a frame mixing bool with numeric columns converts to an
# object-dtype array, which array-based consumers of the frame cannot use.
test_final = test_data.drop(columns=["Name", "Ticket", "Cabin"])
test_final["Sex"] = test_final["Sex"].map({"female":0, "male":1})
test_embarked = pd.get_dummies(test_final["Embarked"], drop_first=True, dtype=np.uint8)
test_final = pd.concat([test_final.drop(columns="Embarked"), test_embarked], axis=1)
test_final.head(1)

In [None]:
# Predict with the logistic model on every test column except PassengerId.
test_features = test_final.drop(columns="PassengerId")
result = log_model.predict(test_features)
print(result)

In [None]:
# Two-column frame pairing each PassengerId with its predicted Survived value.
result = test_final[["PassengerId"]].assign(Survived=result)
print(result.shape)
result.head()

In [None]:
# result.to_csv("./my_result_logmodel_1.csv", index=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Depth-limited decision tree (max_depth=2).
# Unrestricted Tree grows to ~60 depth
dtree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
dtree

In [None]:
# Accuracy in percent: training split first, held-out split second.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(dtree.score(features, labels)*100)

In [None]:
# Draw the fitted tree on a 15x10 figure. feature_names is passed as a plain
# list: some scikit-learn releases validate this parameter and reject a pandas
# Index, while a list is accepted everywhere.
plt.figure(figsize=(15,10))
tree.plot_tree(dtree, filled=True, rounded=True, feature_names=list(X_train.columns))

In [None]:
# Feature labels and how many of them there are, one per line.
print(X_train.columns, len(X_train.columns), sep="\n")

In [None]:
# Random forest of 300 depth-3 trees, each split considering 2 features.
# random_state is fixed because the forest is randomised (bootstrap samples and
# feature subsets); without it the scores differ on every run.
rf_model = RandomForestClassifier(n_estimators=300, max_features=2, max_depth=3, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Accuracy in percent: training split first, held-out split second.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(rf_model.score(features, labels)*100)

In [None]:
from sklearn.svm import SVC

In [None]:
# RBF-kernel SVM fitted on only the first `nmax` training rows (selected by
# position).
nmax = 1000
svm_model = SVC(kernel="rbf", C=10, gamma=10)
svm_model.fit(X_train.iloc[:nmax], y_train.iloc[:nmax])

In [None]:
# from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(SVC(), param_grid={"C":[0.1,1,10], "gamma":[1,10,100]}, verbose=2)
# grid.fit(X_train, y_train)

In [None]:
# print(svm_model.score(X_train[:nmax], y_train[:nmax]) )
# print(svm_model.score(X_test[:nmax], y_test[:nmax]) )

In [None]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
# Feed-forward classifier: two 100-unit sigmoid layers, each followed by 25%
# dropout, and a single sigmoid output unit.
# The input width is read from X_train rather than hard-coded, so the model
# keeps matching the data if the feature set changes.
n_features = X_train.shape[1]
nn = Sequential([
    Dense(100, activation="sigmoid", input_shape=(n_features,)),
    Dropout(0.25),
    Dense(100, activation="sigmoid"),
    Dropout(0.25),
    Dense(1, activation="sigmoid"),
])
nn.summary()

In [None]:
# Binary cross-entropy loss with accuracy tracking; the optimizer is spelled
# out explicitly ("rmsprop" is what compile() uses when none is given).
nn.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Train for 10 passes over the training split.
nn.fit(x=X_train, y=y_train, epochs=10)

In [None]:
# evaluate() output (loss and the compiled metrics): training split first,
# held-out split second.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(nn.evaluate(features, labels))

In [None]:
# Network outputs for the test rows, thresholded at 0.5 into 0/1 labels and
# paired with the corresponding PassengerId.
test_probabilities = nn.predict(test_final.drop(columns="PassengerId"))
# sum(nn.predict_classes(X_test).reshape(-1,) != result2)
result2 = pd.DataFrame({
    "PassengerId": test_final["PassengerId"],
    "Survived": (test_probabilities.ravel() > 0.5).astype(int),
})
print(result2.head())

In [None]:
# Write the network's predictions to the working directory, without the index.
result2.to_csv(path_or_buf="./result_2_nn_trials.csv", index=False)

In [None]:
# Training features extended with the natural log of Age and of Fare, appended
# as the last two columns. The new columns get their own labels so the frame
# does not end up with two columns called "Age" and two called "Fare".
featplus_data = X_train.assign(
    Age_log=np.log(X_train["Age"]),
    Fare_log=np.log(X_train["Fare"]),
)
featplus_data.head(1)

In [None]:
# Same architecture as the first network (two 100-unit sigmoid layers with 25%
# dropout, sigmoid output), trained on the log-augmented feature frame.
# The input width is read from featplus_data rather than hard-coded.
n_featplus = featplus_data.shape[1]
nn2 = Sequential([
    Dense(100, activation="sigmoid", input_shape=(n_featplus,)),
    Dropout(0.25),
    Dense(100, activation="sigmoid"),
    Dropout(0.25),
    Dense(1, activation="sigmoid"),
])
# nn2.summary()
nn2.compile(loss="binary_crossentropy", metrics=["accuracy"])
nn2.fit(featplus_data, y_train, epochs=10)

In [None]:
# Build the held-out counterpart of the log-augmented features (log of Age and
# of Fare appended as extra columns), then report evaluate() output for the
# training split followed by the held-out split.
featplus_test = pd.concat([X_test, np.log(X_test["Age"]), np.log(X_test["Fare"])], axis=1)
print(nn2.evaluate(featplus_data, y_train))
print(nn2.evaluate(featplus_test, y_test))