In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, explained_variance_score
import math


# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
# Load the competition's train and test CSVs, then summarise the training
# frame (dtypes and non-null counts) to get a first look at missing data.
train_path = "/kaggle/input/tabular-playground-series-apr-2021/train.csv"
test_path = "/kaggle/input/tabular-playground-series-apr-2021/test.csv"
X, test = pd.read_csv(train_path), pd.read_csv(test_path)
X.info()

In [None]:
# Report (rows, columns) and preview the first five training rows.
train_shape = X.shape
print(train_shape)
X.head(5)

In [None]:
# Identifier-like / free-text columns are not used as features in this notebook.
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
X = X.drop(columns=unused_columns)
X.head(2)

In [None]:
# Visual map of missing cells: one row per passenger, one column per feature.
sns.heatmap(X.isna(), yticklabels=False)

In [None]:
# Exact per-column count of missing values. The heatmap only reveals columns
# whose gaps are dense enough to be visible, so always check the numbers too.
pd.isna(X).sum()

In [None]:
# Short lowercase column names, so later cells can reference features directly
# instead of looking them up via X.columns each time.
short_names = ["survived", "class", "sex", "age", "sibsp", "parch", "fare", "embarked"]
X.columns = short_names
X.head(1)

In [None]:
# Binary-encode sex (female -> 0, male -> 1). 'embarked' is one-hot encoded in
# the next cell rather than integer-coded here.
sex_codes = {"female":0, "male":1}
X["sex"] = X["sex"].map(sex_codes)
X.sample(2)

In [None]:
# One-hot encode the port of embarkation, dropping the first level so the
# remaining dummy columns are not perfectly collinear.
# NOTE(review): get_dummies ignores NaN by default, so rows with a missing port
# end up with every dummy equal to 0 - confirm that is acceptable downstream.
embarked_dummies = pd.get_dummies(X["embarked"], drop_first=True)
X = pd.concat([X.drop("embarked", axis=1), embarked_dummies], axis=1)
X.head(1)

In [None]:
# Among rows whose fare is missing, count the missing values in every column
# (shows whether fare gaps coincide with gaps in other features).
X[ X["fare"].isna() ].isna().sum()

In [None]:
# Among rows whose age is missing, count the missing values in every column.
X[ X["age"].isna() ].isna().sum()

In [None]:
# X[ X["embarked"].isna() ].isna().sum()

In [None]:
# Age distribution split by survival: histogram (left) and KDE (right).
age_fig, (age_hist_ax, age_kde_ax) = plt.subplots(1, 2, figsize=(15,5))
sns.histplot(data=X, x="age", hue="survived", ax=age_hist_ax)
sns.kdeplot(data=X, x="age", hue="survived", ax=age_kde_ax)

In [None]:
# Fare distribution split by survival: histogram (left) and KDE (right).
fare_fig, (fare_hist_ax, fare_kde_ax) = plt.subplots(1, 2, figsize=(17,5))
sns.histplot(data=X, x="fare", hue="survived", ax=fare_hist_ax)
sns.kdeplot(data=X, x="fare", hue="survived", ax=fare_kde_ax)

In [None]:
# Complete cases only: every row that has no missing value in any column.
unmissed = X.dropna(axis=0, how="any")
print(unmissed.shape)
unmissed.head(1)

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [None]:
# Can age be linearly predicted from the other columns? Prints R^2 (in %)
# measured on the same complete-case rows the model was fitted on.
age_inputs, age_values = unmissed.drop("age",axis=1), unmissed["age"]
age_model = LinearRegression().fit(age_inputs, age_values)
print(age_model.score(age_inputs, age_values)*100)
print("The result does not change even if you encode 'Embarked' as 1,2,3")

In [None]:
# Same experiment for fare: in-sample R^2 (in %) of a linear model that
# predicts fare from all remaining columns.
fare_inputs, fare_values = unmissed.drop("fare",axis=1), unmissed["fare"]
fare_model = LinearRegression().fit(fare_inputs, fare_values)
print(fare_model.score(fare_inputs, fare_values)*100)
print("The result does not change even if you encode 'Embarked' as 1,2,3")

In [None]:
# Reminder of the complete-case frame's column layout (the dummy columns Q and S come last).
unmissed.head(1)

In [None]:
# Collapse the two embarked dummies back into one integer label:
#   neither Q nor S (i.e. C) -> 0,  Q -> 1,  S -> 2.
# Cast to int first: newer pandas emits boolean dummies, and adding booleans
# then overwriting selected rows with 1/2 would rely on silent dtype upcasting.
# The weighted sum gives the same labels without any masked assignment.
df = unmissed["Q"].astype(int) + 2 * unmissed["S"].astype(int)
df.value_counts()

In [None]:
def fit_score(model, feature_list):
    """Fit ``model`` to predict the embarked label and return its accuracy in %.

    Uses the notebook-level ``unmissed`` frame (restricted to ``feature_list``)
    as inputs and the notebook-level ``df`` series as the target. The score is
    computed on the same rows used for fitting, i.e. it is a training score,
    rounded to two decimals.
    """
    features = unmissed[feature_list]
    model.fit(features, df)
    accuracy_pct = model.score(features, df) * 100
    return round(accuracy_pct, 2)

In [None]:
# How well does each single feature (and then all of them together) predict the
# port of embarkation? The last two columns of `unmissed` are the Q/S dummies
# themselves, so they are excluded from the candidate features.
qs_model = LogisticRegression(multi_class='ovr', solver='liblinear')
candidate_features = unmissed.columns[:-2]
print("Accuracy of predictin Embarked using different features:\n")
for feat in candidate_features:
    single_feature_score = fit_score(qs_model, [feat])
    print(feat," ", single_feature_score)
print("\nUsing all features = ", fit_score(qs_model, candidate_features))

In [None]:
# Baseline: logistic regression on every feature; training accuracy in %.
baseline_inputs = unmissed.drop("survived",axis=1)
baseline_labels = unmissed["survived"]
new_model = LogisticRegression(max_iter=500).fit(baseline_inputs, baseline_labels)
new_model.score(baseline_inputs, baseline_labels)*100

In [None]:
# Ablation: the same model without the embarked dummies; training accuracy in %.
inputs_wo_embarked = unmissed.drop(["survived","Q","S"],axis=1)
labels_wo_embarked = unmissed["survived"]
new_model_wo_embarked = LogisticRegression(max_iter=500).fit(inputs_wo_embarked, labels_wo_embarked)
new_model_wo_embarked.score(inputs_wo_embarked, labels_wo_embarked)*100

In [None]:
# Engineered feature: 1 when the passenger travels with no siblings/spouse and
# no parents/children aboard, else 0.
# The previous version computed the flag with .apply() but threw the result
# away, so the raw relative count was concatenated instead - and as an unnamed
# Series, which produced a column labelled 0. Naming the Series gives the new
# column a proper string label alongside the existing ones.
relatives_aboard = unmissed["sibsp"] + unmissed["parch"]
alone = (relatives_aboard == 0).astype(int).rename("alone")
with_alone = pd.concat([unmissed, alone], axis=1)

In [None]:
# Does the extra column from `with_alone` help? Training accuracy in %.
alone_inputs = with_alone.drop("survived",axis=1)
alone_labels = with_alone["survived"]
new_model_alone = LogisticRegression(max_iter=500).fit(alone_inputs, alone_labels)
new_model_alone.score(alone_inputs, alone_labels)*100

In [None]:
# Age distribution for each sibling/spouse count (complete-case rows only).
sns.boxplot(data=unmissed, x="sibsp", y="age")

In [None]:
# Fare distribution for each sibling/spouse count (complete-case rows only).
sns.boxplot(data=unmissed, x="sibsp", y="fare")

In [None]:
# Pairwise correlations between all columns of the complete-case frame.
plt.figure(figsize=(15,8))
corr_matrix = unmissed.corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")

In [None]:
# Reduced model on a hand-picked subset of features; training accuracy in %.
corr_features = ["class", "sex","age"]
corr_inputs, corr_labels = unmissed[corr_features], unmissed["survived"]
new_model_corr = LogisticRegression(max_iter=500).fit(corr_inputs, corr_labels)
new_model_corr.score(corr_inputs, corr_labels)*100

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Two-component PCA on the features only (target column removed); show the
# share of variance carried by each component.
pca = PCA(n_components=2).fit(unmissed.drop("survived",axis=1))
pca.explained_variance_ratio_

In [None]:
# Same PCA, but fitted on the full frame including the 'survived' column.
pca_2 = PCA(n_components=2).fit(unmissed)
pca_2.explained_variance_ratio_

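So, with or without "survived", close to 99.9% of the variance in the data matrix is explained by the first two principal components. (Note: the features are not standardised before PCA here, so the components are presumably dominated by the columns with the largest raw variance.)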

In [None]:
# Project the features onto the two principal components, keeping the original
# row index so the scores can be lined up with `unmissed`.
pc_scores = pca.transform(unmissed.drop("survived",axis=1))
pc_data = pd.DataFrame(pc_scores, index=unmissed.index, columns=["pc1", "pc2"])
pc_data

In [None]:
# Logistic regression on the two PC scores alone; training accuracy in %.
pca_only_labels = unmissed["survived"]
new_model_pca_only = LogisticRegression(max_iter=500).fit(pc_data, pca_only_labels)
new_model_pca_only.score(pc_data, pca_only_labels)*100

In [None]:
# Original features plus the two PC scores as extra columns; training accuracy in %.
original_inputs = unmissed.drop("survived",axis=1)
pca_with_unmissed = pd.concat([original_inputs, pc_data], axis=1)
pca_incl_labels = unmissed["survived"]
new_model_pca_incl = LogisticRegression(max_iter=500).fit(pca_with_unmissed, pca_incl_labels)
new_model_pca_incl.score(pca_with_unmissed, pca_incl_labels)*100

In [None]:
# Engineered feature: family size = siblings/spouses + parents/children + the
# passenger themself. Appended to the original features; training accuracy in %.
famsize = (unmissed["sibsp"] + unmissed["parch"] + 1).to_frame(name="famsize")
famsize_with_unmissed = pd.concat([unmissed.drop("survived",axis=1), famsize], axis=1)

famsize_labels = unmissed["survived"]
new_model_famsize = LogisticRegression(max_iter=500).fit(famsize_with_unmissed, famsize_labels)
new_model_famsize.score(famsize_with_unmissed, famsize_labels)*100

In [None]:
# Scatter of the two PC scores, coloured by survival, to eyeball separability.
pc_with_label = pd.concat([pc_data, unmissed["survived"]], axis=1)
plt.figure(figsize=(15,5))
sns.scatterplot(data=pc_with_label, x="pc1", y="pc2", hue="survived")

In [None]:
# Manual PC1: centre the features with the fitted mean, then project onto the
# first component (to compare against pc_data["pc1"]).
centered_no_target = unmissed.drop("survived", axis=1).values - pca.mean_
centered_no_target @ pca.components_[0].reshape(-1,1)  #!= pc_data["pc1"].values

In [None]:
# PC1 scores as produced by pca.transform, for comparison with the manual projection.
pc_data["pc1"].values

In [None]:
# Inspect the fitted PCA: first component (as a row vector), singular values
# and the per-feature means used for centring.
# A notebook only displays a cell's LAST bare expression, so the first line was
# previously evaluated and silently discarded; print it explicitly.
print(pca.components_[0].reshape(1,-1))
print(pca.singular_values_)
print(pca.mean_)

In [None]:
# PC1 scores from the PCA that was fitted on the frame including 'survived'.
pca_2.transform(unmissed)[:,0]

In [None]:
# Rebuild pca_2's first PC score by hand, split into the contribution of the
# feature columns (a @ b) and the contribution of 'survived', which is column 0
# of the frame pca_2 was fitted on.
a = unmissed.drop("survived", axis=1).values - pca_2.mean_[1:]
b = pca_2.components_[0][1:].reshape(-1,1)
c = np.matmul( a, b).reshape(-1,)
# PCA projects mean-centred data, so 'survived' must be centred with
# pca_2.mean_[0] as well; without that the sum is offset by a constant and
# does not match pca_2.transform(unmissed)[:, 0].
c + (unmissed["survived"] - pca_2.mean_[0]) * pca_2.components_[0][0]


In [None]:
# Shape check for the 'survived' term before adding it to the feature term.
(unmissed["survived"] * pca_2.components_[0][0]).shape

In [None]:
# Shape check for the feature term `c` after flattening to 1-D.
c.reshape(-1,).shape

In [None]:
# A: PC1 score from the PCA fitted WITHOUT 'survived', computed by hand as
# (features - mean) projected onto the first component.
centered_for_A = unmissed.drop("survived", axis=1).values - pca.mean_
A = np.matmul(centered_for_A, pca.components_[0].reshape(-1,1)).reshape(-1,)
A

In [None]:
# B: feature-only part of PC1 from the PCA fitted WITH 'survived' (the entries
# for column 0, i.e. 'survived', are sliced away from the mean and component).
p = unmissed.drop("survived", axis=1).values - pca_2.mean_[1:]
q = pca_2.components_[0][1:].reshape(-1,1)
# Multiply the operands built in THIS cell. The earlier version used `a` and
# `b` left over from a previous cell, which made the result depend on hidden
# notebook state and left p/q unused.
B = np.matmul( p, q).reshape(-1,)
B

In [None]:
# Gap between the two PC1 variants, rescaled by pca_2's PC1 loading on
# 'survived'; report its range.
survived_loading_pc1 = pca_2.components_[0][0]
C = (A - B) / survived_loading_pc1
C.min(), C.max()

In [None]:
# A2: PC2 score from the PCA fitted WITHOUT 'survived', computed by hand.
centered_for_A2 = unmissed.drop("survived", axis=1).values - pca.mean_
A2 = np.matmul(centered_for_A2, pca.components_[1].reshape(-1,1)).reshape(-1,)
A2

In [None]:
# B2: feature-only part of PC2 from the PCA fitted WITH 'survived'.
p2 = unmissed.drop("survived", axis=1).values - pca_2.mean_[1:]
q2 = pca_2.components_[1][1:].reshape(-1,1)
# Copy-paste fix: this used to multiply `a` and `b` (built from component 0),
# so B2 was just a duplicate of the PC1 quantity. Use the PC2 operands.
B2 = np.matmul( p2, q2).reshape(-1,)
B2

In [None]:
# PC2 counterpart of C: gap between the two PC2 variants, rescaled by pca_2's
# PC2 loading on 'survived'.
# Copy-paste fix: the numerator previously reused the PC1 quantities (A - B)
# while dividing by the PC2 loading.
C2 = (A2-B2)/pca_2.components_[1][0]
C2.min(), C2.max()

In [None]:
# Combine the PC1 and PC2 gap terms by plain addition (a Euclidean combination,
# sqrt(C**2 + C2**2), was also considered), then print the range and plot.
D = np.add(C, C2)
d_low, d_high = D.min(), D.max()
print(d_low, d_high)
plt.plot(D)

In [None]:
# PC scores side by side with the survival label (joined on the shared row index).
pd.concat([pc_data, unmissed["survived"]], axis=1)

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV

In [None]:
# Linear SVM on the PC scores, limited to the first `nsvm` rows to keep the
# fit fast; reports training accuracy in %.
nsvm = 2000
xsvm = pc_data.iloc[:nsvm]
ysvm = unmissed["survived"].iloc[:nsvm]
svm_classifier = LinearSVC(C=1, max_iter=30000)
svm_classifier.fit(xsvm, ysvm)
svm_classifier.score(xsvm, ysvm)*100

In [None]:
# Cross-validated grid search over C and gamma for a kernel SVM on the same
# subset; the final line is the refit model's score on the training subset.
svc_param_grid = {"C":[0.01,0.1,1,10],"gamma":[0.01,0.1,1,10]}
grid = GridSearchCV(SVC(), param_grid=svc_param_grid, verbose=3)
grid.fit(xsvm,ysvm)
grid.score(xsvm,ysvm)

In [None]:
# Score the tuned SVM on the NEXT `nsvm` rows, which were not used for fitting.
holdout_rows = slice(nsvm, 2*nsvm)
grid.score(pc_data.iloc[holdout_rows], unmissed["survived"].iloc[holdout_rows])*100

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Rescale every column of the complete-case frame to [0, 1] for the
# distance-based models below.
# NOTE(review): the scaler is fitted before the train/test split, so held-out
# rows influence the scaling - confirm this is acceptable for the scores below.
scaler = MinMaxScaler()
scaled_unmissed = scaler.fit_transform(unmissed)

In [None]:
# Column 0 of the scaled array is 'survived' (the target); the remaining
# columns are the features. Hold out 25% for testing.
ysplit = scaled_unmissed[:,0]
Xsplit = scaled_unmissed[:,1:]
X_train, X_test, y_train, y_test = train_test_split(
    Xsplit, ysplit, random_state=42, test_size=0.25
)

In [None]:
# k-NN survival classifier; prints train accuracy, then test accuracy (both in %).
xknn, yknn = X_train, y_train
knn = KNeighborsClassifier(n_neighbors=500)
knn.fit(xknn, yknn)
for eval_features, eval_labels in ((xknn, yknn), (X_test, y_test)):
    print(knn.score(eval_features, eval_labels)*100)

In [None]:
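# Results of the k-NN cell above for different n_neighbors values:
# n_neighbors = train accuracy (%), test accuracy (%)
# 10  = 79.04, 75.28
# 20  = 78.11, 76.55
# 25  = 77.93, 76.66
# 30  = 77.80, 76.73
# 40  = 77.65, 76.79
# 70  = 77.39, 76.83
# 100 = 77.27, 76.69
# 200 = 77.16, 76.73
# 500 = 76.99, 76.58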

**AGE Filler using KNN**

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Wrap the scaled array back into a DataFrame so columns can be selected by name.
df_scaled_unmissed = pd.DataFrame(
    data=scaled_unmissed,
    index=unmissed.index,
    columns=unmissed.columns,
)

In [None]:
# k-NN regressor for (scaled) age from all other scaled columns; prints
# train R^2 then test R^2, both in %.
age_knn_inputs = df_scaled_unmissed.drop("age", axis=1)
age_knn_target = df_scaled_unmissed["age"]
X_train, X_test, y_train, y_test = train_test_split(age_knn_inputs, age_knn_target, test_size=0.25, random_state=42)
age_knn = KNeighborsRegressor(n_neighbors=10)
age_knn.fit(X_train, y_train)
for eval_features, eval_target in ((X_train, y_train), (X_test, y_test)):
    print(age_knn.score(eval_features, eval_target)*100)

In [None]:
# k-NN regressor for (scaled) fare from all other scaled columns; prints
# train R^2 then test R^2, both in %.
fare_knn_inputs = df_scaled_unmissed.drop("fare", axis=1)
fare_knn_target = df_scaled_unmissed["fare"]
X_train, X_test, y_train, y_test = train_test_split(fare_knn_inputs, fare_knn_target, test_size=0.25, random_state=42)
fare_knn = KNeighborsRegressor(n_neighbors=50)
fare_knn.fit(X_train, y_train)
for eval_features, eval_target in ((X_train, y_train), (X_test, y_test)):
    print(fare_knn.score(eval_features, eval_target)*100)

In [None]:
# k-NN classifier for the embarked label `df` from the scaled columns, with the
# Q/S dummies removed so the answer is not leaked; prints train then test
# accuracy in %.
embark_knn_inputs = df_scaled_unmissed.drop(["Q","S"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(embark_knn_inputs, df, test_size=0.25, random_state=42)
embark_knn = KNeighborsClassifier(n_neighbors=50)
embark_knn.fit(X_train, y_train)
for eval_features, eval_labels in ((X_train, y_train), (X_test, y_test)):
    print(embark_knn.score(eval_features, eval_labels)*100)

In [None]:
# Predict the port for training rows whose 'Embarked' value is missing.
raw = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
raw = raw[raw["Embarked"].isna()].drop(["PassengerId", "Name", "Ticket", "Cabin"],axis=1)
raw["Sex"] = raw["Sex"].map({"female":0, "male":1})

# Only rows with both Age and Fare present can be fed to the classifier.
embark_rows = raw.drop("Embarked", axis=1).dropna(subset=["Age", "Fare"])

# embark_knn was fitted on MinMax-scaled columns carrying the notebook's short
# lowercase names. Feeding it the raw, unscaled frame (as before) makes the
# neighbour distances meaningless and trips scikit-learn's feature-name check.
# Rename to the training names (same column order) and apply the fitted
# scaler's per-column transform, x * scale_ + min_, for just these columns.
embark_rows.columns = ["survived", "class", "sex", "age", "sibsp", "parch", "fare"]
col_scale = pd.Series(scaler.scale_, index=unmissed.columns)[embark_rows.columns]
col_offset = pd.Series(scaler.min_, index=unmissed.columns)[embark_rows.columns]
embark_rows_scaled = embark_rows * col_scale + col_offset

embark_knn.predict(embark_rows_scaled)


In [None]:
# The missing-'Embarked' rows that still have both Age and Fare, without the
# (empty) 'Embarked' column itself.
raw.dropna(subset=["Age", "Fare"]).drop("Embarked",axis=1)

In [None]:
# predict() needs a feature matrix; calling it with no argument raises a
# TypeError and stops a top-to-bottom run. Use a few held-out rows from the
# embarked split as a quick sanity check of the classifier's output.
embark_knn.predict(X_test.head())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, PowerTransformer, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Fresh start for the submission pipeline: reload both files, set the test ids
# aside, drop unused columns and apply the short column names.
X = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")

test_passenger_ids = test.pop("PassengerId")
X = X.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
test = test.drop(columns=["Name", "Ticket", "Cabin"])

feature_names = ["class", "sex", "age", "sibsp", "parch", "fare", "embarked"]
X.columns = ["survived"] + feature_names
test.columns = feature_names

# Keep training rows that have at least 7 non-missing values.
X = X.dropna(axis=0, thresh=7)

# Row labels with / without a known port of embarkation.
embarked_is_missing = X["embarked"].isna()
miss_embark = X.index[embarked_is_missing]
unmissed_embark = X.index[~embarked_is_missing]

In [None]:
# Mean / median / mode filling for rapid testing.

from sklearn.metrics import roc_auc_score

# Test-set gaps are filled with statistics computed on the TRAINING data (never
# on test). Assigning the filled column back, instead of calling
# fillna(inplace=True) on a column selection, avoids operating on a temporary
# copy under newer pandas.
test["age"] = test["age"].fillna(X["age"].median())      # train median
test["fare"] = test["fare"].fillna(X["fare"].mean())     # train mean
# Series.mode() returns a Series (ties are possible). Passing that Series to
# fillna aligns it on the index, so only rows whose label matches would be
# filled; take the first modal value to fill every gap.
test["embarked"] = test["embarked"].fillna(X["embarked"].mode().iloc[0])    # train mode

# Training rows that still contain any missing value are dropped, not imputed.
X.dropna(inplace=True)

# Log-transform fare; non-positive fares are mapped to 0.
X['fare'] = X['fare'].map(lambda i: np.log(i) if i > 0 else 0)
test['fare'] = test['fare'].map(lambda i: np.log(i) if i > 0 else 0)
# boxcoxed = PowerTransformer(method="yeo-johnson")
# boxcoxed.fit(X[["fare"]].dropna())
# X["fare"] = boxcoxed.transform(X[["fare"]])
# test["fare"] = boxcoxed.transform(test[["fare"]])

# X["logage"] = np.log(X["age"])
# test["logage"] = np.log(test["age"])

# Family size = siblings/spouses + parents/children + the passenger themself.
X["famsize"] = X["sibsp"]+X["parch"]+1
test["famsize"] = test["sibsp"]+test["parch"]+1

In [None]:
# Apply identical encodings to train and test: binary sex, then one-hot
# 'embarked' (first level dropped) in place of the original column.
sex_codes = {"female":0, "male":1}
for frame in (X, test):
    frame["sex"] = frame["sex"].map(sex_codes)

train_port_dummies = pd.get_dummies(X["embarked"], drop_first=True)
test_port_dummies = pd.get_dummies(test["embarked"], drop_first=True)

X = pd.concat([X.drop("embarked", axis=1), train_port_dummies], axis=1)
test = pd.concat([test.drop("embarked", axis=1), test_port_dummies], axis=1)

In [None]:
# Final logistic regression on the cleaned training frame; training accuracy in %.
cleaned_inputs = X.drop(["survived"],axis=1)
cleaned_labels = X["survived"]
new_model_cleaned = LogisticRegression(max_iter=1000)
new_model_cleaned.fit(cleaned_inputs, cleaned_labels)
new_model_cleaned.score(cleaned_inputs, cleaned_labels)*100

In [None]:
# Training ROC-AUC (in %), using the predicted probability of the positive class.
survival_proba = new_model_cleaned.predict_proba(X.drop(["survived"],axis=1))[:,1]
train_auc = roc_auc_score(X["survived"], survival_proba)
print(train_auc*100)

In [None]:
# Without log(fare):
#     acc = 76.679 , auc = 82.72
# With log(fare):
#     acc = 76.666 , auc = 82.894
# With log(fare)+log(age):
#     acc = 76.609 , auc = 82.91

# X.dropna():
#     acc = 76.659 , auc = 82.897

# 76.62, 82.90
# 76.65, 82.58
# 76.67, 82.58

In [None]:
# Predict survival for the test set.
# Select exactly the training feature columns, in training order. This keeps
# the cell re-runnable: once 'survived' has been written into `test`, passing
# the whole frame back to predict() would feed the model an extra column. It
# also fails loudly with a KeyError if a training column is absent from test.
submission_features = X.columns.drop("survived")
test["survived"] = new_model_cleaned.predict(test[submission_features])
test.head()

In [None]:
# Assemble the two-column submission (id, prediction) and write it to disk
# without the index column.
result = pd.DataFrame({
    "PassengerId": pd.Series(test_passenger_ids),
    "Survived": test["survived"],
})
result.to_csv("./subm_apr_22_num_1.csv", index=False)

In [None]:
# --- KNN-based imputation of 'embarked' --------------------------------------
# By this point the cells above have replaced 'embarked' with dummy columns,
# already encoded 'sex' numerically (mapping it again would turn every value
# into NaN) and dropped incomplete rows, so the statements below cannot run on
# those frames. Rebuild X / test from the raw files so this experiment works in
# a top-to-bottom run.
X = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
X = X.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
X.columns = ["survived", "class", "sex", "age", "sibsp", "parch", "fare", "embarked"]
X = X.dropna(axis=0, thresh=7)   # keep rows with at least 7 non-missing values

test = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")
test = test.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
test.columns = ["class", "sex", "age", "sibsp", "parch", "fare", "embarked"]

# Row labels with / without a known port of embarkation.
miss_embark = X.index[X["embarked"].isna()]
unmissed_embark = X.index[X["embarked"].notna()]

# Integer-code the known ports (missing ones stay NaN) and binary-encode sex.
X.loc[unmissed_embark,"embarked"] = LabelEncoder().fit_transform(X.loc[unmissed_embark,"embarked"])
X["sex"] = X["sex"].map({"female":0, "male":1})
test["sex"] = test["sex"].map({"female":0, "male":1})

In [None]:
# Training data for the embarked classifier: complete rows only, features
# scaled to [0, 1], target = the integer-coded 'embarked' of those same rows.
unmissed = X.dropna().drop("embarked",axis=1)
scaler = MinMaxScaler()
scaled_unmissed = scaler.fit_transform(unmissed)
df_scaled_unmissed = pd.DataFrame(scaled_unmissed, index=unmissed.index, columns=unmissed.columns)
embarked_codes = X.loc[unmissed.index,"embarked"]
X_train, X_test, y_train, y_test = train_test_split(df_scaled_unmissed, embarked_codes, test_size=0.25, random_state=42)

In [None]:
# Fit the embarked classifier; the labels are cast to int before fitting.
# Accuracy is computed by hand in the following cells.
embarked_train_codes = y_train.astype("int")
embark_knn = KNeighborsClassifier(n_neighbors=50).fit(X_train, embarked_train_codes)

In [None]:
# Predicted embarked codes for the training and held-out parts of the split.
pred_train, pred_test = (embark_knn.predict(part) for part in (X_train, X_test))

In [None]:
# Accuracy in %: share of exact matches, for train first and then test.
for predicted, actual in ((pred_train, y_train), (pred_test, y_test)):
    n_correct = (predicted == actual).sum()
    print(n_correct / actual.shape[0]*100)

In [None]:
# Fill the missing 'embarked' codes with the classifier's predictions.
# embark_knn was fitted on MinMax-scaled features, so the rows to fill must go
# through the same fitted scaler first; predicting on raw values would compare
# unscaled ages/fares against [0, 1] training points.
rows_to_fill = X.loc[miss_embark].drop("embarked",axis=1)
if len(rows_to_fill):  # nothing to do (and predict would fail) on an empty frame
    rows_to_fill_scaled = pd.DataFrame(
        scaler.transform(rows_to_fill),
        columns=rows_to_fill.columns,
        index=rows_to_fill.index,
    )
    X.loc[miss_embark,"embarked"] = embark_knn.predict(rows_to_fill_scaled)

In [None]:
# Among test rows whose 'embarked' is missing, count missing values per column.
# Requires `test` to still carry an 'embarked' column at this point.
test[test["embarked"].isna()].isna().sum()

In [None]:
# Keep only the test rows that have at least 6 non-missing values.
test_fillable = test.dropna(axis=0, thresh=6)

In [None]:
# How many test rows (and columns) survive the threshold above.
test_fillable.shape

In [None]:
# Demonstrating why drop_first=True is needed when encoding categoricals. Otherwise, strong multicollinearity is obtained
# Even Logistic model can predict it to 99.75% accuracy if drop_first is not enabled

# chumma = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
# chumma.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1, inplace=True)
# chumma.columns = ["survived", "class", "sex", "age", "sibsp", "parch", "fare", "embarked"]
# chumma.head(1)

In [None]:
# da = pd.get_dummies(chumma["embarked"])
# print(da.head(), end="\n")

# genfit = LogisticRegression().fit(da[["C","Q"]], da["S"])
# print(genfit.score(da[["C","Q"]], da["S"])*100)
# genfit = LogisticRegression().fit(da[["C","S"]], da["Q"])
# print(genfit.score(da[["C","S"]], da["Q"])*100)
# genfit = LogisticRegression().fit(da[["S","Q"]], da["C"])
# print(genfit.score(da[["S","Q"]], da["C"])*100)