In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
cf.go_offline()
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans

In [None]:
# Read the competition train/test files and discard the columns that are not modelled.
dropped_cols = ["Name", "Ticket", "Cabin"]
train = pd.read_csv("../input/tabular-playground-series-apr-2021/train.csv").drop(columns=dropped_cols)
test = pd.read_csv("../input/tabular-playground-series-apr-2021/test.csv").drop(columns=dropped_cols)

# pop() removes the id column from each frame and keeps it for the submission files.
passid_train = train.pop("PassengerId")
passid_test = test.pop("PassengerId")

# Train rows stacked on top of test rows; the original row labels are kept as-is.
joined = pd.concat([train, test])

In [None]:
# Short lowercase headers, assigned by position.
# Only the frames that carry the target get the leading "survived" name.
feature_names = ["class","sex","age","sibsp","parch","fare","embarked"]
test.columns = feature_names
for labelled_frame in (joined, train):
    labelled_frame.columns = ["survived"] + feature_names

In [None]:
# One encoder fitted on the combined rows, so all three frames share the same integer codes.
encoder_sex = LabelEncoder().fit(joined["sex"])
for frame in (joined, train, test):
    frame["sex"] = encoder_sex.transform(frame["sex"])

In [None]:
# Encode only the known "embarked" values; missing entries stay NaN for later imputation.
encoder_embarked = LabelEncoder().fit(joined["embarked"].dropna())
for frame in (joined, train, test):
    known = frame["embarked"].notna()
    frame.loc[known, "embarked"] = encoder_embarked.transform(frame["embarked"].dropna())

In [None]:
# Replace fare by its natural log in every frame (NaN stays NaN).
for frame in (train, test, joined):
    frame["fare"] = np.log(frame["fare"])

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
for frame in (train, test):
    frame["famsize"] = frame["sibsp"].add(frame["parch"]).add(1)

In [None]:
# Complete-case training rows (every row with any missing value removed).
# .copy() makes this an independent frame: later cells add and overwrite
# columns on it, which on a bare dropna() result raises SettingWithCopyWarning.
unmissed = train.dropna().copy()
unmissed

In [None]:
# Interactive histogram of the log-transformed fare over the complete-case rows.
# sns.displot(data=unmissed, x="fare")
unmissed["fare"].iplot(kind="hist")

In [None]:
# Fare buckets
# 1 : < 2.9
# 2 : < 3.8
# 3 : < 4.6
# 4 : >=4.6

def fare_bucket(x):
    """Map a log-fare value to an ordinal bucket 1-4.

    Buckets are half-open on the right: values below 2.9 map to 1, below 3.8
    to 2, below 4.6 to 3, and everything else (including NaN, which fails
    every comparison) to 4.
    """
    for bucket, upper_bound in enumerate((2.9, 3.8, 4.6), start=1):
        if x < upper_bound:
            return bucket
    return 4

In [None]:
# Same histogram over train and test rows combined, to compare with the complete-case one.
joined["fare"].iplot(kind="hist")

In [None]:
# Interactive histogram of age over the complete-case rows.
unmissed["age"].iplot(kind="hist")

In [None]:
# Add the bucketed log-fare as an extra feature.
# assign() returns a new frame instead of writing a column into the result of
# dropna(), which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
unmissed = unmissed.assign(fare_bucket=unmissed["fare"].apply(fare_bucket))
unmissed.head()

In [None]:
xx = unmissed.drop(["survived", "fare_bucket"],axis=1)
yy = unmissed["survived"]
baseline = LogisticRegression(max_iter=1000)
baseline.fit(xx,yy)
print("acc =",baseline.score(xx,yy)*100)
print("roc =", roc_auc_score(yy, baseline.predict_proba(xx)[:,1] )*100)

In [None]:
xx = unmissed.drop(["survived"],axis=1)
yy = unmissed["survived"]
baseline = LogisticRegression(max_iter=1000)
baseline.fit(xx,yy)
print("acc =",baseline.score(xx,yy)*100)
print("roc =", roc_auc_score(yy, baseline.predict_proba(xx)[:,1] )*100)

In [None]:
xx = unmissed.drop(["survived", "fare"],axis=1)
yy = unmissed["survived"]
baseline = LogisticRegression(max_iter=1000)
baseline.fit(xx,yy)
print("acc =",baseline.score(xx,yy)*100)
print("roc =", roc_auc_score(yy, baseline.predict_proba(xx)[:,1] )*100)

In [None]:
xx = unmissed.drop(["survived", "age"],axis=1)
yy = unmissed["survived"]
baseline = LogisticRegression(max_iter=1000)
baseline.fit(xx,yy)
print("acc =",baseline.score(xx,yy)*100)
print("roc =", roc_auc_score(yy, baseline.predict_proba(xx)[:,1] )*100)

In [None]:
xx = unmissed.drop(["survived", "embarked"],axis=1)
yy = unmissed["survived"]
baseline = LogisticRegression(max_iter=1000)
baseline.fit(xx,yy)
print("acc =",baseline.score(xx,yy)*100)
print("roc =", roc_auc_score(yy, baseline.predict_proba(xx)[:,1] )*100)

In [None]:
# Depth-limited decision tree on all features, scored on the rows it was fitted on.
yy = unmissed["survived"]
xx = unmissed.drop(columns="survived")

dtree = DecisionTreeClassifier(max_depth=10).fit(xx, yy)

print("acc =",dtree.score(xx,yy)*100)
dtree_pos_proba = dtree.predict_proba(xx)[:, 1]
print("roc =", roc_auc_score(yy, dtree_pos_proba)*100)

In [None]:
# Random forest evaluated on a 75/25 hold-out split of the complete-case rows.
xx = unmissed.drop(["survived"],axis=1)
yy = unmissed["survived"]
X_train, X_test, y_train, y_test = train_test_split(xx,yy, test_size=0.25, random_state=2021)

# Seeded like the split so the scores (and the submission built from `rf`
# further down) are reproducible across kernel restarts.
rf = RandomForestClassifier(n_estimators=200, max_depth=10, max_features=4, random_state=2021)
rf.fit(X_train,y_train)

# Label each number explicitly and report ROC AUC on the hold-out set as well.
print("train acc =",rf.score(X_train,y_train)*100)
print("test acc =",rf.score(X_test,y_test)*100)
print("train roc =", roc_auc_score(y_train, rf.predict_proba(X_train)[:,1] )*100)
print("test roc =", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1] )*100)

In [None]:
# rf_params = {"n_estimators" : [100, 150, 200, 250, 300, 400, 500],
#              "max_depth"    : [5,8,10,15],
#              "max_features" : [2, 3, 4, 5],
#              "random_state" : [2021]
#             }

In [None]:
# rf_grid = GridSearchCV(RandomForestClassifier(), param_grid=rf_params, return_train_score=True, verbose=3)
# rf_grid.fit(X_train, y_train)

In [None]:
# print("best score = ",rf_grid.best_score_*100)
# print("best params =", rf_grid.best_params_)

In [None]:
# print("test acc = ",rf_grid.score(X_test, y_test)*100)
# print("train roc =", roc_auc_score(y_train, rf.predict_proba(X_train)[:,1] )*100)
# print("test roc =", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1] )*100)

In [None]:
# Seeded random forest refit on every complete-case row (no hold-out);
# the printed numbers are therefore training scores.
yy = unmissed["survived"]
xx = unmissed.drop(columns="survived")
rf_full_set = RandomForestClassifier(
    n_estimators=100, max_depth=8, max_features=2, random_state=2021
).fit(xx, yy)
print("acc =",rf_full_set.score(xx,yy)*100)
full_set_pos_proba = rf_full_set.predict_proba(xx)[:, 1]
print("roc =", roc_auc_score(yy, full_set_pos_proba)*100)

In [None]:
# Impute the test set: median age and mean (log) fare from train+test combined.
test["age"] = test["age"].fillna(joined["age"].median())
test["fare"] = test["fare"].fillna(joined["fare"].mean())
# Series.mode() returns a Series, and assigning a Series through .loc aligns on
# index labels, which leaves the missing rows unfilled. Take the scalar mode instead.
test.loc[test["embarked"].isna(), "embarked"] = train["embarked"].mode().iloc[0]
# No missing values remain, so the column can carry the same integer dtype that
# the training frame's "embarked" is cast to before the XGBoost fit.
test["embarked"] = test["embarked"].astype("int")
test["fare_bucket"] = test["fare"].apply(fare_bucket)

In [None]:
# Fallback for test rows whose "embarked" is still missing after the previous cell.
# NOTE(review): 2 is a hard-coded encoded label — presumably the most frequent
# port; confirm against encoder_embarked.classes_.
test.loc[test["embarked"].isna(), "embarked"] = 2

In [None]:
# embdata = joined.drop("survived",axis=1).dropna()
# knn_emb = KNeighborsClassifier(n_neighbors=50)
# knn_emb.fit(embdata.drop("embarked",axis=1), embdata["embarked"].astype("int"))
# knn_emb.score(embdata.drop("embarked",axis=1), embdata["embarked"].astype("int"))*100

In [None]:
# knn_emb.predict
# test.loc[test["embarked"].isna(), "embarked"] = knn_emb.predict( test[test["embarked"].isna()].drop(["embarked","famsize"], axis=1) )

In [None]:
# Build the submission frame from the random forest's predictions on the test set.
# NOTE(review): this uses `rf` (fitted on the 75% training split), not
# `rf_full_set` (fitted on all complete-case rows) — confirm which was intended.
test_predicitons = rf.predict(test)
result = pd.DataFrame({"PassengerId": passid_test.values, "Survived": test_predicitons})
result

In [None]:
# Write the PassengerId/Survived frame to disk without the row index.
result.to_csv("./subm_apr_24_num_3.csv", index=False)

Do I need to impute "age", "fare" and "embarked"?
We can answer this by plotting where "age" is missing against the other features, and then doing the same for "fare" and "embarked".

# **Stronger Ensemble Models**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Gradient boosting on a 75/25 split.
# Prints train accuracy, test accuracy, then train ROC AUC (all in percent).
yy = unmissed["survived"]
xx = unmissed.drop(columns="survived")
X_train, X_test, y_train, y_test = train_test_split(xx,yy, test_size=0.25, random_state=2021)
gbcl = GradientBoostingClassifier(n_estimators=300).fit(X_train, y_train)
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(gbcl.score(split_features, split_labels)*100)
print(roc_auc_score(y_train, gbcl.predict_proba(X_train)[:,1])*100)

In [None]:
# Give "embarked" an integer dtype before the XGBoost fit in the next cell.
# astype() with a column mapping returns a new frame, so nothing is written into
# a possible view of `train` (no SettingWithCopyWarning) and re-running is harmless.
unmissed = unmissed.astype({"embarked": "int"})

In [None]:
# XGBoost with default settings on a fresh 75/25 split.
# NOTE(review): this split is seeded with 2221 while the earlier splits use 2021 —
# confirm whether the different seed is intentional.
# Prints train accuracy, test accuracy, then train ROC AUC (all in percent).
yy = unmissed["survived"]
xx = unmissed.drop(columns="survived")
X_train, X_test, y_train, y_test = train_test_split(xx,yy, test_size=0.25, random_state=2221)
xgbcl = XGBClassifier()
xgbcl.fit(X_train, y_train)
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(xgbcl.score(split_features, split_labels)*100)
print(roc_auc_score(y_train, xgbcl.predict_proba(X_train)[:,1])*100)

In [None]:
# Soft-voting ensemble of the four model families tried above, equally weighted.
# VotingClassifier refits clones of the listed estimators, so only their
# hyperparameters are reused here, not their earlier fits.
voters = [('XGBoost', xgbcl),
          ('Random Forest', rf),
          ('Gradient boosting', gbcl),
          ('Logistic Regression', baseline)]

# Evaluate a copy fitted on the training split only. Scoring an ensemble that
# was fitted on all of `unmissed` against X_test would report training accuracy,
# because X_test is a subset of those rows.
holdout_ensemble = VotingClassifier(estimators=voters, voting='soft', weights=[1,1,1,1]).fit(X_train, y_train)
print('Train accuracy for Ensemble is:',holdout_ensemble.score(X_train,y_train)*100)
print('Test accuracy for Ensemble is:',holdout_ensemble.score(X_test,y_test)*100)
# print('Train ROC for Ensemble is:',roc_auc_score(y_train, holdout_ensemble.predict_proba(X_train)[:,1])*100)

# Final model for the submission: refit on every complete-case row.
ensemble = VotingClassifier(estimators=voters, voting='soft', weights=[1,1,1,1]).fit(unmissed.drop("survived",axis=1),unmissed["survived"])

In [None]:
# Predict the test set with the voting ensemble and write the submission file.
test_predicitons = ensemble.predict(test)
result = pd.DataFrame(
    {"PassengerId": passid_test.values, "Survived": test_predicitons}
)
result.to_csv("./subm_apr_24_num_5.csv", index=False)

### **TIME TO TRY REMOVING OUTLIERS AND FOCUSSING ON CORE CLEAN DATA TO FIT MORE THAN 80% ACCURACY**


In [None]:
# Point should be that the % of data points being dropped 
#(which can be construed as resulting in a direct additional error rate)
# should be overcome by the % gain in accuracy from the improved clean core dataset

In [None]:
# Reference point before trying KNN: 5-fold cross-validated accuracy of
# logistic regression on the complete-case rows, shown in percent.

crsval = cross_val_score(LogisticRegression(max_iter=2000), unmissed.drop("survived", axis=1), unmissed["survived"], cv=5) 
crsval*100

In [None]:
# Per-fold accuracy (in percent) from the latest cross-validation, plotted against fold position.
plt.plot(crsval*100)
crsval*100

In [None]:
# 5-fold CV accuracy of KNN on the unscaled features for odd k in [51, 69].
# Each line printed: k, mean accuracy (%), per-fold accuracies (%).
knn_features = unmissed.drop("survived", axis=1)
knn_target = unmissed["survived"]
for n_neigh in range(51,71, 2):
    crsval = cross_val_score(KNeighborsClassifier(n_neighbors=n_neigh), knn_features, knn_target, cv=5)
    fold_pct = crsval*100
    print(f"n={n_neigh}, ",np.mean(fold_pct),",", fold_pct)

In [None]:
# KNN
# Recorded output pasted from earlier runs, kept as comments so this cell is
# valid Python under "Run All". Presumably produced by the cross-validation loop
# on the unscaled features. Columns: k, mean accuracy (%), per-fold accuracy (%).
#
# n=1,   68.46738558321304 , [68.48497431 68.26179478 68.64943424 68.16671857 68.77400602]
# n=2,   69.28953996419733 , [69.28945866 69.32579021 69.33457905 69.29305512 69.20481678]
# n=3,   71.97504102001923 , [72.27902631 72.01951523 71.90387211 71.78449081 71.88830063]
# n=4,   72.0705485871366 , [72.11812944 71.93647169 72.04920585 72.18415862 72.06477733]
# n=5,   73.51036040811661 , [73.61291327 73.36897286 73.44544794 73.27416174 73.85030624]
# n=6,   73.44703948880417 , [73.23921731 73.44682618 73.45582892 73.48697187 73.60635316]
# n=7,   74.27230929587226 , [74.28245186 74.16307676 74.5094986  73.9281636  74.47835565]
# n=8,   74.18407412742086 , [74.05408211 74.08522344 74.49911762 74.17730717 74.1046403 ]
# n=9,   74.8297566628736 , [74.80666424 74.73400114 75.19464341 74.49392713 74.91954739]
# n=10,  74.68027655599954 , [74.59905538 74.36030519 75.10121457 74.51987958 74.82092806]
# n=11,  75.01972577860343 , [74.95718067 74.85856646 75.42821551 74.69635628 75.15830998]
# n=12,  74.94186943700902 , [74.87932735 74.83780557 75.29326274 74.71711824 74.98183328]
# n=13,  75.20450057786138 , [75.31011574 75.14921887 75.38669158 74.92992837 75.24654832]
# n=14,  75.08927620309358 , [75.035034   74.988322   75.19464341 75.00778574 75.22059587]
# n=15,  75.2885863168743 , [75.34125707 75.13883843 75.41264404 75.00778574 75.54240631]
# n=16,  75.16920701694903 , [75.21150153 75.07655577 75.22578636 74.97664279 75.35554863]
# n=17,  75.33945016549684 , [75.52810505 75.23745264 75.4905014  75.03892868 75.40226305]
# n=18,  75.27301398173937 , [75.35163751 75.21150153 75.23616734 75.18426243 75.38150109]
# n=19,  75.38201192246655 , [75.45544195 75.33606685 75.36592962 75.2621198  75.4905014 ]
# n=20,  75.2117674269288 , [75.2997353  75.14402865 75.24654832 75.05969065 75.30883422]
# n=21,  75.36644163475634 , [75.50734416 75.16997976 75.433406   75.35035814 75.37112011]
# n=22,  75.2968914796642 , [75.28935486 75.15440909 75.45416796 75.32440569 75.2621198 ]
# n=23,  75.41419409826489 , [75.39834951 75.28416463 75.40226305 75.41264404 75.57354926]
# n=24,  75.3757856500647 , [75.40872995 75.15959931 75.44897747 75.32959618 75.53202533]
# n=25,  75.42146073181391 , [75.4917735  75.19593087 75.35035814 75.39188207 75.67735908]
# n=26,  75.40381408605674 , [75.34644729 75.24264286 75.38669158 75.36592962 75.67735908]
# n=27,  75.47440352470412 , [75.56962682 75.13883843 75.46454895 75.61507319 75.58393024]
# n=28,  75.3944718487751 , [75.36720818 75.15959931 75.29845323 75.57873975 75.56835877]
# n=29,  75.56056508296803 , [75.45544195 75.31011574 75.56835877 75.64102564 75.82788332]
# n=30,  75.4609094334021 , [75.4450615  75.14921887 75.41783453 75.48012042 75.81231184]
# n=31,  75.57405998246402 , [75.54886594 75.2530233  75.48531091 75.64621613 75.93688363]
# n=33,  75.53876534395974 , [75.48658328 75.24783308 75.53721582 75.47492993 75.94726461]
# n=35,  75.5699089366376 , [75.45025173 75.22188197 75.63064466 75.64621613 75.90055019]
# n=37,  75.53668947082913 , [75.4191104  75.28416463 75.53721582 75.57354926 75.86940725]
# n=39,  75.55537383763321 , [75.42949084 75.40872995 75.44378698 75.6669781  75.82788332]
# n=41,  75.55641236687408 , [75.53329527 75.26340375 75.48531091 75.65659711 75.84345479]
# n=43,  75.50969875588399 , [75.42949084 75.28935486 75.38150109 75.67216859 75.77597841]
# n=45,  75.46298595308785 , [75.39315929 75.16997976 75.35554863 75.67735908 75.71888301]
# n=47,  75.44845279374898 , [75.39315929 75.14921887 75.30883422 75.56316828 75.82788332]
# n=49,  75.40070092291602 , [75.38277884 75.09731666 75.2621198  75.48531091 75.77597841]
# n=51,  75.39966228591595 , [75.39315929 75.13883843 75.21021489 75.45416796 75.80193086]
# n=53,  75.35398752744432 , [75.36720818 75.01427311 75.29845323 75.34516765 75.74483546]
# n=55,  75.38409194432616 , [75.33087663 75.09212643 75.44378698 75.35035814 75.70331153]
# n=57,  75.34879816789541 , [75.27378419 74.99870244 75.42302502 75.45935846 75.58912073]
# n=59,  75.33530229856667 , [75.26340375 75.06617533 75.35554863 75.37112011 75.62026368]
# n=61,  75.29793189469096 , [75.31530596 74.90527846 75.36592962 75.30883422 75.59431122]
# n=63,  75.30312098484188 , [75.29454508 75.0609851  75.29326274 75.29326274 75.57354926]
# n=65,  75.24395105747627 , [75.23226242 74.96237089 75.23097685 75.25692931 75.53721582]
# n=67,  75.25536976056408 , [75.2270722  75.00389267 75.21021489 75.24135783 75.59431122]
# n=69,  75.2709418801806 , [75.2530233  74.9156589  75.33478667 75.32440569 75.52683484]

In [None]:
# Min-max scale every column of the complete-case frame, target included.
# The rebuilt frame keeps the column names but gets a fresh default row index.
scaled = MinMaxScaler()
scaled.fit(unmissed)
scaled_values = scaled.transform(unmissed)
scaled_unmissed = pd.DataFrame(data=scaled_values, columns=unmissed.columns)

In [None]:
# Same KNN sweep as on the raw features, now on the min-max scaled frame.
# Each line printed: k, mean accuracy (%), per-fold accuracies (%).
scaled_features = scaled_unmissed.drop("survived", axis=1)
scaled_target = scaled_unmissed["survived"]
for n_neigh in range(51,71, 2):
    crsval = cross_val_score(KNeighborsClassifier(n_neighbors=n_neigh), scaled_features, scaled_target, cv=5)
    fold_pct = crsval*100
    print(f"n={n_neigh}, ",np.mean(fold_pct),",", fold_pct)

In [None]:
# Recorded output pasted from earlier runs, kept as comments so this cell is
# valid Python under "Run All". Presumably produced by the cross-validation loop
# on the min-max scaled features. Columns: k, mean accuracy (%), per-fold accuracy (%).
#
# n=1,  68.75597020288001 , [68.79638761 68.59396896 68.65462473 68.54043393 69.19443579]
# n=3,  72.55221101366752 , [72.69943427 72.81361914 72.50596906 72.18934911 72.55268348]
# n=5,  74.22559255986553 , [74.28764208 74.38106607 74.13059275 73.9904495  74.33821239]
# n=7,  75.01660501843946 , [75.36720818 75.07136555 74.83130904 74.58735596 75.22578636]
# n=9,  75.5854750756188 , [75.5644366  75.62152904 75.54240631 75.36073913 75.8382643 ]
# n=11,  75.90623987753901 , [76.00560544 75.90699123 75.8953597  75.69293055 76.03031247]
# n=13,  76.17717801308758 , [76.22878497 76.2132143  76.19640818 75.88497872 76.36250389]
# n=15,  76.2799454249247 , [76.41563295 76.44158406 76.25869407 75.96802658 76.31578947]
# n=17,  76.39102004697642 , [76.4519645  76.58691026 76.4144088  76.19640818 76.30540849]
# n=19,  76.49898015889752 , [76.47272539 76.76856802 76.4144088  76.26388456 76.57531402]
# n=21,  76.53635131708756 , [76.41044273 76.86718223 76.57012353 76.22236064 76.61164746]
# n=23,  76.61628283137782 , [76.57652982 76.89832356 76.54936157 76.2431226  76.81407661]
# n=25,  76.71386438583986 , [76.59210048 76.85161156 76.74660023 76.54417108 76.83483858]
# n=27,  76.73462608052367 , [76.65438314 76.81528001 76.7050763  76.54417108 76.95421987]
# n=29,  76.74085590897896 , [76.63362226 76.7166658  76.74140974 76.6479809  76.96460085]
# n=31,  76.76265446263612 , [76.74261691 76.75299735 76.88674349 76.57012353 76.86079103]
# n=33,  76.7595390365523 , [76.82047023 76.78413868 76.85560054 76.52340911 76.81407661]
# n=35,  76.82286006362395 , [76.89832356 76.99174755 76.82964808 76.51821862 76.8763625 ]
# n=37,  76.87787980723668 , [76.94503555 76.89313334 76.99055331 76.67912385 76.88155299]
# n=39,  76.95365984466373 , [77.0280791  76.91908444 77.11512509 76.73102876 76.97498183]
# n=41,  76.95469864330258 , [76.99693777 76.88275289 77.19298246 76.76736219 76.93345791]
# n=43,  76.98168822677616 , [77.00731821 76.96579644 77.16183951 76.74660023 77.02688674]
# n=45,  76.98376528525792 , [76.96579644 76.92427467 77.15145853 76.82964808 77.04764871]
# n=47,  76.9858408351109 , [76.98655733 76.96579644 77.10474411 76.85560054 77.01650576]
# n=49,  76.98376485422112 , [76.91389422 77.01769866 77.1099346  76.82445759 77.0528392 ]
# n=51,  76.97857447095987 , [76.98655733 76.93465511 77.08398214 76.88674349 77.00093429]
# n=53,  76.95988891880464 , [76.95022577 76.95022577 77.10474411 76.9386484  76.85560054]
# n=55,  76.97234582785573 , [76.98655733 76.93984533 77.08917264 76.95941036 76.88674349]
# n=57,  76.95262347060675 , [77.00212799 76.77894846 77.15145853 76.90750545 76.92307692]
# n=59,  76.91732872434325 , [76.94503555 76.77894846 77.07360116 76.88155299 76.90750545]
# n=61,  76.94847064675362 , [76.93984533 76.88275289 77.06841067 76.88674349 76.96460085]
# n=63,  76.94950949927207 , [77.01769866 76.73223647 77.12550607 76.96979134 76.90231496]
# n=65,  76.97130896888238 , [77.0280791  76.77894846 77.08917264 76.9386484  77.02169625]
# n=67,  76.98895604567633 , [77.05403021 76.81008979 77.14107755 76.90750545 77.03207723]
# n=69,  76.94639439646582 , [77.05922043 76.76856802 77.07360116 76.79850514 77.03207723]


In [None]:
# Training accuracy of KNN on the scaled frame for the best-scoring k values from CV.
# The feature frame was previously read from a mistyped, undefined name;
# it is meant to be the scaled frame that yy is taken from.
xx = scaled_unmissed.drop("survived", axis=1)
yy = scaled_unmissed["survived"]
for n_neigh in [43, 45, 47, 49, 67]:
    knncls = KNeighborsClassifier(n_neighbors=n_neigh).fit(xx,yy)
    print(f"n={n_neigh}, ",knncls.score(xx, yy)*100)

In [None]:
# Final KNN (k=43) fitted on the scaled complete-case rows.
knn_opt = KNeighborsClassifier(n_neighbors=43).fit(xx,yy)
# Report the accuracy of knn_opt itself; `knncls` is the last model left over
# from the sweep loop (a different k), so scoring it here was misleading.
print("acc = ",knn_opt.score(xx, yy)*100)
print("roc =", roc_auc_score(yy, knn_opt.predict_proba(xx)[:,1])*100)

In [None]:
# Confusion matrix and per-class report for knn_opt on its own training rows.
# The prediction is computed once and shared by both reports.
knn_train_pred = knn_opt.predict(xx)
print(confusion_matrix(yy, knn_train_pred))
print(classification_report(yy, knn_train_pred))

In [None]:
# Peek at the first rows of the min-max scaled frame.
scaled_unmissed.head()

In [None]:
# The scaler was fitted on a frame whose first column is "survived", so the test
# features need a placeholder target in that position before they can be transformed.
# Work on a copy: a plain `test2 = test` aliases the frame, so the placeholder
# column would be added to `test` itself.
test2 = test.copy()
test2.insert(0, "survived", 0)

In [None]:
# Apply the fitted scaler to the test frame, then discard the placeholder target column.
scaled_test = pd.DataFrame(
    scaled.transform(test2), columns=test2.columns
).drop(columns="survived")

In [None]:
# Predict with the KNN model on the scaled test features. With the prediction
# line commented out, this cell wrote the previous model's stale predictions
# under a new file name.
# The target went through the min-max scaler with the rest of the frame, so the
# predicted labels come back as floats and are cast to int for the submission.
test_predicitons = knn_opt.predict(scaled_test).astype("int")
result = pd.DataFrame({"PassengerId": passid_test.values, "Survived": test_predicitons})
result.to_csv("./subm_apr_26_num_1.csv", index=False)

In [None]:
# Predictions of knn_opt on the same rows it was fitted on, used below to find its errors.
train_preds = knn_opt.predict(xx)
train_preds

In [None]:
# Feature rows where the KNN prediction disagrees with the true label.
is_wrong = yy.ne(train_preds)
incorrect = xx.loc[is_wrong]
incorrect

In [None]:
# Age vs. fare for every row in xx, coloured by the label in yy.
sns.scatterplot(data=xx, x="age", y="fare", hue=yy)

In [None]:
# Same plot restricted to the misclassified rows. The hue vector is cut down to
# those rows explicitly rather than passing the full-length yy and relying on
# seaborn to align it to the subset by index.
sns.scatterplot(data=incorrect, x="age", y="fare", hue=yy.loc[incorrect.index])

In [None]:
# Two-cluster k-means on the feature frame. Seeded so the cluster ids (0/1)
# that later cells index by do not change between runs.
kmeans = KMeans(n_clusters=2, max_iter=1000, random_state=2021)
kmeans.fit(xx)

In [None]:
# Compare the cluster sizes with the class sizes of the target.
print(pd.DataFrame(kmeans.predict(xx)).value_counts())
print(unmissed["survived"].value_counts())

In [None]:
# One-column frame of cluster ids, indexed like xx so it can be used to mask xx and yy.
cluster_id = pd.Series(kmeans.predict(xx), index=xx.index, name="cluster").to_frame()
cluster_id

In [None]:
# Mean of the target within each of the two clusters.
for cluster_label in (0, 1):
    in_cluster = cluster_id["cluster"] == cluster_label
    print(yy[in_cluster].mean())

In [None]:
# Fit a separate logistic regression inside each cluster and print its training accuracy (%).
cluster_labels = cluster_id["cluster"]
ind_1 = cluster_labels.eq(0)
ind_2 = cluster_labels.eq(1)

mod1 = LogisticRegression()
mod1.fit(xx[ind_1], yy[ind_1])
print(mod1.score(xx[ind_1], yy[ind_1])*100)

mod2 = LogisticRegression()
mod2.fit(xx[ind_2], yy[ind_2])
print(mod2.score(xx[ind_2], yy[ind_2])*100)

In [None]:
# For k = 2..10 clusters: cluster the rows, fit one logistic regression per
# cluster, and record each model's training accuracy (%).
# Result: a list of [n_clusters, [score for cluster 0, score for cluster 1, ...]].
segmented_scores = []
for n_clusters in range(2,11):
    # Seeded so the segmentation, and therefore the recorded scores, is reproducible.
    kmeans = KMeans(n_clusters=n_clusters, max_iter=1000, random_state=2021).fit(xx)
    cluster_id = pd.DataFrame(kmeans.predict(xx), index=xx.index, columns=["cluster"])
    cluster_fit_scores = []
    for clust_id in range(n_clusters):
        ind_i = cluster_id["cluster"] == clust_id
        logmod = LogisticRegression(max_iter=2000).fit(xx[ind_i], yy[ind_i])
        cluster_fit_scores.append(logmod.score(xx[ind_i], yy[ind_i])*100)
    segmented_scores.append([n_clusters, cluster_fit_scores])

In [None]:
# Print the per-cluster scores for each clustering size, with a blank line between entries.
for cluster_count, per_cluster_scores in segmented_scores:
    print("# clusters =", cluster_count)
    print(per_cluster_scores, end="\n\n")

In [None]:
# Given a threshold say 80%, Can we find the "best fit" of a given data pt into a cluster, 
# such that its fit score is > threshold.
# One logic might be to start searching for such a cluster from highest order clustering
# schemes. If found above threshold, keep it. If not, go down an order and search.
# Such a logic might ensure a base minimum score (which would be the minimum in order=2 clustering)

# In this logic, threshold will become a hyperparameter. 
# Its starting value can be what we desire it to be, but it can be guaranteed to be 
# above the base minimum from 2-clustering.