In [840]:
import matplotlib.pyplot as plt
import seaborn as sns

In [841]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

In [842]:
train_path = "/kaggle/input/playground-series-s3e12/train.csv"
test_path = "/kaggle/input/playground-series-s3e12/test.csv"
sub_path = "/kaggle/input/playground-series-s3e12/sample_submission.csv"
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_sub = pd.read_csv(sub_path)

In [843]:
df_train.head(3)

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0


In [844]:
df_test.head(3)

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc
0,414,1.017,5.24,345,11.5,152,1.16
1,415,1.02,5.68,874,29.0,385,3.46
2,416,1.024,5.36,698,19.5,354,13.0


In [845]:
df_train.drop("id", inplace = True, axis = 1)
df_test.drop("id", inplace = True, axis = 1)

In [846]:
new_list = list(set(df_train.index.tolist()) - set(df_train[df_train["urea"] < 15].index.tolist()))
df_train = df_train.loc[new_list,]

In [847]:
s_temp = df_train["target"].copy()
df_train.drop("target", axis = 1, inplace = True)
df_train["train"], df_test["train"] = 1, 0
df_train["target"] = s_temp.copy()
df = pd.concat([df_train, df_test])

### Feature Generation

In [848]:
df["gravity/ph"] = df["gravity"] / df["ph"]
df["osmo/cond"] = df["osmo"] / df["cond"]
df["gravity*ph"] = df["gravity"] * df["ph"]
df["gravity*osmo"] = df["gravity"] * df["osmo"]
df["osmo*urea"] = df["osmo"] * df["urea"]
df["cond_urea_ph"] = df["cond"] * df["urea"] / df["ph"]
df["ph*osmo"] = df["ph"] * df["osmo"]
df["cond*calc"] = df["cond"] * df["calc"]
df["gravity/calc"] = df["gravity"] / df["calc"]

### Normalizing Data

In [849]:
columns = df.columns.tolist()
columns.remove("train")
columns.remove("target")

In [850]:
data_norm = preprocessing.normalize(df[columns], axis=0)
df_norm = pd.DataFrame(data_norm, columns = columns)

In [851]:
df_norm.reset_index(drop = True, inplace = True)
df.reset_index(drop = True, inplace = True)

In [852]:
df_norm["train"], df_norm["target"] = df["train"], df["target"]

### Setting Variables for Learning

In [853]:
features = ['gravity/ph', 'osmo/cond', 'gravity*ph', 'gravity*osmo', 'osmo*urea',
            'cond_urea_ph', 'ph*osmo', 'cond*calc', 'gravity/calc']

In [854]:
X = df_norm[df_norm["train"] == 1][features].copy()
y = df_norm[df_norm["train"] == 1]["target"].copy()
print(X.shape, y.shape)

(412, 9) (412,)


In [855]:
df_norm

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,gravity/ph,osmo/cond,gravity*ph,gravity*osmo,osmo*urea,cond_urea_ph,ph*osmo,cond*calc,gravity/calc,train,target
0,0.037934,0.039449,0.024310,0.024651,0.015191,0.010662,0.035910,0.034258,0.039267,0.024119,0.007977,0.008174,0.025676,0.006233,0.033203,1,0.0
1,0.038383,0.034414,0.038578,0.039309,0.048269,0.030736,0.041651,0.034093,0.034661,0.038728,0.040224,0.047475,0.035545,0.028652,0.011654,1,0.0
2,0.037784,0.039067,0.020359,0.040808,0.019479,0.066472,0.036118,0.017331,0.038733,0.020119,0.008567,0.017521,0.021294,0.064328,0.005305,1,0.0
3,0.038233,0.031291,0.024255,0.034645,0.048759,0.048751,0.045629,0.024321,0.031393,0.024254,0.025547,0.046485,0.020321,0.040054,0.007319,1,1.0
4,0.038233,0.035243,0.047962,0.029648,0.047166,0.016250,0.040513,0.056197,0.035357,0.047960,0.048866,0.034167,0.045255,0.011426,0.021957,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,0.038533,0.039959,0.046809,0.035645,0.044593,0.053751,0.036012,0.045620,0.040403,0.047175,0.045091,0.034253,0.050078,0.045436,0.006690,0,
684,0.037896,0.035816,0.022499,0.023319,0.023889,0.010662,0.039513,0.033518,0.035616,0.022300,0.011611,0.013393,0.021575,0.005896,0.033170,0,
685,0.038608,0.036199,0.047962,0.048303,0.046553,0.033016,0.039830,0.034494,0.036672,0.048430,0.048232,0.053491,0.046483,0.037819,0.010913,0,
686,0.038158,0.034860,0.029798,0.035645,0.020827,0.012059,0.040877,0.029041,0.034905,0.029738,0.013406,0.018337,0.027811,0.010194,0.029530,0,


In [856]:
alldt = df_norm.copy()
alldt2 = alldt.copy()
alldt = alldt.iloc[:, :-1]
alldt

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,gravity/ph,osmo/cond,gravity*ph,gravity*osmo,osmo*urea,cond_urea_ph,ph*osmo,cond*calc,gravity/calc,train
0,0.037934,0.039449,0.024310,0.024651,0.015191,0.010662,0.035910,0.034258,0.039267,0.024119,0.007977,0.008174,0.025676,0.006233,0.033203,1
1,0.038383,0.034414,0.038578,0.039309,0.048269,0.030736,0.041651,0.034093,0.034661,0.038728,0.040224,0.047475,0.035545,0.028652,0.011654,1
2,0.037784,0.039067,0.020359,0.040808,0.019479,0.066472,0.036118,0.017331,0.038733,0.020119,0.008567,0.017521,0.021294,0.064328,0.005305,1
3,0.038233,0.031291,0.024255,0.034645,0.048759,0.048751,0.045629,0.024321,0.031393,0.024254,0.025547,0.046485,0.020321,0.040054,0.007319,1
4,0.038233,0.035243,0.047962,0.029648,0.047166,0.016250,0.040513,0.056197,0.035357,0.047960,0.048866,0.034167,0.045255,0.011426,0.021957,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,0.038533,0.039959,0.046809,0.035645,0.044593,0.053751,0.036012,0.045620,0.040403,0.047175,0.045091,0.034253,0.050078,0.045436,0.006690,0
684,0.037896,0.035816,0.022499,0.023319,0.023889,0.010662,0.039513,0.033518,0.035616,0.022300,0.011611,0.013393,0.021575,0.005896,0.033170,0
685,0.038608,0.036199,0.047962,0.048303,0.046553,0.033016,0.039830,0.034494,0.036672,0.048430,0.048232,0.053491,0.046483,0.037819,0.010913,0
686,0.038158,0.034860,0.029798,0.035645,0.020827,0.012059,0.040877,0.029041,0.034905,0.029738,0.013406,0.018337,0.027811,0.010194,0.029530,0


In [857]:
X = alldt[alldt["train"] == 1].copy()
y = alldt2[alldt2["train"] == 1]["target"].copy()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 121)
X_train, y_train = X, y

xbc_model = XGBClassifier(n_estimators = 140, learning_rate = 0.05, max_depth = 4,
                          eval_metric = "auc", booster = 'gbtree')
xbc_model.fit(X_train, y_train)

rfc_model = RandomForestClassifier(n_estimators = 140, max_depth = 4)
rfc_model.fit(X_train, y_train)

cbc = CatBoostClassifier(learning_rate=0.05, n_estimators=140)
cbc.fit(X_train, y_train)


rfc_preds = rfc_model.predict_proba(X_val)[:, 1]
xgb_preds = xbc_model.predict_proba(X_val)[:, 1]
cbc_preds = cbc.predict_proba(X_val)[:, 1]


ensemble_preds = (cbc_preds + rfc_preds + xgb_preds)/ 3

ensemble_score = roc_auc_score(y_val, ensemble_preds)
print("ROC AUC Score:", ensemble_score)

0:	learn: 0.6830768	total: 1.81ms	remaining: 252ms
1:	learn: 0.6738375	total: 3.92ms	remaining: 270ms
2:	learn: 0.6636508	total: 5.44ms	remaining: 248ms
3:	learn: 0.6532405	total: 6.92ms	remaining: 235ms
4:	learn: 0.6449345	total: 8.41ms	remaining: 227ms
5:	learn: 0.6362487	total: 9.98ms	remaining: 223ms
6:	learn: 0.6277742	total: 11.5ms	remaining: 218ms
7:	learn: 0.6198903	total: 13ms	remaining: 214ms
8:	learn: 0.6117229	total: 14.5ms	remaining: 211ms
9:	learn: 0.6053308	total: 16.2ms	remaining: 210ms
10:	learn: 0.5976859	total: 17.8ms	remaining: 208ms
11:	learn: 0.5920583	total: 19.3ms	remaining: 205ms
12:	learn: 0.5869130	total: 20.7ms	remaining: 203ms
13:	learn: 0.5812771	total: 22.4ms	remaining: 202ms
14:	learn: 0.5768337	total: 24ms	remaining: 200ms
15:	learn: 0.5718005	total: 25.5ms	remaining: 197ms
16:	learn: 0.5656538	total: 27.1ms	remaining: 196ms
17:	learn: 0.5618459	total: 28.6ms	remaining: 194ms
18:	learn: 0.5585692	total: 30.3ms	remaining: 193ms
19:	learn: 0.5548944	total

In [858]:
X_sub = alldt[alldt["train"] == 0].copy()
rfc_preds = rfc_model.predict_proba(X_sub)[:, 1]
xgb_preds = xbc_model.predict_proba(X_sub)[:, 1]
cbc_preds = cbc.predict_proba(X_sub)[:, 1]
ensemble_preds = (.2 * cbc_preds + .3 * rfc_preds + .5 * xgb_preds)
my_preds = pd.DataFrame(data = ensemble_preds)
df_sub["target"] = my_preds
df_sub

Unnamed: 0,id,target
0,414,0.292246
1,415,0.633379
2,416,0.851359
3,417,0.425788
4,418,0.293882
...,...,...
271,685,0.915767
272,686,0.105842
273,687,0.728704
274,688,0.106748


In [859]:
df_sub.to_csv('submission_II.csv', index = False)