In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
base_path = Path().cwd().joinpath('content/datasets/')
train_dataset = pd.read_csv(base_path.joinpath('train_molecular_data.csv'))
test_dataset = pd.read_csv(base_path.joinpath('test_molecular_data.csv'))

In [3]:
train_dataset.fillna(0, inplace=True)
test_dataset.fillna(0, inplace=True)

In [4]:
pd.set_option('display.max_rows',None)

In [5]:
y_train = train_dataset.loc[:, 'Expected']
X_train = train_dataset.drop('Expected', axis=1)

In [53]:
# cols = X_train.columns
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_train = pd.DataFrame(data=X_train, columns=cols)

In [54]:
# test_cols = test_dataset.columns
# test_dataset = sc.fit_transform(test_dataset)
# test_dataset = pd.DataFrame(data=test_dataset, columns=test_cols)

In [6]:
auc_scores = []

for col in X_train:
    feature = X_train[col]
    auc = round(roc_auc_score(y_train, feature), 3)
    if auc < 0.5:
        auc = 1 - auc
    auc_scores.append(round(auc, 3))

auc_frame = pd.DataFrame(data=np.array(auc_scores).reshape(1, 209),columns=X_train.columns)
auc_frame


Unnamed: 0,Assay Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0.617,0.519,0.504,0.519,0.526,0.508,0.586,0.584,0.586,0.579,...,0.502,0.509,0.501,0.5,0.5,0.502,0.501,0.502,0.502,0.511


In [7]:
rfc = RandomForestClassifier(n_estimators=500)
model = rfc.fit(X_train, y_train)

In [8]:
pred = model.predict(test_dataset)
pred.reshape(-1, 1)

array([[2],
       [2],
       [2],
       ...,
       [2],
       [2],
       [1]])

In [9]:
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = pred
final_df.columns = ['Id', 'Predicted']

In [10]:
final_df.shape

(10994, 2)

In [11]:
submission_path = Path().cwd().joinpath('content/submissions/')
final_df.to_csv(submission_path.joinpath('submission4.csv'), index=False)

In [12]:
rfc = RandomForestClassifier(n_estimators=100)
model = rfc.fit(X_train, y_train)

In [13]:
pred = model.predict(test_dataset)
pred.reshape(-1, 1)

array([[2],
       [2],
       [2],
       ...,
       [2],
       [2],
       [1]])

In [14]:
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = pred
final_df.columns = ['Id', 'Predicted']

In [15]:
final_df.shape

(10994, 2)

In [16]:
submission_path = Path().cwd().joinpath('content/submissions/')
final_df.to_csv(submission_path.joinpath('submission6.csv'), index=False)

In [17]:
gb = GradientBoostingClassifier()
gb_model = gb.fit(X_train,y_train)

In [18]:
y_pred = gb_model.predict(test_dataset)
y_pred.reshape(-1, 1)

array([[2],
       [2],
       [2],
       ...,
       [2],
       [2],
       [2]])

In [19]:
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = y_pred
final_df.columns = ['Id', 'Predicted']

In [20]:
final_df.shape

(10994, 2)

In [21]:
submission_path = Path().cwd().joinpath('content/submissions/')
final_df.to_csv(submission_path.joinpath('submission7.csv'), index=False)

In [22]:
hb = HistGradientBoostingClassifier()
hb_model = hb.fit(X_train, y_train)

In [23]:
y_predicted = hb_model.predict(test_dataset)

In [24]:
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = y_predicted
final_df.columns = ['Id', 'Predicted']

In [25]:
final_df.to_csv(submission_path.joinpath('submission8.csv'), index=False)

In [26]:
hsb = HistGradientBoostingClassifier()
param = {'max_depth': [10, 12], 'learning_rate': [0.2, 0.1]}
gs = GridSearchCV(estimator=hsb, param_grid=param, cv=5, scoring='f1', n_jobs=6)
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)

{'learning_rate': 0.2, 'max_depth': 12} 0.576137527035391


In [13]:
xgb = XGBClassifier()
lab = LabelEncoder()
y_data = lab.fit_transform(y_train)
skf = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
scores = cross_validate(estimator=xgb, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=6)
print(np.mean(scores['test_score']))

0.9459673027005578


In [8]:
model = xgb.fit(X_train, y_data)
pred_vals = model.predict(test_dataset)
print(pred_vals)

[0 1 1 ... 1 1 1]


In [9]:
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)

[1 2 2 ... 2 2 2]


In [11]:
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']

In [12]:
submission_path = Path().cwd().joinpath('content/submissions/')
final_df.to_csv(submission_path.joinpath('submission9.csv'), index=False)

In [22]:
xgb2 = XGBClassifier(tree_method='hist', max_bin=255, n_estimators=500)
lab = LabelEncoder()
y_data = lab.fit_transform(y_train)
skf = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
scores = cross_validate(estimator=xgb2, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=6)
print(np.mean(scores['test_score']))

0.9484676046017727


In [23]:
model = xgb2.fit(X_train, y_data)
pred_vals = model.predict(test_dataset)
print(pred_vals)

[1 1 1 ... 1 1 1]


In [24]:
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)

[2 2 2 ... 2 2 2]


In [25]:
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']

In [26]:
final_df.to_csv(submission_path.joinpath('submission10.csv'), index=False)