In [None]:
pip install rdkit
pip install duckdb

In [None]:
import duckdb
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
train_path = '/content/drive/MyDrive/Kaggle/train.parquet'

# Draw a class-balanced training sample straight from the parquet file:
# up to 90,000 randomly ordered rows with binds = 0 plus up to 90,000 with
# binds = 1, stacked with UNION ALL and materialised as a pandas DataFrame.
con = duckdb.connect()
try:
    df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT 90000)
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT 90000)""").df()
finally:
    # Close the connection even if the scan fails (bad path, out of memory),
    # so a failed cell does not leave an open DuckDB handle in the kernel.
    con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
#Generate ECFPs
def generate_ecfp(molecule, radius=2, bits=1024):
    """Return the Morgan fingerprint of ``molecule`` as a Python list.

    Args:
        molecule: RDKit molecule, or None (e.g. a SMILES that failed to parse).
        radius: Morgan radius passed to RDKit.
        bits: Length of the bit vector requested from RDKit (``nBits``).

    Returns:
        ``list(...)`` of the RDKit bit vector, or None when ``molecule`` is None.
    """
    if molecule is not None:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
        return list(fingerprint)
    return None

# Parse every full-molecule SMILES into an RDKit object, then fingerprint it.
parsed_molecules = df['molecule_smiles'].apply(Chem.MolFromSmiles)
df['molecule'] = parsed_molecules
df['ecfp'] = df['molecule'].apply(generate_ecfp)


In [None]:
#Generate partial charge
def compute_partial_charges(molecule):
    """Return the Gasteiger partial charge of every atom in ``molecule``.

    Args:
        molecule: RDKit molecule, or None (e.g. a SMILES that failed to parse).
            The charges are stored on the molecule's atoms as a side effect of
            ``ComputeGasteigerCharges``.

    Returns:
        A list with one float per atom, in ``molecule.GetAtoms()`` order.
        An empty list is returned for None so that callers which take
        ``len()`` of the result or pad it to a fixed width keep working.
    """
    # Same guard as generate_ecfp: a None molecule must not crash the whole apply().
    if molecule is None:
        return []
    AllChem.ComputeGasteigerCharges(molecule)
    # NOTE(review): RDKit may report NaN charges for atoms it has no Gasteiger
    # parameters for; confirm the downstream model tolerates that.
    return [atom.GetDoubleProp('_GasteigerCharge') for atom in molecule.GetAtoms()]

# Parse building blocks 2 and 3 into RDKit objects ...
for mol_column, smiles_column in (
    ('building_block_2', 'buildingblock2_smiles'),
    ('building_block_3', 'buildingblock3_smiles'),
):
    df[mol_column] = df[smiles_column].apply(Chem.MolFromSmiles)

# ... then attach the per-atom partial charges of each parsed block.
for charge_column, mol_column in (
    ('partial_charges_2', 'building_block_2'),
    ('partial_charges_3', 'building_block_3'),
):
    df[charge_column] = df[mol_column].apply(compute_partial_charges)

In [None]:
# Fit a dense one-hot encoder on the training protein names.
# The encoder expects a 2-D input, hence the single-column reshape.
protein_column = df['protein_name'].values.reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(protein_column)

In [None]:
# Pull each feature out of the frame / encoder output as a plain Python list.
X_ecfp, X_partial_charges_2, X_partial_charges_3 = (
    df[column].tolist()
    for column in ('ecfp', 'partial_charges_2', 'partial_charges_3')
)
X_protein = protein_onehot.tolist()

In [None]:
# Longest charge list across both building blocks; every row is padded to it.
longest_block_2 = max(map(len, X_partial_charges_2))
longest_block_3 = max(map(len, X_partial_charges_3))
max_length = max(longest_block_2, longest_block_3)

# Zero-pad at the front so all rows share the same width.
padding_options = dict(maxlen=max_length, padding='pre', dtype='float32')
X_partial_charges_2_padded = pad_sequences(X_partial_charges_2, **padding_options)
X_partial_charges_3_padded = pad_sequences(X_partial_charges_3, **padding_options)

In [None]:
# Turn every feature group into a NumPy array so they can be stacked column-wise.
X_ecfp, X_protein = np.array(X_ecfp), np.array(X_protein)
X_partial_charges_2, X_partial_charges_3 = (
    np.array(X_partial_charges_2_padded),
    np.array(X_partial_charges_3_padded),
)

In [None]:
# Feature matrix: fingerprint bits, protein one-hot, then the two padded charge blocks.
feature_groups = (X_ecfp, X_protein, X_partial_charges_2, X_partial_charges_3)
X = np.concatenate(feature_groups, axis=1)

# Target: the binds label of every sampled row.
y = df['binds'].tolist()

In [None]:
# Hold out 22% of the rows for evaluation and fit a 100-tree random forest on the rest.
rf_model_1 = RandomForestClassifier(random_state=42, n_estimators=100)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.22,
)
rf_model_1.fit(X_train, y_train)

In [None]:
# Score the hold-out rows; column 1 of predict_proba is the positive-class probability.
holdout_probabilities = rf_model_1.predict_proba(X_test)
y_pred_proba = holdout_probabilities[:, 1]

# Average precision of the hold-out ranking.
map_score = average_precision_score(y_true=y_test, y_score=y_pred_proba)
print(f"Mean Average Precision (mAP): {map_score:.2f}")

Mean Average Precision (mAP): 0.97


In [None]:
# Load the competition test set; features are added to the copy so `test` keeps
# the columns exactly as read from disk.
test_csv_path = '/content/drive/MyDrive/Kaggle/test.csv'
test = pd.read_csv(test_csv_path)
test_1 = test.copy()

In [None]:
# Repeat the training feature engineering on the test frame:
# fingerprint of the full molecule ...
test_molecules = test_1['molecule_smiles'].apply(Chem.MolFromSmiles)
test_1['molecule'] = test_molecules
test_1['ecfp'] = test_1['molecule'].apply(generate_ecfp)

# ... parsed building blocks 2 and 3 ...
for mol_column, smiles_column in (
    ('building_block_2', 'buildingblock2_smiles'),
    ('building_block_3', 'buildingblock3_smiles'),
):
    test_1[mol_column] = test_1[smiles_column].apply(Chem.MolFromSmiles)

# ... and their per-atom partial charges.
for charge_column, mol_column in (
    ('partial_charges_2', 'building_block_2'),
    ('partial_charges_3', 'building_block_3'),
):
    test_1[charge_column] = test_1[mol_column].apply(compute_partial_charges)

In [None]:
# Bind the per-row charge lists before they are used. Previously this cell read
# names that were only assigned in a later cell, so it raised NameError on a
# fresh kernel (Restart & Run All).
test_partial_charges_2 = test_1['partial_charges_2']
test_partial_charges_3 = test_1['partial_charges_3']

max_length_2 = max(len(seq) for seq in test_partial_charges_2)
max_length_3 = max(len(seq) for seq in test_partial_charges_3)
# Kept under its own name so the training-time `max_length` is not overwritten.
test_max_length = max(max_length_2, max_length_3)

# Pad sequences to ensure they have the same length.
# The padded result stays a float32 array (as in the training cell) instead of
# being expanded into a nested Python list and converted back to an array later.
test_partial_charges_2_padded = pad_sequences(test_partial_charges_2, maxlen=test_max_length, padding='pre', dtype='float32')
test_partial_charges_3_padded = pad_sequences(test_partial_charges_3, maxlen=test_max_length, padding='pre', dtype='float32')

In [None]:
# One-hot encode the test proteins with the encoder that was fitted on the
# training data. Calling fit_transform here would re-fit on the test set, and
# the columns could then differ in number or order from those the model was
# trained on. With transform(), an unseen protein raises instead of silently
# shifting columns.
protein_onehot_test = onehot_encoder.transform(test_1['protein_name'].values.reshape(-1, 1))
test_ecfp = test_1['ecfp'].tolist()
test_protein = protein_onehot_test.tolist()
# Convert lists to NumPy arrays
test_X_ecfp = np.array(test_ecfp)
test_X_protein = np.array(test_protein)

test_partial_charges_2 = test_1['partial_charges_2']
test_partial_charges_3 = test_1['partial_charges_3']
# NOTE: these two assignments rebind the names that held the training charge arrays.
X_partial_charges_2 = np.array(test_partial_charges_2_padded)
X_partial_charges_3 = np.array(test_partial_charges_3_padded)

In [None]:
# Report the padded width of each test charge block.
for label, charge_array in (
    ("Shape of test_partial_charges_2:", X_partial_charges_2),
    ("Shape of test_partial_charges_3:", X_partial_charges_3),
):
    print(label, charge_array.shape)

Shape of test_partial_charges_2: (1674896, 29)
Shape of test_partial_charges_3: (1674896, 29)


In [None]:
def _match_charge_width(charges, width):
    """Return ``charges`` with exactly ``width`` columns.

    Surplus columns are dropped from the left and missing ones are zero-filled
    on the left, which matches the front ('pre') padding used when the charge
    arrays were built.
    """
    surplus = charges.shape[1] - width
    if surplus >= 0:
        return charges[:, surplus:]
    return np.pad(charges, ((0, 0), (-surplus, 0)))


# Width of one charge block as seen by the fitted model: its total feature count
# minus the fingerprint and protein columns, split evenly between building
# blocks 2 and 3. This replaces a hard-coded column offset that only held for
# one particular pair of train/test padding widths.
n_charge_features = (rf_model_1.n_features_in_ - test_X_ecfp.shape[1] - test_X_protein.shape[1]) // 2

test_partial_charges_2_reduced = _match_charge_width(X_partial_charges_2, n_charge_features)
test_partial_charges_3_reduced = _match_charge_width(X_partial_charges_3, n_charge_features)
print("Shape of test_partial_charges_2:", test_partial_charges_2_reduced.shape)
print("Shape of test_partial_charges_3:", test_partial_charges_3_reduced.shape)

Shape of test_partial_charges_2: (1674896, 22)
Shape of test_partial_charges_3: (1674896, 22)


In [None]:
# Stack the test features column-wise in the same group order used for training.
# NOTE: this rebinds `X`, which previously held the training matrix.
test_feature_groups = (
    test_X_ecfp,
    test_X_protein,
    test_partial_charges_2_reduced,
    test_partial_charges_3_reduced,
)
X = np.concatenate(test_feature_groups, axis=1)

In [None]:
# Positive-class probability for every test row (column 1 of predict_proba).
class_probabilities = rf_model_1.predict_proba(X)
probabilities = class_probabilities[:, 1]

# Submission frame: the test ids next to the predicted probability, stored under 'binds'.
submission_columns = {'id': test['id'], 'binds': probabilities}
output_df = pd.DataFrame(submission_columns)
output_df.to_csv('submission.csv', index=False)
output_df.head()