In [None]:
pip install rdkit
pip install duckdb

In [None]:
import duckdb
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Location of the training parquet file.
train_path = '/content/drive/MyDrive/Kaggle/train.parquet'

con = duckdb.connect()

# Class-balanced sample: up to 90,000 rows with binds = 0 plus up to 90,000 rows with
# binds = 1, each picked with ORDER BY random(). No seed is set here, so the sample
# is not fixed between runs.
try:
    df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT 90000)
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT 90000)""").df()
finally:
    # Release the connection even when the query or the DataFrame conversion raises.
    con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
#Generate ECFPs
def generate_ecfp(molecule, radius=2, bits=1024):
    """Return the Morgan fingerprint bit vector of ``molecule`` as a Python list.

    Parameters:
        molecule: molecule object handed to RDKit, or None.
        radius: radius passed to the Morgan fingerprint call.
        bits: length of the bit vector (forwarded as ``nBits``).

    Returns:
        A list with the elements of the bit vector, or None when ``molecule`` is None.
    """
    if molecule is not None:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
        return list(fingerprint)
    # Nothing to fingerprint: pass the missing molecule through as None.
    return None

# Convert molecule SMILES to RDKit and obtain ECFP
# 'molecule' holds the parsed objects; 'ecfp' holds one fingerprint list per row
# (None where generate_ecfp received None).
df['molecule'] = df['molecule_smiles'].apply(Chem.MolFromSmiles)
df['ecfp'] = df['molecule'].apply(generate_ecfp)


In [None]:
#Generate partial charge
def compute_partial_charges(molecule):
    """Return the Gasteiger partial charge of every atom in ``molecule``.

    Parameters:
        molecule: RDKit molecule, or None (e.g. when a SMILES string could not be parsed).

    Returns:
        A list of floats, one per atom in ``molecule.GetAtoms()`` order. An empty list is
        returned for None so that callers measuring or padding the result keep working.
    """
    if molecule is None:
        # Mirror generate_ecfp's tolerance of missing molecules instead of raising.
        return []
    # Stores the computed charge on each atom as the '_GasteigerCharge' property.
    AllChem.ComputeGasteigerCharges(molecule)
    partial_charges = [atom.GetDoubleProp('_GasteigerCharge') for atom in molecule.GetAtoms()]
    return partial_charges

# Convert building_block_2/3 to RDKit objects and get their partial charges
df['building_block_2'] = df['buildingblock2_smiles'].apply(Chem.MolFromSmiles)
df['building_block_3'] = df['buildingblock3_smiles'].apply(Chem.MolFromSmiles)

# One list of per-atom charges per row, for each of the two building blocks.
df['partial_charges_2'] = df['building_block_2'].apply(compute_partial_charges)
df['partial_charges_3'] = df['building_block_3'].apply(compute_partial_charges)

In [None]:
# One-hot encode the protein_name
onehot_encoder = OneHotEncoder(sparse_output=False)
# Fit the encoder on the training sample; reshape(-1, 1) presents the names as a
# single-column 2-D array.
protein_onehot = onehot_encoder.fit_transform(df['protein_name'].values.reshape(-1, 1))

In [None]:
# Convert each feature group into a plain Python list (one entry per training row)
X_ecfp = df['ecfp'].tolist()
X_protein = protein_onehot.tolist()
# The charge entries are themselves lists; they are padded to a common length in a later cell.
X_partial_charges_2 = df['partial_charges_2'].tolist()
X_partial_charges_3 = df['partial_charges_3'].tolist()

In [None]:
# Longest charge list found in either building-block column.
max_length = max(
    len(charges)
    for charge_lists in (X_partial_charges_2, X_partial_charges_3)
    for charges in charge_lists
)

# Bring every charge list to max_length entries; padding='pre' puts the fill values in front.
X_partial_charges_2_padded = pad_sequences(
    X_partial_charges_2, maxlen=max_length, padding='pre', dtype='float32'
)
X_partial_charges_3_padded = pad_sequences(
    X_partial_charges_3, maxlen=max_length, padding='pre', dtype='float32'
)

In [None]:
# Convert lists to NumPy arrays
X_ecfp = np.array(X_ecfp)
X_protein = np.array(X_protein)
# From here on X_partial_charges_2/3 refer to the padded arrays, not the raw charge lists.
X_partial_charges_2 = np.array(X_partial_charges_2_padded)
X_partial_charges_3 = np.array(X_partial_charges_3_padded)

In [None]:
# Concatenate the arrays along the columns axis and extract the target variable.
# Column order: ECFP bits, one-hot protein, padded charges of block 2, padded charges of block 3.
X = np.concatenate((X_ecfp, X_protein, X_partial_charges_2, X_partial_charges_3), axis=1)
# Labels taken from the same frame, so they follow the row order of X.
y = df['binds'].tolist()

In [None]:
# Split and train the random forest classifier
# 22% of the sampled rows are held out; random_state=42 is passed to both the split and the forest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=42)
rf_model_1 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_1.fit(X_train, y_train)

In [None]:
# Make predictions on the held-out split (not the competition test file)
y_pred_proba = rf_model_1.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculate the average precision of the predicted probabilities against the held-out labels
map_score = average_precision_score(y_test, y_pred_proba)
print(f"Mean Average Precision (mAP): {map_score:.2f}")

Mean Average Precision (mAP): 0.97


In [None]:
# Read and prepare the test file
test = pd.read_csv('/content/drive/MyDrive/Kaggle/test.csv')
# Work on a copy so the feature columns added below do not end up on `test`.
test_1 = test.copy()

In [None]:
# Feature engineering for the test set, using the same helpers as for the training sample
# Full-molecule fingerprint
test_1['molecule'] = test_1['molecule_smiles'].apply(Chem.MolFromSmiles)
test_1['ecfp'] = test_1['molecule'].apply(generate_ecfp)
# Parsed building blocks 2 and 3 and their per-atom partial charges
test_1['building_block_2'] = test_1['buildingblock2_smiles'].apply(Chem.MolFromSmiles)
test_1['building_block_3'] = test_1['buildingblock3_smiles'].apply(Chem.MolFromSmiles)
test_1['partial_charges_2'] = test_1['building_block_2'].apply(compute_partial_charges)
test_1['partial_charges_3'] = test_1['building_block_3'].apply(compute_partial_charges)

In [None]:
# Per-row charge lists of the test set. They are pulled out of test_1 here so that this
# cell does not rely on names that are only assigned further down the notebook.
test_partial_charges_2 = test_1['partial_charges_2'].tolist()
test_partial_charges_3 = test_1['partial_charges_3'].tolist()

max_length_2 = max(len(seq) for seq in test_partial_charges_2)
max_length_3 = max(len(seq) for seq in test_partial_charges_3)
# NOTE: this rebinds max_length (the training padding width until now) to the test-set maximum.
max_length = max(max_length_2, max_length_3)

# Pad sequences to ensure they have the same length
test_partial_charges_2_padded = pad_sequences(test_partial_charges_2, maxlen=max_length, padding='pre', dtype='float32').tolist()
test_partial_charges_3_padded = pad_sequences(test_partial_charges_3, maxlen=max_length, padding='pre', dtype='float32').tolist()

In [None]:
# One-hot encode the protein names and convert the test features into arrays.
# The encoder fitted on the training sample is reused with transform(), so the test
# columns follow the training layout instead of being re-derived from the test data.
protein_onehot_test = onehot_encoder.transform(test_1['protein_name'].values.reshape(-1, 1))
test_ecfp = test_1['ecfp'].tolist()
test_protein = protein_onehot_test.tolist()
# Convert lists to NumPy arrays
test_X_ecfp = np.array(test_ecfp)
test_X_protein = np.array(test_protein)

test_partial_charges_2 = test_1['partial_charges_2']
test_partial_charges_3 = test_1['partial_charges_3']
# NOTE: these rebind X_partial_charges_2/3 (training arrays until now) to the padded test charges.
X_partial_charges_2 = np.array(test_partial_charges_2_padded)
X_partial_charges_3 = np.array(test_partial_charges_3_padded)

In [None]:
# Shapes of the padded test-set charge arrays (held in X_partial_charges_2/3 at this point).
print("Shape of test_partial_charges_2:", X_partial_charges_2.shape)
print("Shape of test_partial_charges_3:", X_partial_charges_3.shape)

Shape of test_partial_charges_2: (1674896, 29)
Shape of test_partial_charges_3: (1674896, 29)


In [None]:
# Keep only as many trailing charge columns as the model was trained on. The width is read
# from the padded training array rather than hard-coding how many leading columns to drop,
# because the training rows are sampled randomly and their padding width can change per run.
# NOTE(review): assumes X_partial_charges_2_padded still holds the training-set padding.
train_charge_width = X_partial_charges_2_padded.shape[1]
assert X_partial_charges_2.shape[1] >= train_charge_width, "test charges are narrower than the training charges"
test_partial_charges_2_reduced = X_partial_charges_2[:, -train_charge_width:]
test_partial_charges_3_reduced = X_partial_charges_3[:, -train_charge_width:]
print("Shape of test_partial_charges_2:", test_partial_charges_2_reduced.shape)
print("Shape of test_partial_charges_3:", test_partial_charges_3_reduced.shape)

Shape of test_partial_charges_2: (1674896, 22)
Shape of test_partial_charges_3: (1674896, 22)


In [None]:
# Concatenate the arrays along the columns axis
# NOTE: this rebinds X, which until now held the training feature matrix.
X = np.concatenate((test_X_ecfp, test_X_protein, test_partial_charges_2_reduced, test_partial_charges_3_reduced), axis=1)

In [None]:
# Predict the probabilities (column 1 of predict_proba, as in the validation cell)
probabilities = rf_model_1.predict_proba(X)[:, 1]
# Create a DataFrame with 'id' and 'binds' columns and write it out without the index
output_df = pd.DataFrame({'id': test['id'], 'binds': probabilities})
output_df.to_csv('submission.csv', index=False)

In [None]:
# Write a second copy of the submission under a dated name, then preview it.
# head() must be the cell's last expression for the notebook to display it.
output_df.to_csv('submission_apr_29_3.csv', index=False)
output_df.head()

XXXXXXX
XXXXXXX
XXXXXXX
XXXXXXX

In [None]:
# Stack the per-row padded lists into 2-D arrays. Calling np.array directly on the Series
# yields a 1-D object array of lists, which cannot be concatenated with the 2-D features.
# NOTE(review): no visible cell adds the *_padded columns to `test` — confirm where they come from.
test_partial_charges_2 = np.array(test['partial_charges_2_padded'].tolist())
test_partial_charges_3 = np.array(test['partial_charges_3_padded'].tolist())
X_partial_charges_2 = np.array(X_partial_charges_2_padded)
X_partial_charges_3 = np.array(X_partial_charges_3_padded)

In [None]:
# Shapes of the four test feature groups; all must be 2-D with the same row count
# before they can be concatenated column-wise.
print("Shape of test_X_ecfp:", test_X_ecfp.shape)
print("Shape of test_X_protein:", test_X_protein.shape)
print("Shape of test_partial_charges_2:", test_partial_charges_2.shape)
print("Shape of test_partial_charges_3:", test_partial_charges_3.shape)


Shape of test_X_ecfp: (1674896, 1024)
Shape of test_X_protein: (1674896, 3)
Shape of test_partial_charges_2: (1674896, 29)
Shape of test_partial_charges_3: (1674896, 29)


In [None]:
# Column-wise join of the test feature groups.
# NOTE: this rebinds X_test, which earlier held the hold-out split's features.
X_test = np.concatenate((test_X_ecfp, test_X_protein, test_partial_charges_2, test_partial_charges_3), axis=1)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 2 has 1 dimension(s)

In [None]:
# Preview the first rows of the engineered test frame.
test_1.head()

Unnamed: 0,id,buildingblock1_smiles,protein_name,ecfp,partial_charges_2,partial_charges_3,partial_charges_2_padded,partial_charges_3_padded
0,295246830,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,BRD4,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.09847003034286526, -0.06220531321489389, -...","[-0.09847003034286526, -0.06220531321489389, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,295246831,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,HSA,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.09847003034286526, -0.06220531321489389, -...","[-0.09847003034286526, -0.06220531321489389, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,295246832,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,sEH,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.09847003034286526, -0.06220531321489389, -...","[-0.09847003034286526, -0.06220531321489389, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,295246833,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,BRD4,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.09847003034286526, -0.06220531321489389, -...","[-0.03718306424507208, 0.06916888875100359, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,295246834,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,HSA,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.09847003034286526, -0.06220531321489389, -...","[-0.03718306424507208, 0.06916888875100359, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Stack the per-row padded lists into 2-D arrays; np.array on the Series itself would
# produce a 1-D object array of lists.
# NOTE(review): no visible cell adds the *_padded columns to `test` — confirm where they come from.
test_partial_charges_2 = np.array(test['partial_charges_2_padded'].tolist())
test_partial_charges_3 = np.array(test['partial_charges_3_padded'].tolist())

--BREAk

In [None]:
# Second model: ECFP bits plus one-hot protein only (no partial-charge features).
# One-hot encode the protein_name (a fresh encoder, fitted on the training sample)
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(df['protein_name'].values.reshape(-1, 1))

# Combine ECFPs and one-hot encoded protein_name; `+` concatenates the two per-row lists
X = [ecfp + protein for ecfp, protein in zip(df['ecfp'].tolist(), protein_onehot.tolist())]
y = df['binds'].tolist()

# Split the data into train and test sets (these rebind X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Calculate the average precision of the predicted probabilities
map_score = average_precision_score(y_test, y_pred_proba)
print(f"Mean Average Precision (mAP): {map_score:.2f}")

Mean Average Precision (mAP): 0.97


In [None]:
# X_train can be a plain Python list here (the preceding split is fed a list of lists),
# and lists have neither .dtype nor .shape, so inspect an array view of it instead.
X_train_array = np.asarray(X_train)
print("Data type of X_train:", X_train_array.dtype)
print("Shape of X_train:", X_train_array.shape)

Data type of X_train: float64
Shape of X_train: (144000, 1071)


In [None]:
# Re-read the raw test file (rebinding `test`) and show its (rows, columns).
test = pd.read_csv('/content/drive/MyDrive/Kaggle/test.csv')
test.shape

(1674896, 6)

In [None]:
 # Generate ECFPs for the molecule_smiles
# Parse the test SMILES and fingerprint them with the same helper used for the training sample.
test['molecule'] = test['molecule_smiles'].apply(Chem.MolFromSmiles)
test['ecfp'] = test['molecule'].apply(generate_ecfp)



    # Combine ECFPs and one-hot encoded protein_name



In [None]:
# Preview the test frame with the new 'molecule' and 'ecfp' columns.
test.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,molecule,ecfp
0,295246830,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,BRD4,<rdkit.Chem.rdchem.Mol object at 0x7a2763bc7530>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,295246831,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,HSA,<rdkit.Chem.rdchem.Mol object at 0x7a2763bc75a0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,295246832,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,sEH,<rdkit.Chem.rdchem.Mol object at 0x7a2763bc7610>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,295246833,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,BRD4,<rdkit.Chem.rdchem.Mol object at 0x7a2763bc7680>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,295246834,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,HSA,<rdkit.Chem.rdchem.Mol object at 0x7a2763bc76f0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
    # One-hot encode the protein_name
# Encode the test protein names with the already-fitted encoder (transform only, no refit).
# NOTE: this rebinds protein_onehot, which previously held the training encoding.
protein_onehot = onehot_encoder.transform(test['protein_name'].values.reshape(-1, 1))

In [None]:
# Parse building blocks 2 and 3 of every test row, then compute their per-atom partial charges.
test['building_block_2'] = test['buildingblock2_smiles'].apply(Chem.MolFromSmiles)
test['building_block_3'] = test['buildingblock3_smiles'].apply(Chem.MolFromSmiles)
test['partial_charges_2'] = test['building_block_2'].apply(compute_partial_charges)
test['partial_charges_3'] = test['building_block_3'].apply(compute_partial_charges)



In [None]:
# Per-row charge lists as plain Python lists, ready for length measurement and padding.
test_partial_charges_2 = test['partial_charges_2'].tolist()
test_partial_charges_3 = test['partial_charges_3'].tolist()

In [None]:
# Longest charge list in each test column, and the larger of the two as the padding width.
max_length_2 = max(len(seq) for seq in test_partial_charges_2)
max_length_3 = max(len(seq) for seq in test_partial_charges_3)
# NOTE: this rebinds max_length to the test-set maximum.
max_length = max(max_length_2, max_length_3)

# Pad sequences to ensure they have the same length
test_partial_charges_2_padded = pad_sequences(test_partial_charges_2, maxlen=max_length, padding='pre', dtype='float32').tolist()
test_partial_charges_3_padded = pad_sequences(test_partial_charges_3, maxlen=max_length, padding='pre', dtype='float32').tolist()
# Backward-compatible alias: this name used to be misspelled, and a later cell still
# reads the misspelled form.
est_partial_charges_3_padded = test_partial_charges_3_padded

In [None]:
# Convert padded sequences to NumPy arrays.
# These rebind test_partial_charges_2/3 from the raw lists to the padded 2-D arrays.
test_partial_charges_2 = np.array(test_partial_charges_2_padded)
test_partial_charges_3 = np.array(est_partial_charges_3_padded)

In [None]:
# Test fingerprints and one-hot protein rows as plain Python lists.
test_ecfp = test['ecfp'].tolist()
test_protein = protein_onehot.tolist()

In [None]:
# Convert both lists to NumPy arrays (rebinding the same names).
test_ecfp = np.array(test_ecfp)
test_protein = np.array(test_protein)

In [None]:
# Shapes of the padded test charge arrays, to compare with the width used in training.
print("Shape of test_partial_charges_2:", test_partial_charges_2.shape)
print("Shape of test_partial_charges_3:", test_partial_charges_3.shape)


Shape of test_partial_charges_2: (1674896, 29)
Shape of test_partial_charges_3: (1674896, 29)


In [None]:
# Slice the arrays to keep only as many trailing columns as the model was trained on.
# The width is read from the padded training array instead of hard-coding the number of
# leading columns to drop, since the training rows are sampled randomly and their padding
# width can differ from run to run.
# NOTE(review): assumes X_partial_charges_2_padded still holds the training-set padding.
train_charge_width = X_partial_charges_2_padded.shape[1]
assert test_partial_charges_2.shape[1] >= train_charge_width, "test charges are narrower than the training charges"
test_partial_charges_2_reduced = test_partial_charges_2[:, -train_charge_width:]
test_partial_charges_3_reduced = test_partial_charges_3[:, -train_charge_width:]

# Check the new shapes
print("New shape of test_partial_charges_2:", test_partial_charges_2_reduced.shape)
print("New shape of test_partial_charges_3:", test_partial_charges_3_reduced.shape)


New shape of test_partial_charges_2: (1674896, 22)
New shape of test_partial_charges_3: (1674896, 22)


In [None]:
# Shapes of the fingerprint and one-hot protein arrays for the test set.
print("Shape of test_ecfp:", test_ecfp.shape)
print("Shape of test_protein:", test_protein.shape)


Shape of test_ecfp: (1674896, 1024)
Shape of test_protein: (1674896, 3)


In [None]:
# Test feature matrix in the column order used for rf_model_1: ECFP, protein, charges 2, charges 3.
X_test_2 = np.concatenate((test_ecfp, test_protein, test_partial_charges_2_reduced, test_partial_charges_3_reduced), axis=1)

In [None]:
# Score the test matrix with the charge-aware model; column 1 is taken as the positive-class probability.
probabilities_2 = rf_model_1.predict_proba(X_test_2)[:, 1]

In [None]:
# Pair each test id with its predicted probability.
output_df_3 = pd.DataFrame({'id': test['id'], 'binds': probabilities_2})
threshold = 0.5  # Example threshold, you can adjust it as needed
output_df_binary_2 = output_df_3.copy()
# Convert prediction probabilities to binary predictions
output_df_binary_2['binds'] = (output_df_3['binds'] > threshold).astype(int)
# Display the DataFrame with binary predictions
print(output_df_binary_2.head(10))
# NOTE(review): the saved file holds hard 0/1 labels, while the validation cells score
# probabilities with average precision — confirm which form the submission should use.
output_df_binary_2.to_csv('submission_apr_29_2.csv', index=False)

          id  binds
0  295246830      0
1  295246831      0
2  295246832      0
3  295246833      0
4  295246834      0
5  295246835      0
6  295246836      0
7  295246837      0
8  295246838      0
9  295246839      0


In [None]:
# Reshape 1D arrays to have shape (1674896, 1)
# NOTE(review): reshape(-1, 1) only yields one row per sample when the inputs are 1-D.
# If test_partial_charges_2/3 are the 2-D padded arrays built in an earlier cell, this
# flattens every charge value into its own row instead — confirm which state is intended.
test_partial_charges_2_reshaped = test_partial_charges_2.reshape(-1, 1)
test_partial_charges_3_reshaped = test_partial_charges_3.reshape(-1, 1)

# Check the shapes after reshaping
print("Shape of test_partial_charges_2_reshaped:", test_partial_charges_2_reshaped.shape)
print("Shape of test_partial_charges_3_reshaped:", test_partial_charges_3_reshaped.shape)


Shape of test_partial_charges_2_reshaped: (1674896, 1)
Shape of test_partial_charges_3_reshaped: (1674896, 1)


In [None]:
# Rebuild X_test_2 from the reshaped single-column charge arrays (rebinds the earlier X_test_2).
X_test_2 = np.concatenate((test_ecfp, test_protein, test_partial_charges_2_reshaped, test_partial_charges_3_reshaped), axis=1)

In [None]:
# Row counts of the two reshaped charge arrays.
print(len(test_partial_charges_3_reshaped))
print(len(test_partial_charges_2_reshaped))

1674896
1674896


In [None]:
# dtype and shape of the rebuilt test matrix; an `object` dtype here means some cells
# hold Python objects rather than plain numbers.
print("Data type of X_test:", X_test_2.dtype)
print("Shape of X_test:", X_test_2.shape)


Data type of X_test: object
Shape of X_test: (1674896, 1029)


In [None]:
  # Predict the probabilities
# Score the rebuilt matrix with rf_model_1 (column 1 of predict_proba).
probabilities_2 = rf_model_1.predict_proba(X_test_2)[:, 1]

ValueError: setting an array element with a sequence.

In [None]:
  # Create a DataFrame with 'id' and 'probability' columns
# Pair ids with probabilities_2 and save them.
# NOTE: this writes to the same file name as the earlier binary-prediction cell and replaces that file.
output_df_2 = pd.DataFrame({'id': test['id'], 'binds': probabilities_2})
output_df_2.to_csv('submission_apr_29_2.csv', index=False)

In [None]:
# Cut-off applied to the predicted probabilities; adjust as needed.
threshold = 0.5

# New frame based on output_df whose 'binds' column is 1 where the probability is
# strictly above the cut-off and 0 otherwise.
output_df_binary = output_df.assign(
    binds=output_df['binds'].gt(threshold).astype(int)
)

# Show the first rows of the thresholded frame.
print(output_df_binary.head())


~~~~~````~~~~`


In [None]:
# Recompute the building-block molecules and partial charges on the training frame.
df['building_block_2'] = df['buildingblock2_smiles'].apply(Chem.MolFromSmiles)
df['building_block_3'] = df['buildingblock3_smiles'].apply(Chem.MolFromSmiles)
df['partial_charges_2'] = df['building_block_2'].apply(compute_partial_charges)
df['partial_charges_3'] = df['building_block_3'].apply(compute_partial_charges)
# Determine the maximum length of sequences in X_partial_charges_2 and X_partial_charges_3
# NOTE(review): this reads the existing X_partial_charges_2/3 variables, not the columns
# recomputed just above — confirm that is intended.
max_length = max(max(len(seq) for seq in X_partial_charges_2), max(len(seq) for seq in X_partial_charges_3))

# Pad sequences to ensure they have the same length
X_partial_charges_2_padded = pad_sequences(X_partial_charges_2, maxlen=max_length, padding='pre', dtype='float32')
X_partial_charges_3_padded = pad_sequences(X_partial_charges_3, maxlen=max_length, padding='pre', dtype='float32')
X_partial_charges_2 = np.array(X_partial_charges_2_padded)
X_partial_charges_3 = np.array(X_partial_charges_3_padded)
# Test features for the ECFP + protein model: per-row list concatenation of fingerprint
# and one-hot protein (rebinds X_test; no charge features are included).
X_test = [ecfp + protein for ecfp, protein in zip(test['ecfp'].tolist(), protein_onehot.tolist())]

In [None]:
# Display the fitted charge-aware model.
rf_model_1

In [None]:
# X_test is a plain Python list at this point and has no .head(); preview its first
# five rows as a DataFrame instead.
pd.DataFrame(X_test[:5])

AttributeError: 'list' object has no attribute 'head'

In [None]:
   # Predict the probabilities
# Score the test rows with the ECFP + protein model (column 1 of predict_proba).
probabilities = rf_model.predict_proba(X_test)[:, 1]
# Create a DataFrame with 'id' and 'binds' columns and write it out without the index
output_df = pd.DataFrame({'id': test['id'], 'binds': probabilities})
output_df.to_csv('submission_apr_29.csv', index=False)

In [None]:
# (rows, columns) of the submission frame.
output_df.shape

(1674896, 2)

In [None]:
# Cut-off used for counting; output_df's 'binds' column holds the predicted probabilities.
threshold = 0.5

# Number of rows whose 'binds' value is strictly greater than the cut-off.
count_above_threshold = output_df['binds'].gt(threshold).sum()

print("Number of values above 0.5:", count_above_threshold)


Number of values above 0.5: 63128


In [None]:
# Preview the first predicted probabilities.
output_df.head()

Unnamed: 0,id,binds
0,295246830,0.21
1,295246831,0.14
2,295246832,0.11
3,295246833,0.4
4,295246834,0.36


In [None]:

# Cut-off applied to the predicted probabilities; adjust as needed.
threshold = 0.5

# New frame based on output_df whose 'binds' column is 1 where the probability is
# strictly above the cut-off and 0 otherwise.
output_df_binary = output_df.assign(
    binds=output_df['binds'].gt(threshold).astype(int)
)

# Show the first rows of the thresholded frame.
print(output_df_binary.head())

          id  binds
0  295246830      0
1  295246831      0
2  295246832      0
3  295246833      0
4  295246834      0


In [None]:
# Save the thresholded predictions.
# NOTE: same file name as the probability submission written by an earlier cell, so that file is replaced.
output_df_binary.to_csv('submission_apr_29.csv', index=False)

In [None]:
# (rows, columns) of the thresholded submission frame.
output_df_binary.shape

(1674896, 2)