In [1]:
#import important libraries
import pandas as pd
import numpy as np
import matplotlib as mlt

In [2]:
#read the csv files for train and test data respectively
train_data = pd.read_csv('train_II.csv')
test_data = pd.read_csv('test_II.csv')

In [3]:
#checking structure of submission file
sub_data = pd.read_csv('sample_submission.csv')

In [4]:
sub_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10994 entries, 0 to 10993
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         10994 non-null  object
 1   Predicted  10994 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.9+ KB


In [5]:
#change the column name x to Id in test data set to maintain consistency
test_data = test_data.rename(columns = {"x": "Id"})
test_data.head()

Unnamed: 0,Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...


In [6]:
test_data['type'] = "test"
train_data['type'] = "train"

In [7]:
df = pd.concat([test_data, train_data])
df.head()

Unnamed: 0,Id,type,Expected
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,test,
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,test,
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,test,
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,test,
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,test,


In [8]:
#divide the column ID into two different columns namely assay Id and chemical ID
df[["Chemical_Id", "Assay_Id"]] = df.Id.str.split(";", expand = True)
df.head()

Unnamed: 0,Id,type,Expected,Chemical_Id,Assay_Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,test,,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,test,,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,test,,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,test,,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,test,,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30


In [9]:
#check info of training data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86377 entries, 0 to 75382
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           86377 non-null  object 
 1   type         86377 non-null  object 
 2   Expected     75383 non-null  float64
 3   Chemical_Id  86377 non-null  object 
 4   Assay_Id     86377 non-null  object 
dtypes: float64(1), object(4)
memory usage: 4.0+ MB


In [10]:
pip install --user scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
text = df['Chemical_Id']
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())
features_df = pd.DataFrame(vector.todense(), columns=vectorizer.get_feature_names())
df = df.reset_index(drop=True)
df = pd.concat([df, features_df], axis = 1)
df.head()

{'cc1': 327, 'cc': 326, 'c1': 99, 'ccs': 595, 'nc': 765, 'nc2': 769, 'n2': 726, 'oc': 851, 'c2n1c3cc4ccc': 222, 'c3': 241, 'n4ccc5': 757, 'ccn': 551, 'cc5': 367, 'c6': 314, 'c6cl': 318, 'c7': 319, 'o1': 844, 'ccc': 371, '1c': 9, 'c2': 193, '3c': 56, 'o3': 846, 'cn': 604, 'ccc2': 384, 'cco2': 575, 'c4': 279, 'cl': 603, 'n5ccccc5': 762, 'oc1': 852, 'oc2': 859, 'c1o': 182, 'c1cl': 141, 'n1ccccc1cco': 719, 'coc1': 652, 'c2c': 199, 'nc3': 774, 'ccccoc': 510, 'cccccccc': 421, 'occcc': 885, 'c1ccn': 136, 'cn1cn': 617, 'sc1': 946, 'cccccccccc': 430, 'n2c': 727, 'cc4': 364, 'c3n': 263, 'na': 764, 'ccoc1c': 578, 'os': 936, 'coccoccn': 672, 'cc2cc': 358, 'c1cc1': 113, 'nn': 813, 'sc': 945, 'ccccc': 404, 'coc': 651, 'csc': 694, 'n3': 742, 'ccoc': 576, 'csc1': 695, 'c2n3ccccn3c': 226, 's2': 940, 'ccl': 550, 'ccc1': 372, 'cc2': 354, 'n1': 711, 'c3cl': 261, 'cccsp': 548, 'occ': 872, 'br': 98, 'c23': 195, 'c1i': 174, 'np': 837, 'nc1': 766, 'n1c': 713, 'cnc': 632, 'c21': 194, 'c1c2': 107, 'nocc2': 834,

(86377, 977)
<class 'scipy.sparse._csr.csr_matrix'>
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [None]:
train = df[df["type"] == "train"]
train.shape

In [None]:
test = df[df["type"] == "test"]
test.shape

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train_X = train.drop(["Id","type", "Chemical_Id", "Expected"], axis=1)
train_X.shape

In [None]:
test_X = test.drop(["Id", "type", "Expected", "Chemical_Id"], axis=1)
test_X.shape

In [None]:
print(train_X.dtypes)

In [None]:
train_Y = train["Expected"]
train_Y.shape

In [None]:
test_X['Assay_Id'] = test_X['Assay_Id'].astype(str).astype(float).astype(int)

In [None]:
train_X['Assay_Id'] = train_X['Assay_Id'].astype(str).astype(float).astype(int)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, Y_train, Y_test = train_test_split(train_X, train_Y, test_size=0.2, random_state=0)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)
pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test, pred)*100, "%")

In [None]:
from xgboost import XGBClassifier
#XGBoost classifier
clf=XGBClassifier(max_depth = 8,n_estimators = 400, random_state = 11)

In [None]:
#Fitting the model with the classifier
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)

acc_score = clf.score(X_train,Y_train)
print(acc_score)

f1_macro_scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='f1_macro')

print("%0.4f f1_score(macro) with a standard deviation of %0.4f" % (f1_macro_scores.mean(), f1_macro_scores.std()))

In [None]:
prd = clf.predict(test_X)
print(prd)
prd1 =  [int(s) for s in prd]
print(prd1)

In [None]:
op = test_data[["Id"]]
op['Predicted'] = prd1
op.to_csv("Output.csv", index=False)

from google.colab import files
files.download("Output.csv")