<a href="https://colab.research.google.com/github/sanejait/ChemicalToxicityPrediction_FALCONS/blob/main/ChemicalToxicityPrediction_FALCONS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authors - Sahil Aneja, Rahul Ananda Bijai
# Team - FALCONS
# This python program is used to predict the toxicity outcome of a set of chemicals

# Importing all the required packages
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
from sklearn.feature_selection import RFE,SelectFromModel,SelectKBest,chi2



In [None]:
# Read the labelled training rows (columns: Id, Expected) and preview them
train_data = pd.read_csv("train.csv")

train_data.head(10)

Unnamed: 0,Id,Expected
0,2971-36-0;1644,2
1,693-54-9;2451,2
2,7173-51-5;1384,2
3,138261-41-3;16,2
4,7681-82-5;1856,2
5,13194-48-4;1646,2
6,1763-23-1;1374,1
7,1689-84-5;1611,2
8,60-01-5;1852,2
9,121-75-5;1647,2


In [None]:
# Read the unlabelled test rows (single id column named 'x') and preview them
test_data = pd.read_csv("test.csv")

test_data.head(10)

Unnamed: 0,x
0,88-60-8;1682
1,122931-48-0;1656
2,NOCAS_47311;36
3,55589-62-3;1850
4,79902-63-9;30
5,NOCAS_47353;1372
6,51338-27-3;4
7,123-31-9;1382
8,120-83-2;1388
9,119515-38-7;1644


In [None]:
# Break the ';'-separated 'Id' into its two parts: the part before the
# separator becomes 'c_id', the part after it becomes 'assay_id'
splitted_train_data = train_data["Id"].str.split(";", expand=True)
train_data["c_id"] = splitted_train_data[0]
train_data["assay_id"] = splitted_train_data[1]
train_data.head(10)

Unnamed: 0,Id,Expected,c_id,assay_id
0,2971-36-0;1644,2,2971-36-0,1644
1,693-54-9;2451,2,693-54-9,2451
2,7173-51-5;1384,2,7173-51-5,1384
3,138261-41-3;16,2,138261-41-3,16
4,7681-82-5;1856,2,7681-82-5,1856
5,13194-48-4;1646,2,13194-48-4,1646
6,1763-23-1;1374,1,1763-23-1,1374
7,1689-84-5;1611,2,1689-84-5,1611
8,60-01-5;1852,2,60-01-5,1852
9,121-75-5;1647,2,121-75-5,1647


In [None]:
# Same split for the test set, whose combined identifier lives in column 'x'
splitted_test_data = test_data["x"].str.split(";", expand=True)
test_data["c_id"] = splitted_test_data[0]
test_data["assay_id"] = splitted_test_data[1]
test_data.head(10)

Unnamed: 0,x,c_id,assay_id
0,88-60-8;1682,88-60-8,1682
1,122931-48-0;1656,122931-48-0,1656
2,NOCAS_47311;36,NOCAS_47311,36
3,55589-62-3;1850,55589-62-3,1850
4,79902-63-9;30,79902-63-9,30
5,NOCAS_47353;1372,NOCAS_47353,1372
6,51338-27-3;4,51338-27-3,4
7,123-31-9;1382,123-31-9,1382
8,120-83-2;1388,120-83-2,1388
9,119515-38-7;1644,119515-38-7,1644


In [None]:
# Read the feature matrix; column V1 holds the identifier that is later
# matched against 'c_id'
features_data = pd.read_csv("feamat.csv")
features_data.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,...,V1036,V1037,V1038,V1039,V1040,V1041,V1042,V1043,V1044,V1045,V1046,V1047,V1048,V1049,V1050,V1051,V1052,V1053,V1054,V1055,V1056,V1057,V1058,V1059,V1060,V1061,V1062,V1063,V1064,V1065,V1066,V1067,V1068,V1069,V1070,V1071,V1072,V1073,V1074,V1075
0,60-35-5,178,59.037114,-0.808,43.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.755965,0,0,9,2,1,5.134449,3.498274,4.051736,0.0,0.0,9,2.584963,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,103-90-2,1983,151.063329,0.87,49.33,0.0,0.0,0.0,0.083333,0.142259,0.0,0.0,0.0,0.027778,9.090909,22.785137,6,6,20,2,2,12.908918,11.996548,16.048283,12.32864,9.162458,115,4.459432,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,968-81-0,1989,324.114378,2.96,100.72,0.0,0.0,0.0,0.185395,0.161948,0.0,0.0,0.0,0.12984,18.340265,48.04386,6,6,42,6,2,31.94559,33.058769,42.184565,42.829163,37.981351,452,5.523562,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,520-45-6,122903,168.042259,-0.551,60.44,0.0,0.0,0.0,0.055556,0.198742,0.0,0.0,0.0,0.01701,10.083333,22.622344,0,0,20,4,0,15.097876,13.66046,20.431205,20.32092,21.196989,102,4.584963,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,50594-66-6,44073,360.996485,4.557,89.67,0.0,0.0,0.0,0.136083,0.276855,0.0,0.0,0.0,0.048113,20.3136,38.268551,12,12,31,2,1,40.451408,31.298599,51.693067,52.809931,49.936323,465,5.643856,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,79-06-1,6579,71.037114,-0.516,43.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,10.515965,0,0,10,2,1,6.134449,4.498274,5.051736,2.498274,0.0,19,3.0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,107-13-1,6579,71.037114,-0.516,43.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,10.515965,0,0,10,2,1,6.134449,4.498274,5.051736,2.498274,0.0,19,3.0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,309-00-2,12310947,361.875716,4.156,0.0,0.0,0.0,0.279752,0.970099,2.74808,0.0,0.0,0.230052,0.845332,11.795918,39.534344,0,0,26,0,0,64.277699,32.710624,74.037739,139.929603,113.315438,198,5.392317,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8,97-59-6,204,158.04399,-1.169,113.32,0.0,0.0,0.096225,0.260491,0.271592,0.0,0.0,0.036084,0.047505,9.090909,17.846758,0,0,17,7,4,14.763328,13.159549,19.677887,14.293817,12.23792,100,4.459432,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,107-18-6,70400,242.126681,5.943,0.0,0.0,0.0,0.0,0.306186,0.176777,0.0,0.0,0.0,0.096225,13.959184,44.711895,18,18,34,0,0,18.810205,20.700341,26.400682,26.400682,26.700341,261,5.392317,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Left-join the feature matrix onto the train rows (c_id == V1).
# indicator=True adds a '_merge' column recording where each row matched.
new_train_data = train_data.merge(
    features_data,
    how='left',
    left_on='c_id',
    right_on='V1',
    indicator=True,
)
print(new_train_data)

                    Id  Expected         c_id  ... V1074 V1075  _merge
0       2971-36-0;1644         2    2971-36-0  ...     0     0    both
1        693-54-9;2451         2     693-54-9  ...     0     0    both
2       7173-51-5;1384         2    7173-51-5  ...     0     0    both
3       138261-41-3;16         2  138261-41-3  ...     0     0    both
4       7681-82-5;1856         2    7681-82-5  ...     0     0    both
...                ...       ...          ...  ...   ...   ...     ...
77408     3337-71-1;33         2    3337-71-1  ...     0     0    both
77409  42509-80-8;1632         1   42509-80-8  ...     0     0    both
77410    149-30-4;1373         1     149-30-4  ...     0     0    both
77411       123-05-7;2         2     123-05-7  ...     0     0    both
77412     23128-74-7;2         1   23128-74-7  ...     0     0    both

[77413 rows x 1080 columns]


In [None]:
# Left-join the feature matrix onto the test rows (c_id == V1).
# indicator=True adds a '_merge' column recording where each row matched.
new_test_data = test_data.merge(
    features_data,
    how='left',
    left_on='c_id',
    right_on='V1',
    indicator=True,
)
print(new_test_data)

                      x         c_id assay_id  ... V1074  V1075  _merge
0          88-60-8;1682      88-60-8     1682  ...     0      0    both
1      122931-48-0;1656  122931-48-0     1656  ...     0      0    both
2        NOCAS_47311;36  NOCAS_47311       36  ...     0      0    both
3       55589-62-3;1850   55589-62-3     1850  ...     0      0    both
4         79902-63-9;30   79902-63-9       30  ...     0      0    both
...                 ...          ...      ...  ...   ...    ...     ...
11134    141517-21-7;38  141517-21-7       38  ...     0      0    both
11135        81-90-3;34      81-90-3       34  ...     0      0    both
11136   74223-64-6;1640   74223-64-6     1640  ...     0      0    both
11137        62-73-7;28      62-73-7       28  ...     0      0    both
11138    2634-33-5;1855    2634-33-5     1855  ...     0      0    both

[11139 rows x 1079 columns]


In [None]:
# The raw inputs and the split helpers are no longer needed once merged;
# drop the references to clean up some memory
del test_data, train_data, features_data, splitted_train_data, splitted_test_data

In [None]:
# Remove train columns that hold a single distinct value
# (nunique ignores NaN, so "value + NaN" columns also count as constant)
nunique_train = new_train_data.nunique()
cols_to_drop_train = nunique_train.index[(nunique_train == 1).values]
new_train_data = new_train_data.drop(columns=cols_to_drop_train)
new_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77413 entries, 0 to 77412
Columns: 888 entries, Id to V1073
dtypes: float64(20), int64(864), object(4)
memory usage: 525.1+ MB


In [None]:
# Remove test columns that hold a single distinct value
# (nunique ignores NaN, so "value + NaN" columns also count as constant)
nunique = new_test_data.nunique()
cols_to_drop = nunique.index[(nunique == 1).values]
new_test_data = new_test_data.drop(columns=cols_to_drop)
new_test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11139 entries, 0 to 11138
Columns: 862 entries, x to V1073
dtypes: float64(20), int64(838), object(4)
memory usage: 73.3+ MB


In [None]:
# The identifier columns (Id, c_id, V1) are not model inputs; drop them from Train
new_train_data = new_train_data.drop(columns=['Id', 'c_id', 'V1'])
new_train_data.head()

Unnamed: 0,Expected,assay_id,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V31,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,...,V1009,V1010,V1011,V1012,V1013,V1014,V1015,V1016,V1017,V1018,V1019,V1020,V1021,V1022,V1023,V1025,V1026,V1028,V1029,V1030,V1031,V1032,V1033,V1034,V1035,V1036,V1037,V1038,V1043,V1052,V1055,V1056,V1058,V1059,V1061,V1062,V1063,V1064,V1067,V1073
0,2,1644,76302,315.982463,4.592,40.46,0.0,0.0,0.0,0.166667,0.262892,0.0,0.0,0.0,0.055556,15.39,40.118723,12,12,30,0,2,43.687788,26.519496,59.32253,41.038992,52.085432,290,5.321928,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,2451,12741,156.151415,3.852,17.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,31.73786,0,0,31,1,0,11.774469,10.332092,10.664184,8.332092,7.332092,139,4.321928,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,1384,23558,361.347528,9.912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.231405,74.006064,0,0,72,0,0,32.07293,22.664728,24.332364,22.332364,21.332364,1050326996,5.459432,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,16,86418,255.052302,2.294,83.66,0.0,0.0,0.117851,0.25,0.29741,0.0,0.0,0.05,0.081872,13.432099,31.79193,6,6,27,4,1,28.061789,22.74813,32.352497,26.617141,23.522689,273,5.169925,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,1856,5238,149.894242,1.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,inf,28.95,0,0,2,0,0,115.302448,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# The identifier columns (x, c_id, V1) are not model inputs; drop them from Test
new_test_data = new_test_data.drop(columns=['x', 'c_id', 'V1'])
new_test_data.head()

Unnamed: 0,assay_id,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V31,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,...,V1007,V1008,V1010,V1011,V1012,V1013,V1014,V1015,V1016,V1017,V1018,V1019,V1020,V1021,V1022,V1023,V1025,V1026,V1028,V1029,V1030,V1031,V1033,V1034,V1035,V1036,V1037,V1038,V1043,V1052,V1055,V1056,V1058,V1059,V1061,V1062,V1063,V1064,V1067,V1073
0,1682,6937,164.120115,3.659,20.23,0.0,0.0,0.0,0.068041,0.170103,0.0,0.0,0.0,0.024056,10.083333,30.830688,6,6,28,0,1,12.774469,12.332092,18.664184,16.996276,14.66046,106,4.584963,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1656,91779,431.05694,1.258,183.3,0.0,0.0,0.0,0.151375,0.227671,0.0,0.0,0.0,0.03595,24.271106,52.889481,12,12,45,10,2,47.476691,49.504506,62.259656,71.196703,77.917913,590,5.857981,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,36,53257735,695.250845,6.365,95.92,0.0,0.0,0.174792,0.78911,1.360933,0.0,0.0,0.099536,0.401739,36.36214,104.13172,21,22,88,7,1,68.9681,68.337655,99.85624,110.392516,108.648356,1715,6.754888,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1850,11074431,200.94981,-1.34,68.82,0.0,0.0,0.0,0.058926,0.235702,0.0,0.0,0.0,0.0,11.0,60.315172,0,0,15,5,0,30.182453,20.61302,24.039282,19.054704,13.150049,-1474836500,4.321928,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,30,54454,418.271924,4.775,72.83,0.0,0.0,0.0,0.179152,0.516591,0.0,0.0,0.0,0.105379,24.638672,73.348134,0,0,68,5,1,33.872345,34.324644,51.869858,55.313472,51.866134,632,6.0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# List the non-object Train columns that contain at least one infinite value
floatset = new_train_data.select_dtypes(exclude=['object'])
has_inf = np.isinf(floatset).any()
col_names = has_inf.index.to_series()[has_inf]
print(col_names)

V15    V15
dtype: object


In [None]:
# Find the largest non-infinite value in column V15 of the train dataset and
# replace the infinite values with it
column = new_train_data.loc[new_train_data['V15'] != np.inf, 'V15']
max_value = column.max()
print(max_value)
# Assign the result back to the frame instead of calling replace(..., inplace=True)
# on the selected column: an in-place call on a column selection is chained
# assignment, which does not modify the DataFrame under pandas Copy-on-Write.
new_train_data['V15'] = new_train_data['V15'].replace(np.inf, max_value)

102.513889


In [None]:
# List the non-object Test columns that contain at least one infinite value
floatset = new_test_data.select_dtypes(exclude=['object'])
has_inf = np.isinf(floatset).any()
col_names = has_inf.index.to_series()[has_inf]
print(col_names)

V15    V15
dtype: object


In [None]:
# Find the largest non-infinite value in column V15 of the test dataset and
# replace the infinite values with it
column = new_test_data.loc[new_test_data['V15'] != np.inf, 'V15']
max_value = column.max()
print(max_value)
# Assign the result back to the frame instead of calling replace(..., inplace=True)
# on the selected column: an in-place call on a column selection is chained
# assignment, which does not modify the DataFrame under pandas Copy-on-Write.
new_test_data['V15'] = new_test_data['V15'].replace(np.inf, max_value)

102.513889


In [None]:
# Drop the columns from train set which are not available in test set except 'Expected'
train_columns = new_train_data.columns
test_columns = new_test_data.columns
remove_columns = train_columns.difference(test_columns)
print(remove_columns)
# Don't remove 'Expected' column. Exclude it by name rather than by position:
# Index.difference returns a sorted index, so relying on 'Expected' being the
# first entry only works while no other train-only column sorts before it.
remove_columns = remove_columns.difference(['Expected'])
new_train_data = new_train_data.drop(columns=remove_columns)
new_train_data.info()

Index(['Expected', 'V1002', 'V1009', 'V1032', 'V245', 'V248', 'V251', 'V254',
       'V256', 'V271', 'V297', 'V300', 'V307', 'V319', 'V326', 'V357', 'V358',
       'V366', 'V409', 'V437', 'V720', 'V752', 'V822', 'V939', 'V946', 'V969'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77413 entries, 0 to 77412
Columns: 860 entries, Expected to V1073
dtypes: float64(20), int64(839), object(1)
memory usage: 508.5+ MB


In [None]:
# Summary of the Test Dataset (row count, column count, dtypes) for comparison
# with the Train Dataset summary printed by the previous cell
new_test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11139 entries, 0 to 11138
Columns: 859 entries, assay_id to V1073
dtypes: float64(20), int64(838), object(1)
memory usage: 73.1+ MB


In [None]:
# Min-max scale every Train feature; the 'Expected' label is kept out of the
# scaler and re-attached (by row index) afterwards
x = new_train_data.drop(columns=['Expected']).values  # plain numpy array
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit(x).transform(x)
firstCol = new_train_data["Expected"]
train = pd.DataFrame(x_scaled).join(firstCol)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,835,836,837,838,839,840,841,842,843,844,845,846,847,848,849,850,851,852,853,854,855,856,857,858,Expected
0,0.669931,0.000678,0.171224,0.592731,0.052006,0.0,0.0,0.0,0.046648,0.023239,0.0,0.0,0.0,0.013679,0.133215,0.160843,0.2,0.2,0.14433,0.0,0.08,0.08524,0.175708,0.177127,0.128922,0.129114,0.503415,0.66157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,0.999184,0.000113,0.075526,0.571145,0.021941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08954,0.123629,0.0,0.0,0.149485,0.025,0.0,0.018751,0.068457,0.031841,0.026175,0.018175,0.503415,0.53726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.563851,0.000209,0.198386,0.747914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241075,0.311314,0.0,0.0,0.360825,0.0,0.0,0.061041,0.150168,0.072652,0.070156,0.052881,0.751707,0.678663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.005712,0.000768,0.134742,0.525699,0.107535,0.0,0.0,0.129409,0.069972,0.02629,0.0,0.0,0.053082,0.020159,0.113737,0.123869,0.1,0.1,0.128866,0.1,0.04,0.052684,0.15072,0.096599,0.083616,0.05831,0.503415,0.642674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,0.756426,4.7e-05,0.07178,0.489411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.11125,0.0,0.0,0.0,0.0,0.0,0.234443,0.0,0.0,0.0,0.0,0.503415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [None]:
# Data normalization with sklearn for Test Dataset.
# Reuse the scaler that was fitted on the Train Dataset (min_max_scaler from the
# previous cell) instead of fitting a second scaler on the test rows: a scaler
# fitted on test uses the test min/max, so the same raw value would be mapped to
# a different scaled value than the one the model is trained on.
# Values outside the train range end up outside [0, 1], which is expected.
feature_columns = new_train_data.columns.drop('Expected')
# Select the columns in the exact order the scaler was fitted with
test_values = new_test_data[feature_columns].values  # returns a numpy array
test_scaled = min_max_scaler.transform(test_values)
test = pd.DataFrame(test_scaled)
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,835,836,837,838,839,840,841,842,843,844,845,846,847,848,849,850,851,852,853,854,855,856,857,858
0,0.685435,6.2e-05,0.080297,0.565515,0.026003,0.0,0.0,0.0,0.019044,0.015037,0.0,0.0,0.0,0.005923,0.08042,0.130612,0.1,0.1,0.134021,0.0,0.04,0.020834,0.081708,0.079173,0.053393,0.036342,0.511242,0.569958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.674827,0.000816,0.240124,0.495479,0.23561,0.0,0.0,0.0,0.042368,0.020126,0.0,0.0,0.0,0.008852,0.221572,0.237578,0.2,0.2,0.221649,0.25,0.08,0.093133,0.327998,0.264104,0.22366,0.19315,0.511243,0.728207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.013872,0.4734,0.398309,0.644449,0.123294,0.0,0.0,0.191935,0.220862,0.120303,0.0,0.0,0.10567,0.098919,0.341865,0.486059,0.35,0.366667,0.443299,0.175,0.04,0.137909,0.452779,0.423587,0.346792,0.269327,0.511243,0.839701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.753978,0.098439,0.102349,0.419695,0.08846,0.0,0.0,0.0,0.016493,0.020836,0.0,0.0,0.0,0.0,0.08954,0.273587,0.0,0.0,0.06701,0.125,0.0,0.057102,0.136574,0.101974,0.059859,0.032598,0.157176,0.53726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.011424,0.000484,0.232469,0.598069,0.093614,0.0,0.0,0.0,0.050142,0.045666,0.0,0.0,0.0,0.025947,0.225229,0.336785,0.0,0.0,0.340206,0.125,0.04,0.06479,0.227422,0.22003,0.173764,0.12857,0.511243,0.745861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Feature-only view of the scaled Train Dataset, used for the correlation filter
train_df_corr = train.drop(columns='Expected')

In [None]:
# Function for generating the correlation
def get_correlation(data, threshold):
    """Return the columns of ``data`` that are strongly correlated with an earlier column.

    A column is reported when the absolute value of its correlation (as computed
    by ``data.corr()``) with at least one column positioned before it is strictly
    greater than ``threshold``. The first column of each correlated pair is never
    reported because of that pair alone.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute-correlation cut-off; the comparison is strict (``>``).

    Returns
    -------
    set
        Labels of the columns to treat as redundant.
    """
    corr_matrix = data.corr()
    abs_corr = corr_matrix.abs().values
    # Only pairs (i, j) with j < i count, i.e. the strict lower triangle.
    earlier_pairs = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)
    # NaN correlations (e.g. constant columns) compare False, as in a plain loop.
    with np.errstate(invalid='ignore'):
        above = abs_corr > threshold
    flagged_rows = (above & earlier_pairs).any(axis=1)
    return {corr_matrix.columns[i] for i in np.flatnonzero(flagged_rows)}

In [None]:
# Train features whose absolute correlation with an earlier feature exceeds 0.7
corr_feature = get_correlation(data=train_df_corr, threshold=0.7)
corr_feature

{6,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 17,
 18,
 19,
 22,
 23,
 24,
 25,
 27,
 46,
 57,
 64,
 68,
 79,
 80,
 82,
 83,
 84,
 85,
 87,
 88,
 91,
 94,
 95,
 97,
 105,
 112,
 120,
 123,
 127,
 131,
 134,
 141,
 143,
 145,
 148,
 149,
 151,
 153,
 154,
 158,
 159,
 161,
 167,
 168,
 169,
 170,
 171,
 172,
 174,
 177,
 179,
 180,
 181,
 182,
 185,
 186,
 187,
 189,
 195,
 201,
 202,
 205,
 206,
 207,
 210,
 211,
 213,
 215,
 217,
 219,
 222,
 223,
 228,
 231,
 235,
 236,
 237,
 238,
 240,
 244,
 248,
 249,
 250,
 252,
 254,
 255,
 256,
 257,
 258,
 259,
 260,
 262,
 263,
 266,
 275,
 279,
 280,
 281,
 282,
 285,
 286,
 287,
 289,
 296,
 301,
 304,
 305,
 307,
 310,
 312,
 314,
 317,
 319,
 321,
 322,
 323,
 324,
 330,
 332,
 334,
 335,
 336,
 337,
 339,
 340,
 341,
 342,
 343,
 344,
 345,
 346,
 349,
 352,
 354,
 355,
 356,
 357,
 358,
 360,
 361,
 371,
 372,
 374,
 376,
 378,
 380,
 381,
 382,
 383,
 384,
 385,
 386,
 388,
 389,
 391,
 392,
 393,
 395,
 396,
 397,
 398,
 399,
 400,
 401

In [None]:
# Remove the correlated features (identified on Train) from both Train and Test
correlated_columns = list(corr_feature)
X = train_df_corr.drop(columns=correlated_columns)
test = test.drop(columns=correlated_columns)

In [None]:
# Target labels ('Expected') used by the Feature Selection Process

y = train.loc[:, "Expected"]

In [None]:
# Deleting dataframes which are of no use now, to clean up some memory
del new_train_data, new_test_data, train, train_df_corr

In [None]:
# Feature Selection Process begins here.
# num_feats is passed to every selector below: as n_features_to_select for RFE,
# as max_features (an upper bound) for SelectFromModel, and as k for SelectKBest.
num_feats = 50

In [None]:
# Recursive Feature Elimination driven by a Decision Tree Classifier
dt_estimator = DecisionTreeClassifier(random_state=0)
rfe_selector_DT = RFE(estimator=dt_estimator, n_features_to_select=num_feats, verbose=5)
rfe_selector_DT.fit(X, y)
rfe_support_DT = rfe_selector_DT.get_support()  # boolean mask over X's columns
rfe_feature_DT = X.columns[rfe_support_DT].tolist()
print(f"{len(rfe_feature_DT)} selected features")

Fitting estimator with 366 features.
Fitting estimator with 365 features.
Fitting estimator with 364 features.
Fitting estimator with 363 features.
Fitting estimator with 362 features.
Fitting estimator with 361 features.
Fitting estimator with 360 features.
Fitting estimator with 359 features.
Fitting estimator with 358 features.
Fitting estimator with 357 features.
Fitting estimator with 356 features.
Fitting estimator with 355 features.
Fitting estimator with 354 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
Fitting estimator with 351 features.
Fitting estimator with 350 features.
Fitting estimator with 349 features.
Fitting estimator with 348 features.
Fitting estimator with 347 features.
Fitting estimator with 346 features.
Fitting estimator with 345 features.
Fitting estimator with 344 features.
Fitting estimator with 343 features.
Fitting estimator with 342 features.
Fitting estimator with 341 features.
Fitting estimator with 340 features.
F

In [None]:
# Recursive Feature Elimination driven by a Random Forest Classifier
rf_estimator = RandomForestClassifier(n_jobs=-1, random_state=0)
rfe_selector_RF = RFE(estimator=rf_estimator, n_features_to_select=num_feats, verbose=5)
rfe_selector_RF.fit(X, y)
rfe_support_RF = rfe_selector_RF.get_support()  # boolean mask over X's columns
rfe_feature_RF = X.columns[rfe_support_RF].tolist()
print(f"{len(rfe_feature_RF)} selected features")

Fitting estimator with 366 features.
Fitting estimator with 365 features.
Fitting estimator with 364 features.
Fitting estimator with 363 features.
Fitting estimator with 362 features.
Fitting estimator with 361 features.
Fitting estimator with 360 features.
Fitting estimator with 359 features.
Fitting estimator with 358 features.
Fitting estimator with 357 features.
Fitting estimator with 356 features.
Fitting estimator with 355 features.
Fitting estimator with 354 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
Fitting estimator with 351 features.
Fitting estimator with 350 features.
Fitting estimator with 349 features.
Fitting estimator with 348 features.
Fitting estimator with 347 features.
Fitting estimator with 346 features.
Fitting estimator with 345 features.
Fitting estimator with 344 features.
Fitting estimator with 343 features.
Fitting estimator with 342 features.
Fitting estimator with 341 features.
Fitting estimator with 340 features.
F

In [None]:
# Recursive Feature Elimination driven by an XGB Classifier
xgb_estimator = XGBClassifier(n_jobs=-1, random_state=0)
rfe_selector_XGB = RFE(estimator=xgb_estimator, n_features_to_select=num_feats, verbose=5)
rfe_selector_XGB.fit(X, y)
rfe_support_XGB = rfe_selector_XGB.get_support()  # boolean mask over X's columns
rfe_feature_XGB = X.columns[rfe_support_XGB].tolist()
print(f"{len(rfe_feature_XGB)} selected features")

Fitting estimator with 366 features.
Fitting estimator with 365 features.
Fitting estimator with 364 features.
Fitting estimator with 363 features.
Fitting estimator with 362 features.
Fitting estimator with 361 features.
Fitting estimator with 360 features.
Fitting estimator with 359 features.
Fitting estimator with 358 features.
Fitting estimator with 357 features.
Fitting estimator with 356 features.
Fitting estimator with 355 features.
Fitting estimator with 354 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
Fitting estimator with 351 features.
Fitting estimator with 350 features.
Fitting estimator with 349 features.
Fitting estimator with 348 features.
Fitting estimator with 347 features.
Fitting estimator with 346 features.
Fitting estimator with 345 features.
Fitting estimator with 344 features.
Fitting estimator with 343 features.
Fitting estimator with 342 features.
Fitting estimator with 341 features.
Fitting estimator with 340 features.
F

In [None]:
# Recursive Feature Elimination driven by an LGBM Classifier
lgbm_estimator = LGBMClassifier(n_jobs=-1, random_state=0)
rfe_selector_LGBM = RFE(estimator=lgbm_estimator, n_features_to_select=num_feats, verbose=5)
rfe_selector_LGBM.fit(X, y)
rfe_support_LGBM = rfe_selector_LGBM.get_support()  # boolean mask over X's columns
rfe_feature_LGBM = X.columns[rfe_support_LGBM].tolist()
print(f"{len(rfe_feature_LGBM)} selected features")

Fitting estimator with 366 features.
Fitting estimator with 365 features.
Fitting estimator with 364 features.
Fitting estimator with 363 features.
Fitting estimator with 362 features.
Fitting estimator with 361 features.
Fitting estimator with 360 features.
Fitting estimator with 359 features.
Fitting estimator with 358 features.
Fitting estimator with 357 features.
Fitting estimator with 356 features.
Fitting estimator with 355 features.
Fitting estimator with 354 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
Fitting estimator with 351 features.
Fitting estimator with 350 features.
Fitting estimator with 349 features.
Fitting estimator with 348 features.
Fitting estimator with 347 features.
Fitting estimator with 346 features.
Fitting estimator with 345 features.
Fitting estimator with 344 features.
Fitting estimator with 343 features.
Fitting estimator with 342 features.
Fitting estimator with 341 features.
Fitting estimator with 340 features.
F

In [None]:
# SelectFromModel with a Decision Tree Classifier; num_feats is only an upper
# bound here (max_features).
# NOTE(review): the recorded output shows 21 features, i.e. fewer than num_feats.
SFM_selector_DT = SelectFromModel(
    DecisionTreeClassifier(random_state=0),
    max_features=num_feats,
)
SFM_selector_DT.fit(X, y)
SFM_support_DT = SFM_selector_DT.get_support()  # boolean mask over X's columns
SFM_feature_DT = X.columns[SFM_support_DT].tolist()
print(f"{len(SFM_feature_DT)} selected features")

21 selected features


In [None]:
# SelectFromModel with a Random Forest Classifier; num_feats is only an upper
# bound here (max_features).
# NOTE(review): the recorded output shows 7 features, i.e. fewer than num_feats.
SFM_selector_RF = SelectFromModel(
    RandomForestClassifier(n_jobs=-1, random_state=0),
    max_features=num_feats,
)
SFM_selector_RF.fit(X, y)
SFM_support_RF = SFM_selector_RF.get_support()  # boolean mask over X's columns
SFM_feature_RF = X.columns[SFM_support_RF].tolist()
print(f"{len(SFM_feature_RF)} selected features")

7 selected features


In [None]:
# SelectFromModel with an XGB Classifier; num_feats is passed as max_features
SFM_selector_XGB = SelectFromModel(
    XGBClassifier(n_jobs=-1, random_state=0),
    max_features=num_feats,
)
SFM_selector_XGB.fit(X, y)
SFM_support_XGB = SFM_selector_XGB.get_support()  # boolean mask over X's columns
SFM_feature_XGB = X.columns[SFM_support_XGB].tolist()
print(f"{len(SFM_feature_XGB)} selected features")

50 selected features


In [None]:
# SelectFromModel with a Light GBM Classifier; num_feats is only an upper
# bound here (max_features).
# NOTE(review): the recorded output shows 44 features, i.e. fewer than num_feats.
SFM_selector_LGBM = SelectFromModel(
    LGBMClassifier(n_jobs=-1, random_state=0),
    max_features=num_feats,
)
SFM_selector_LGBM.fit(X, y)

SFM_support_LGBM = SFM_selector_LGBM.get_support()  # boolean mask over X's columns
SFM_feature_LGBM = X.columns[SFM_support_LGBM].tolist()
print(f"{len(SFM_feature_LGBM)} selected features")

44 selected features


In [None]:
# Univariate selection: keep the num_feats features with the best chi2 score
chi_selector = SelectKBest(score_func=chi2, k=num_feats)
chi_selector.fit(X, y)
chi_support = chi_selector.get_support()  # boolean mask over X's columns
skb_chi = X.columns[chi_support].tolist()
print(f"{len(skb_chi)} selected features")

50 selected features


In [None]:
feature_name = X.columns
# One boolean support mask per selector, all aligned with X's columns
selector_masks = {
    'RFE_DT': rfe_support_DT,
    'RFE_RF': rfe_support_RF,
    'RFE_XGB': rfe_support_XGB,
    'RFE_LGBM': rfe_support_LGBM,
    'SFM_DT': SFM_support_DT,
    'SFM_RF': SFM_support_RF,
    'SFM_XGB': SFM_support_XGB,
    'SFM_LGBM': SFM_support_LGBM,
    'SKB_CHI': chi_support,
}
feature_selection_dataset = pd.DataFrame({'Feature': feature_name, **selector_masks})

# Multiplying by 1 turns the boolean masks into 0/1 votes
feature_selection_dataset = feature_selection_dataset.fillna(0) * 1
# Take 'Feature' out while summing so that only the vote columns are added up
fet = feature_selection_dataset.pop('Feature')
feature_selection_dataset['Total'] = feature_selection_dataset.sum(axis=1)
feature_selection_dataset['Feature'] = fet
# Most-voted features first, re-labelled with a 1-based rank index
feature_selection_dataset = feature_selection_dataset.sort_values('Total', ascending=False)
feature_selection_dataset.index = range(1, len(feature_selection_dataset) + 1)

In [None]:
# Selecting the features which were suggested by at least 3 selectors.
# The criterion is applied to the 'Total' vote count directly rather than taking
# a fixed number of top rows, so the selection stays correct if the vote counts
# change. In the recorded run this criterion yielded 80 features.
MIN_SELECTOR_VOTES = 3
enough_votes = feature_selection_dataset['Total'] >= MIN_SELECTOR_VOTES
# feature_selection_dataset is already sorted by 'Total' (descending), so the
# filtered rows keep the most-voted-first order.
total_features_selected = feature_selection_dataset.loc[enough_votes, 'Feature']

In [None]:
# Restrict both Train and Test to the columns chosen by Feature Selection
selected_columns = total_features_selected.tolist()
X_final = X[selected_columns]
y_final = y
test_final = test[selected_columns]

In [None]:
# Convert the final Test frame to a plain numpy array
test_final = test_final.to_numpy()

In [None]:
# Sanity check: (number of test rows, number of selected features)
test_final.shape

(11139, 80)

In [None]:
# Hold out 20% of the labelled rows for internal validation (fixed seed)
split_parts = train_test_split(X_final, y_final, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts
(X_train.shape, X_test.shape)

((61930, 80), (15483, 80))

In [None]:
# Initializing SMOTE (imported from imblearn.over_sampling) for oversampling,
# with a fixed random_state so the resampling is repeatable
smote = SMOTE(random_state= 0)

In [None]:
# Applied SMOTE on X_train and y_train only; the hold-out split is not resampled.
# fit_resample replaces fit_sample, which was deprecated and then removed from
# imbalanced-learn. The inputs are passed as numpy arrays so that plain arrays
# come back, which is the form the hold-out and test data are converted to below.
X_train, y_train = smote.fit_resample(np.asarray(X_train), np.asarray(y_train))



In [None]:
# Convert the hold-out features and labels to plain numpy arrays
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

In [None]:
# Train an XGB Classifier with a fixed set of hyperparameters on the oversampled
# training split, then evaluate it on the untouched hold-out split.
# NOTE(review): the classification report shows labels 1 and 2 while num_class=2;
# some XGBoost releases expect labels 0..num_class-1 - confirm against the
# installed version.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.8,
    gamma=1.4,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=16,
    min_child_weight=5,
    missing=None,
    n_estimators=700,
    n_jobs=-1,
    nthread=None,
    objective='multi:softmax',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=0.8,
    verbosity=1,
    num_class=2,
)
classifier = XGBClassifier(**xgb_params)
classifier.fit(X_train, y_train)

# Internal validation on the hold-out split
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred, average='macro'))

[[ 1355   835]
 [  534 12759]]
Accuracy:  0.9115804430665891
              precision    recall  f1-score   support

           1       0.72      0.62      0.66      2190
           2       0.94      0.96      0.95     13293

    accuracy                           0.91     15483
   macro avg       0.83      0.79      0.81     15483
weighted avg       0.91      0.91      0.91     15483

F1 Score:  0.8067308621037919


In [None]:
# Predict the test dataset and write the submission file.
# test.csv is read again because the original test frame was deleted earlier;
# its 'x' column supplies the ids, in the same row order as test_final.
predictions = classifier.predict(test_final)
test_data = pd.read_csv("test.csv")
output = test_data[['x']].rename(columns={'x': 'Id'}).assign(Predicted=predictions)
output.to_csv('Chemical_Toxicity_Prediction.csv', index=False)
print("Submission done!")

Submission done!
