In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from tensorflow import keras


In [2]:
df = pd.read_csv('data_GWP_atoms.csv')
df.head()

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,GWP
0,0,-0.6854,0.469773,16.0871,11.416344,0,0,12,4,8,...,6.87132,1.71783,2.31066,2.31066,0.0,10,1,0.282,10,4.44
1,0,0.9449,0.892836,10.8232,7.301172,0,0,8,4,4,...,6.732051,1.683013,4.488034,0.0,0.0,9,0,1.542,12,6.4
2,0,-0.9734,0.947508,18.9987,14.50993,0,0,15,5,10,...,8.849874,1.769975,2.362437,2.362437,0.0,20,2,0.851,14,2.9
3,0,-0.6961,0.484555,20.6715,14.50993,0,0,15,5,10,...,8.696802,1.73936,2.313053,2.313053,0.0,18,2,0.742,16,4.29
4,0,0.1242,0.015426,21.5446,14.50993,0,0,15,5,10,...,8.696802,1.73936,2.329115,2.329115,0.0,18,2,0.579,16,2.9


In [3]:
# Splitting the dataset into features (X) and target (y)
X = df.iloc[:, :-1]  # All columns except the last one (GWP)
y = df.iloc[:, -1]   # The target column (GWP)

#train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)



# # Display the shape of X and y train
# print("Shape of X_train:", train_X.shape)
# print("Shape of y_train:", train_y.shape)

# # Display the shape of X and y test
# print("Shape of X_test:", test_X.shape)
# print("Shape of y_test:", test_y.shape)



In [5]:
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
# Step 1: Variance Thresholding with Column Names
# Get the column indices retained after variance thresholding
threshold = 0.005  # This threshold can be adjusted as needed
variance_filter = VarianceThreshold(threshold=threshold)
# Fit the variance filter to your training data
variance_filter.fit(X) # This line is added to fit the filter to the data
selected_variance_indices = variance_filter.get_support(indices=True)
X_variance_filtered_named = X.iloc[:, selected_variance_indices]

print(X_variance_filtered_named.head())
vf_cols = X_variance_filtered_named.columns

print(vf_cols)

   nAcid   ALogP    ALogp2      AMR       apol  naAromAtom  nAromBond  nAtom  \
0      0 -0.6854  0.469773  16.0871  11.416344           0          0     12   
1      0  0.9449  0.892836  10.8232   7.301172           0          0      8   
2      0 -0.9734  0.947508  18.9987  14.509930           0          0     15   
3      0 -0.6961  0.484555  20.6715  14.509930           0          0     15   
4      0  0.1242  0.015426  21.5446  14.509930           0          0     15   

   nHeavyAtom  nH  ...       AMW    WTPT-1    WTPT-2    WTPT-3    WTPT-4  \
0           4   8  ...  5.004793  6.871320  1.717830  2.310660  2.310660   
1           4   4  ...  8.253513  6.732051  1.683013  4.488034  0.000000   
2           5  10  ...  4.938211  8.849874  1.769975  2.362437  2.362437   
3           5  10  ...  4.938211  8.696802  1.739360  2.313053  2.313053   
4           5  10  ...  4.938211  8.696802  1.739360  2.329115  2.329115   

   WTPT-5  WPATH  WPOL  XLogP  Zagreb  
0     0.0     10     1

In [7]:
# Step 2: Correlation Analysis with Column Names
# Calculate the correlation matrix of the variance-filtered data
X_correlation_matrix_named = X.corr().abs()

# Select the upper triangle of the correlation matrix
upper_triangle_named = X_correlation_matrix_named.where(np.triu(np.ones(X_correlation_matrix_named.shape), k=1).astype(bool))

# Find features with correlation greater than the specified threshold
threshold = 0.95
high_correlation_columns_named = [column for column in upper_triangle_named.columns if any(upper_triangle_named[column] > threshold)]

# Drop the highly correlated features
X_corr_filtered_named = X.drop(columns=high_correlation_columns_named)

# Display the shape and the retained column names after correlation analysis
print(X_corr_filtered_named.head)

cf_cols = X_corr_filtered_named.columns
print(cf_cols)


<bound method NDFrame.head of      nAcid   ALogP     ALogp2      AMR       apol  naAromAtom  nAtom  \
0        0 -0.6854   0.469773  16.0871  11.416344           0     12   
1        0  0.9449   0.892836  10.8232   7.301172           0      8   
2        0 -0.9734   0.947508  18.9987  14.509930           0     15   
3        0 -0.6961   0.484555  20.6715  14.509930           0     15   
4        0  0.1242   0.015426  21.5446  14.509930           0     15   
..     ...     ...        ...      ...        ...         ...    ...   
182      0  0.0211   0.000445  17.4128  10.884758           0     11   
183      0 -1.1870   1.408969  16.2111  10.089551           0     11   
184      0 -0.5281   0.278890  23.7679  14.943137           0     15   
185      0  5.1662  26.689622  49.8066  47.827032           0     37   
186      0 -0.1854   0.034373  14.7090  10.614344           0     11   

     nHeavyAtom  nH  nB  ...  JGI9       JGT         VE1_D         VE2_D  \
0             4   8   0  ... 

In [10]:
# Step 3: Random Forest Feature Selection with Column Names
# Using the RandomForestRegressor to determine feature importance
random_forest_named = RandomForestRegressor(n_estimators=200, random_state=42)
random_forest_named.fit(X, y)

# Use SelectFromModel to select important features based on feature importances from RandomForest
model_named = SelectFromModel(random_forest_named, threshold="mean", prefit=True)
selected_rf_indices = model_named.get_support(indices=True)
X_rf_filtered_named = X.iloc[:, selected_rf_indices]

# Display the shape and the retained column names after Random Forest-based feature selection
print(X_rf_filtered_named.head())
rf_cols = X_rf_filtered_named.columns
print(rf_cols)


    ALogP      AMR  nBr        ATS0m        ATS1v       ATS2e       ATS3e  \
0 -0.6854  16.0871    0   696.888876  2034.933251  141.285376  152.303580   
1  0.9449  10.8232    0  1014.471146  1430.117012  107.329920   82.363392   
2 -0.9734  18.9987    0   843.185125  2687.923385  184.014884  215.188480   
3 -0.6961  20.6715    0   843.185125  2687.923385  184.178432  223.845760   
4  0.1242  21.5446    0   843.185125  2687.923385  184.038600  217.723320   

        ATS4e      ATS0p      ATS1s  ...   MDEC-13  MLFER_BH  MLFER_E  MPC6  \
0   89.278848  12.566807  32.250000  ...  0.000000     0.385    0.205     0   
1    0.000000   7.976750   0.000000  ...  1.000000     0.003   -0.144     0   
2  152.303580  16.244933  37.500000  ...  0.000000     0.385    0.205     0   
3  172.295424  16.244933   0.000000  ...  1.414214     0.403    0.190     0   
4  218.868480  16.244933  38.666667  ...  2.000000     0.403    0.190     0   

   nHeteroRing      JGI2         VE2_D  TopoPSA      SRW2  WTP

In [18]:
import pandas as pd
import numpy as np

# Convert column names to sets for efficient intersection
rf_cols_set = set(rf_cols)
vf_cols_set = set(vf_cols)
cf_cols_set = set(cf_cols)

# Calculate the intersection of the three sets
intersection_cols = list(rf_cols_set.intersection(vf_cols_set, cf_cols_set))

intersection_cols

['GATS2m',
 'ETA_Epsilon_5',
 'MLFER_BH',
 'CIC2',
 'GATS5p',
 'AATSC4s',
 'MATS5p',
 'VE1_Dzs',
 'MATS2p',
 'AATSC1m',
 'bpol',
 'AATSC2p',
 'MIC1',
 'MATS3e',
 'CIC0',
 'ATSC6m',
 'ATSC2m',
 'MDEC-13',
 'GATS1i',
 'ETA_EtaP_F',
 'SM1_Dzs',
 'ATSC4s',
 'SpMax4_Bhv',
 'AMR',
 'AVP-3',
 'nBr',
 'GATS4c',
 'SM1_DzZ',
 'SpAbs_DzZ',
 'AVP-0',
 'ATS1s',
 'ETA_EtaP_L',
 'AATSC4p',
 'SpMax4_Bhm',
 'CIC3',
 'minHBint4',
 'MATS2e',
 'SpMin5_Bhs',
 'SpMin6_Bhs',
 'hmin',
 'MLFER_E',
 'GATS3s',
 'MATS4c',
 'VR1_Dzv',
 'MATS4i',
 'MATS6i',
 'nHeteroRing',
 'AATSC4m',
 'SpMin3_Bhe',
 'ATSC4m',
 'AATSC0m',
 'VR1_Dzs',
 'ETA_Eta_F_L',
 'ATSC6i',
 'SpMax1_Bhm',
 'MATS6c',
 'MATS2v',
 'MATS2c',
 'AATSC2v',
 'ATSC0e',
 'SpMax8_Bhs',
 'GATS3e',
 'nBondsD',
 'ATS0p',
 'MATS4m',
 'ETA_Eta_F',
 'MATS4s',
 'GATS5v',
 'ATSC7s',
 'GATS2i',
 'ATSC1m',
 'MATS2i',
 'GATS1v',
 'ATSC7e',
 'MATS2m',
 'SwHBa',
 'AATS2m',
 'SpMin7_Bhe',
 'AATSC3m',
 'minaasC',
 'maxssCH2',
 'GATS3m',
 'MATS2s',
 'VE3_Dze',
 'ALogP',
 

In [15]:
df = pd.read_csv('data_GWP_atoms.csv')
X = df.iloc[:, :-1]  # All columns except the last one (GWP)
y = df.iloc[:, -1]   # The target column (GWP)

In [16]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.1, random_state=42)


rev_train_X_final = train_X[intersection_cols]
rev_test_X_final = test_X[intersection_cols]


train_y.to_csv('train_y.csv', index=False)
test_y.to_csv('test_y.csv', index=False)
rev_train_X_final.to_csv('rev_train_X_final.csv', index=False)
rev_test_X_final.to_csv('rev_test_X_final.csv', index=False)

print(train_X.shape)
print(test_X.shape)
print(train_y.shape)
print(test_y.shape)

(168, 1444)
(19, 1444)
(168,)
(19,)


In [17]:
rev_train_X_final.head()

Unnamed: 0,GATS2m,ETA_Epsilon_5,MLFER_BH,CIC2,GATS5p,AATSC4s,MATS5p,VE1_Dzs,MATS2p,AATSC1m,...,VE3_Dzm,ATSC5c,MATS4e,BCUTw-1l,GATS5i,AATSC5v,ATSC4i,SpMin8_Bhs,MATS1e,ATSC7v
19,0.47142,1.0631,0.412,0.67927,0.0,0.0,0.0,0.148942,-0.009198,-0.105982,...,-0.57212,0.0,0.0,11.998707,0.0,0.0,0.0,0.0,-0.026729,0.0
124,0.876169,0.82963,0.66,1.311752,0.776223,0.091821,0.190132,0.058714,-0.23847,0.499316,...,-2.745541,0.002927,-0.023815,11.85,0.67197,7.450535,-2.434006,1.027966,-0.016755,-2.345211
118,0.678344,0.90909,0.185,0.8,0.496114,-1.425926,0.330426,0.050197,-0.234917,-0.407907,...,-5.286617,-0.006965,-0.399388,11.85,0.454771,18.424129,-6.049219,0.867193,0.159821,-10.842771
5,0.856628,0.75238,0.436,1.807185,0.0,0.0,0.412218,0.16248,-0.052287,-3.88918,...,-1.216953,0.025333,-0.023352,11.89,0.0,21.858835,-0.751663,0.571429,-0.095039,0.0
148,0.310558,1.04286,0.073,1.228161,0.0,-0.023389,0.0,0.025121,0.428315,-0.193513,...,-0.613407,0.0,-1.220077,11.998635,0.0,0.0,1.303739,0.849903,-0.011928,0.0


In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_X_final)
X_test_scaled = scaler.transform(test_X_final)

# Model 1: SVM Regressor
svm_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
svm_model.fit(X_train_scaled, train_y)
y_pred_svm = svm_model.predict(X_test_scaled)

# Evaluation of SVM
mse_svm = mean_squared_error(test_y, y_pred_svm)
r2_svm = r2_score(test_y, y_pred_svm)

print("SVM Metrics:")
print("Mean Squared Error:", mse_svm)
print("R-squared:", r2_svm)

SVM Metrics:
Mean Squared Error: 11.411460617740474
R-squared: -0.12049732690185322


In [None]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(train_X_final, train_y)
y_pred_rf = rf_model.predict(test_X_final)

# Evaluation of Random Forest
mse_rf = mean_squared_error(test_y, y_pred_rf)
r2_rf = r2_score(test_y, y_pred_rf)

print("Random Forest Metrics:")
print("Mean Squared Error:", mse_rf)
print("R-squared:", r2_rf)


Random Forest Metrics:
Mean Squared Error: 194.63103139815794
R-squared: -18.11092345836516
