In [None]:
import numpy as np
import pandas as pd

In [None]:
df=pd.read_pickle('/content/finalDataOfCommonAcidsBases&Salts_withGraphs.pkl')
df.head()
df['Solubility']

0                       82.3 g/100 g at 32 °F (NTP, 1992)
1                                   277000mg/L (at 25 °C)
2       greater than or equal to 100 mg/mL at 70 °F (N...
3                                    0.745g/mL (at 20 °C)
4                                                    None
                              ...                        
2400          Soluble in water, diethyl ether and acetone
2401        Freely soluble in water; insoluble in alcohol
2402    greater than or equal to 100 mg/mL at 68 °F (N...
2403    SOL IN WATER, ALC, ETHER, CARBON DISULFIDE, AC...
2404                                   1g/ 7.5 ml at 25ºC
Name: Solubility, Length: 2405, dtype: object

In [None]:
all_columns = df.columns.tolist()
print(all_columns)

['Chemical_name', 'Category', 'SMILES', 'MolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'HallKierAlpha', 'Kappa1', 'Kappa2', 'Kappa3', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'MeltingPoint', 'BoilingPoint', 'Solubility', 'Graph']


In [None]:
df.drop_duplicates(subset=['Chemical_name'], keep='first', inplace=True)
columns_to_fill = ['MolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons',
                   'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge',
                   'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2',
                   'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'HallKierAlpha',
                   'Kappa1', 'Kappa2', 'Kappa3', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1',
                   'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n',
                   'Chi4v',]

# Handle missing values by using median or mean

for col in columns_to_fill:
    if df[col].isnull().any():
        median_value = df[col].median()
        df[col].fillna(median_value, inplace=True)
print(len(df))

1004


In [None]:
#preprocessing mp and bp
import pandas as pd
from sklearn.impute import KNNImputer
import re

def convert_to_kelvin(sentence):
    pattern = r'(-?\d+(\.\d+)?)\s?(°[CFK])'

    sentence = str(sentence)
    matches = re.findall(pattern, sentence)


    converted_values = []
    for value, _, unit in matches:
        value = float(value)
        if unit == '°C':
            value += 273.15
        elif unit == '°F':
            value = (value - 32) * 5 / 9 + 273.15
        converted_values.append(value)
    if converted_values:
        return converted_values[0]
    else:
        return None

df['MeltingPoint_K'] = df['MeltingPoint'].apply(convert_to_kelvin)

df['BoilingPoint_K'] = df['BoilingPoint'].apply(convert_to_kelvin)

imputer = KNNImputer(n_neighbors=5)
df[['MeltingPoint_K', 'BoilingPoint_K']] = imputer.fit_transform(df[['MeltingPoint_K', 'BoilingPoint_K']])

print(df.head())
print(len(df))
print(df.columns.to_list)
p=df.copy()


               Chemical_name Category                                  SMILES  \
0          Hydrochloric acid     acid                                      Cl   
1           (R)-(+)-Cysteine     acid                     C([C@@H](C(=O)O)N)S   
2                Formic acid     acid                                  C(=O)O   
3           Calcium chloride     salt                      [Cl-].[Cl-].[Ca+2]   
4  Phosphazene base P1-t-Oct     base  CC(C)(C)CC(C)(C)N=P(N(C)C)(N(C)C)N(C)C   

   NumRadicalElectrons  MinPartialCharge    Kappa2     Kappa3  \
0                  0.0         -0.146589  1.738276  10.083103   
1                  0.0         -0.480064  2.872925   2.472042   
2                  0.0         -0.483467  1.470000   1.470000   
3                  0.0         -1.000000  9.148419   3.839740   
4                  0.0         -0.254411  6.256538   5.184275   

                                        MeltingPoint  \
0  -174.6 °F (Melting point is -13.7 °F for a 39....   
1         

In [None]:
print(p.columns.to_list)


In [None]:
# Select only numeric columns for correlation computation
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Compute correlation matrix
corr_matrix = numeric_df.corr()

# Set threshold for correlation
threshold = 0.4

# Find pairs of highly correlated columns
high_corr_pairs = (corr_matrix.abs() > threshold) & (corr_matrix != 1)

# Initialize a set to store columns to drop
columns_to_drop = set()

# Iterate over columns in the DataFrame
for col in high_corr_pairs.columns:
    # Get correlated columns for the current column
    correlated_cols = list(high_corr_pairs.index[high_corr_pairs[col]])

    # Add correlated columns to the set of columns to drop
    for correlated_col in correlated_cols:
        columns_to_drop.add(correlated_col)

# Drop the highly correlated columns from the DataFrame
df.drop(columns=columns_to_drop, inplace=True)

# Get all remaining columns
all_columns = df.columns.tolist()

# Print the remaining columns
print(all_columns)
print(p.head())


['Chemical_name', 'Category', 'SMILES', 'NumRadicalElectrons', 'MinPartialCharge', 'Kappa2', 'Kappa3', 'MeltingPoint', 'BoilingPoint', 'Solubility', 'Graph']
               Chemical_name Category                                  SMILES  \
0          Hydrochloric acid     acid                                      Cl   
1           (R)-(+)-Cysteine     acid                     C([C@@H](C(=O)O)N)S   
2                Formic acid     acid                                  C(=O)O   
3           Calcium chloride     salt                      [Cl-].[Cl-].[Ca+2]   
4  Phosphazene base P1-t-Oct     base  CC(C)(C)CC(C)(C)N=P(N(C)C)(N(C)C)N(C)C   

   NumRadicalElectrons  MinPartialCharge    Kappa2     Kappa3  \
0                  0.0         -0.146589  1.738276  10.083103   
1                  0.0         -0.480064  2.872925   2.472042   
2                  0.0         -0.483467  1.470000   1.470000   
3                  0.0         -1.000000  9.148419   3.839740   
4                  0.0        

In [None]:
print(p)


                       Chemical_name Category  \
0                  Hydrochloric acid     acid   
1                   (R)-(+)-Cysteine     acid   
2                        Formic acid     acid   
3                   Calcium chloride     salt   
4          Phosphazene base P1-t-Oct     base   
...                              ...      ...   
2381  Bromodichloroacetic acid-1-13C     acid   
2382   Magnesium sulfate monohydrate     salt   
2393       Potassium thiocyanate-13C     salt   
2397               di-Boron trioxide     salt   
2400                 Thioacetic acid     acid   

                                      SMILES  NumRadicalElectrons  \
0                                         Cl                  0.0   
1                        C([C@@H](C(=O)O)N)S                  0.0   
2                                     C(=O)O                  0.0   
3                         [Cl-].[Cl-].[Ca+2]                  0.0   
4     CC(C)(C)CC(C)(C)N=P(N(C)C)(N(C)C)N(C)C                  0.0 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
X = p[['Chemical_name', 'Category', 'SMILES', 'NumRadicalElectrons',
       'MinPartialCharge', 'Kappa2', 'Kappa3', 'BoilingPoint_K']]
y = p['MeltingPoint_K']
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
print("Training set feature names:", encoder.get_feature_names_out())
print("Test set feature names:", encoder.get_feature_names_out())


Training set feature names: ['Chemical_name_(-)-Quinic acid' 'Chemical_name_(-)-Shikimic acid'
 'Chemical_name_(1R,2R)-(-)-1,2-Cyclohexanedicarboxylic acid' ...
 'BoilingPoint_K_3873.15' 'BoilingPoint_K_4273.15'
 'BoilingPoint_K_5098.15']
Test set feature names: ['Chemical_name_(-)-Quinic acid' 'Chemical_name_(-)-Shikimic acid'
 'Chemical_name_(1R,2R)-(-)-1,2-Cyclohexanedicarboxylic acid' ...
 'BoilingPoint_K_3873.15' 'BoilingPoint_K_4273.15'
 'BoilingPoint_K_5098.15']


In [None]:
import numpy as np
X_train_array = X_train_encoded.toarray()
X_test_array = X_test_encoded.toarray()
print("Data type of X_train_array:", type(X_train_array))
print("Data type of X_test_array:", type(X_test_array))


Data type of X_train_array: <class 'numpy.ndarray'>
Data type of X_test_array: <class 'numpy.ndarray'>


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
model = xgb.XGBRegressor()
model.fit(X_train_array, y_train)
y_pred = model.predict(X_test_array)
r_squared = model.score(X_test_array, y_test)
print("R-squared:", r_squared)

R-squared: 0.31884397407054754
