In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path under the Kaggle input directory, then print one per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading the CSV file


In [None]:
# Load the raw dataset from the Kaggle input directory.
DATA_PATH = "/kaggle/input/btpdataset/BTP chem.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first 100 rows of the freshly loaded frame.
df.head(100)

## Data Analysis

In [None]:
# Shape of the dataset as a (number of rows, number of columns) tuple.
df.shape

In [None]:
# Column labels of the dataset.
df.columns

In [None]:
# Data type of each column in the dataset.
df.dtypes

#### Every feature is numerical except the LLE system and the name of the ionic liquid

In [None]:
# Summary statistics per numeric column: count, mean, std, min, quartiles
# (the 50% row is the median) and max. The mode is not part of describe().
df.describe()

## Data Preprocessing

In [None]:
# Data cleaning: tidy up column names, drop columns that are not used as
# features, and mark placeholder melting points as missing.
column_renames = {
    ' Distribution Coefﬁcients ': 'Distribution Coefﬁcients',
    ' Selectivities': 'Selectivities',
    'Liquid Liquid Equilibrium System': 'LLE',
}
unused_columns = ['Abbriviations', 'Name of IL', 'Melting Point (C)']

df = df.rename(columns=column_renames)
df.index = df.index + 1  # shift the row labels by one (1-based for a default index)
df = df.drop(columns=unused_columns)
# NOTE(review): 273 appears to be a placeholder for an unknown melting point — confirm with the data source.
df['Melting Point'] = df['Melting Point'].replace(273, np.nan)

## Data Visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import stats

#### Separating categorical and numerical features

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
cat_col = [col for col in df.columns if df[col].dtype == 'object']
# Keep the DataFrame's own column order instead of going through a set, whose
# iteration order changes between kernel runs and would reorder anything that
# loops over num_col.
num_col = [col for col in df.columns if col not in cat_col]

### Distribution Plot for each column

In [None]:
# One distribution plot per numerical column.
# sns.displot is a figure-level function that creates its own figure, so no
# plt.figure() call is made here (it would only leave an empty figure behind
# on every iteration).
for column in num_col:
    sns.displot(df[column])

### Correlation plot for each physical property with Distribution Coefficient

In [None]:
# Viscosity against the distribution coefficient.
sns.regplot(x="Viscosity", y="Distribution Coefﬁcients", data=df)


In [None]:
# NOTE(review): identical to the preceding cell — looks like an accidental duplicate.
sns.regplot(data=df, x="Viscosity", y="Distribution Coefﬁcients")


In [None]:
# Density against the distribution coefficient.
sns.regplot(x="Density", y="Distribution Coefﬁcients", data=df)


In [None]:
# Molecular mass against the distribution coefficient.
sns.regplot(x="Molecular Mass", y="Distribution Coefﬁcients", data=df)


In [None]:
# Refractive index against the distribution coefficient.
sns.regplot(x="Refractive index", y="Distribution Coefﬁcients", data=df)

In [None]:
# Melting point against the distribution coefficient.
sns.regplot(x="Melting Point", y="Distribution Coefﬁcients", data=df)

In [None]:
# Conductivity against the distribution coefficient.
sns.regplot(x="Conductivity", y="Distribution Coefﬁcients", data=df)

## Each physical property with Selectivity

In [None]:
# Viscosity against selectivity.
sns.regplot(x="Viscosity", y="Selectivities", data=df)

In [None]:
# Density against selectivity.
sns.regplot(x="Density", y="Selectivities", data=df)

In [None]:
# Molecular mass against selectivity.
sns.regplot(x="Molecular Mass", y="Selectivities", data=df)

In [None]:
# Melting point against selectivity.
sns.regplot(x="Melting Point", y="Selectivities", data=df)

In [None]:
# Refractive index against selectivity.
sns.regplot(x="Refractive index", y="Selectivities", data=df)

In [None]:
# Conductivity against selectivity.
sns.regplot(x="Conductivity", y="Selectivities", data=df)

In [None]:
# The two target quantities against each other.
sns.regplot(x="Distribution Coefﬁcients", y="Selectivities", data=df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### Count of data for each LLE system

In [None]:
# Number of rows available for each LLE system.
sns.countplot(data=df, x='LLE')

In [None]:
# Pairwise correlation between the numeric columns.
# Non-numeric columns (the LLE label is still present at this point) are
# excluded explicitly: DataFrame.corr() raises on string data in pandas >= 2.0,
# whereas older versions silently dropped such columns.
corrMatrix = df.select_dtypes(include='number').corr()
print(corrMatrix)

In [None]:
#Heatmap showing correlation of each column
import seaborn as sn
import matplotlib.pyplot as plt
# Draw the correlation matrix as a heatmap with the coefficient written in each cell.
sn.heatmap(data=corrMatrix, annot=True)
plt.show()

## Converting Categorical feature into Numerical feature

In [None]:
# Display the cleaned frame before the LLE column is one-hot encoded.
df

In [None]:
# One-hot encode the LLE label; the new indicator columns are prefixed with "LLE".
y = pd.get_dummies(df['LLE'], prefix='LLE')

In [None]:
# Shape of the one-hot encoded LLE frame.
y.shape

In [None]:
# Display the one-hot encoded LLE columns.
y

In [None]:
# Shape of the main frame, for comparison with the encoded LLE frame.
df.shape

In [None]:
# Remove the raw categorical column and remember the remaining column labels.
df = df.drop(columns=['LLE'])
col = df.columns


In [None]:
# Display the frame after the LLE column has been dropped.
df

### Scaling numerical features

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every remaining column with StandardScaler.
# NOTE(review): the scaler is fitted on the full frame — target columns
# included — before any train/validation split is made.
scaler = StandardScaler()
scaler.fit(df.values)
scaled_values = scaler.transform(df.values)
# Row labels start at 1, mirroring the shift applied to df during cleaning.
scaled_df = pd.DataFrame(
    scaled_values,
    columns=col,
    index=pd.RangeIndex(1, len(scaled_values) + 1),
)
scaled_df

In [None]:
# Append the one-hot LLE columns to both the raw and the scaled frame.
df, scaled_df = (
    pd.concat([frame, y], axis="columns", sort=False)
    for frame in (df, scaled_df)
)

In [None]:
# Display the unscaled frame with the LLE indicator columns attached.
df

In [None]:
# Display the scaled frame with the LLE indicator columns attached.
scaled_df

## Splitting Dataset into training and testing data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for validation; the fixed seed keeps the split reproducible.
train, val = train_test_split(
    df,
    test_size=0.15,
    random_state=2,
)

### Extracting features and dependent variable

In [None]:
# The two regression targets; every other column is a feature.
target_columns = ['Distribution Coefﬁcients', 'Selectivities']

# Feature matrices
X_train = train.drop(columns=target_columns)
X_val = val.drop(columns=target_columns)

# Target 1: selectivities
y_train_s, y_val_s = train['Selectivities'], val['Selectivities']

# Target 2: distribution coefficients
y_train_dc, y_val_dc = train['Distribution Coefﬁcients'], val['Distribution Coefﬁcients']




In [None]:
# Display the training feature matrix.
X_train

## Missing Values Imputing

In [None]:
# Share of missing values per column, in percent.
missing_counts = df.isnull().sum()
percent_missing = missing_counts * 100 / len(df)
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

In [None]:
# Mean imputation of the numeric physical-property columns.
# - The means are taken from the training split only, so no information from
#   the validation rows leaks into the features.
# - fillna() with reassignment is used instead of chained
#   `X[col].replace(..., inplace=True)` calls: those act on a column object
#   pulled out of a slice and are not guaranteed to modify the frame
#   (SettingWithCopy / pandas copy-on-write).
# - 'Molecular Mass' is filled in the training split as well as in the
#   validation split.
mean_imputed_columns = [
    'Density',
    'Viscosity',
    'Melting Point',
    'Refractive index',
    'Conductivity',
    'Molecular Mass',
]
train_means = X_train[mean_imputed_columns].mean()

# Filling with a Series matches on column labels, so only the columns listed
# above are touched.
X_train = X_train.fillna(train_means)
X_val = X_val.fillna(train_means)

# Show which training columns (if any) still contain missing values.
X_train.isnull().any()


In [None]:
# Missing values treatment with Iterative Imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit the iterative imputer on the training features only, then apply it to
# both splits.
imp = IterativeImputer(max_iter=150, random_state=0)
imp.fit(X_train.values)
imputed_array = imp.transform(X_train.values)

col = X_train.columns
X_train = pd.DataFrame(data=imputed_array, columns=col, index=X_train.index)
# Transform the raw values (the imputer was fitted on an array, not a frame)
# and keep the validation row labels so X_val stays aligned with the
# validation targets.
X_val = pd.DataFrame(imp.transform(X_val.values), columns=col, index=X_val.index)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def score_check(model, X_train, X_val, y_train, y_val):
    """Fit ``model`` and print its training R2, validation R2 and validation RMSE.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_train, y_train
        Features and target the model is fitted on.
    X_val, y_val
        Held-out features and target used for the reported R2 and RMSE.

    Returns
    -------
    None
        The three scores are printed, rounded to six decimals.
    """
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)

    trainingscore = round(r2_score(y_train, train_predictions), 6)
    r2score = round(r2_score(y_val, val_predictions), 6)
    # RMSE is computed as sqrt(MSE) explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated and absent from recent scikit-learn
    # releases. For a single target column the value is identical.
    rmse_score = round(mean_squared_error(y_val, val_predictions) ** 0.5, 6)

    print("training Score = {}".format(trainingscore))
    print("R2_Score = {}".format(r2score))
    print("RMSE_Score = {}" .format(rmse_score))
    


    

In [None]:
def scores_check(model, X_train, X_val, y_train, y_val):
    """Fit ``model`` and print its training R2, validation R2 and validation RMSE.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_train, y_train
        Features and target the model is fitted on.
    X_val, y_val
        Held-out features and target used for the reported R2 and RMSE.

    Returns
    -------
    None
        The three scores are printed, rounded to six decimals.
    """
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)

    trainingscore = round(r2_score(y_train, train_predictions), 6)
    r2score = round(r2_score(y_val, val_predictions), 6)
    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn releases
    # no longer accept.
    rmse_score = round(mean_squared_error(y_val, val_predictions) ** 0.5, 6)

    print("training Score = {}".format(trainingscore))
    # Report the R2 that was actually computed on the validation data rather
    # than a fixed literal.
    print("R2_Score = {}".format(r2score))
    print("RMSE_Score = {}" .format(rmse_score))
    

## XGB on Selectivities

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

In [None]:
# XGBoost regressor with library defaults, scored on the selectivity target.
XGB1 = xgb.XGBRegressor()
score_check(
    model=XGB1,
    X_train=X_train,
    X_val=X_val,
    y_train=y_train_s,
    y_val=y_val_s,
)

## XGB on Distribution Coefficients

In [None]:
# Explicitly configured XGBoost regressor, scored on the distribution-coefficient target.
xgb_dc_params = dict(
    learning_rate=0.1,
    n_estimators=1200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.5,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
XGB2 = xgb.XGBRegressor(**xgb_dc_params)
score_check(
    model=XGB2,
    X_train=X_train,
    X_val=X_val,
    y_train=y_train_dc,
    y_val=y_val_dc,
)

## Random Forest on Selectivities 

In [None]:
# Random forest on the selectivity target.
# A fixed random_state makes the forest's random sampling — and therefore the
# printed scores — reproducible across runs.
rf = RandomForestRegressor(random_state=0)
score_check(rf, X_train, X_val, y_train_s, y_val_s)

## Random forest on Distribution Coefficient

In [None]:
# Random forest on the distribution-coefficient target.
# A fixed random_state makes the forest's random sampling — and therefore the
# printed scores — reproducible across runs.
rf = RandomForestRegressor(random_state=0)
score_check(rf, X_train, X_val, y_train_dc, y_val_dc)

## Preprocessing the scaled dataset for Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
# Split the standardised frame with the same test size and seed as the raw split.
# Note that this rebinds `train` and `val`.
train, val = train_test_split(scaled_df, test_size=0.15, random_state=2)

scaled_target_columns = ['Distribution Coefﬁcients', 'Selectivities']
scaled_X_train = train.drop(columns=scaled_target_columns)
scaled_X_val = val.drop(columns=scaled_target_columns)

scaled_y_train_s, scaled_y_val_s = train['Selectivities'], val['Selectivities']
scaled_y_train_dc, scaled_y_val_dc = train['Distribution Coefﬁcients'], val['Distribution Coefﬁcients']
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit the iterative imputer on the scaled training features only, then apply
# it to both scaled splits.
imp = IterativeImputer(max_iter=150, random_state=0)
imp.fit(scaled_X_train.values)
imputed_array = imp.transform(scaled_X_train.values)

# Take the column labels from the scaled frame itself rather than from a
# different variable that merely happens to share them.
col = scaled_X_train.columns
scaled_X_train = pd.DataFrame(data=imputed_array, columns=col, index=scaled_X_train.index)
# Transform the raw values (the imputer was fitted on an array) and keep the
# validation row labels so the features stay aligned with scaled_y_val_*.
scaled_X_val = pd.DataFrame(
    imp.transform(scaled_X_val.values),
    columns=col,
    index=scaled_X_val.index,
)


In [None]:
# Display the imputed, scaled validation features.
scaled_X_val

In [None]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Linear regression on the scaled features, distribution-coefficient target.
lr = LinearRegression()
score_check(
    model=lr,
    X_train=scaled_X_train,
    X_val=scaled_X_val,
    y_train=scaled_y_train_dc,
    y_val=scaled_y_val_dc,
)

In [None]:
# Linear regression on the scaled features, selectivity target.
# Reported through score_check, the same helper used for the other models, so
# all printed metrics are computed the same way.
lr = LinearRegression()
score_check(lr, scaled_X_train, scaled_X_val, scaled_y_train_s, scaled_y_val_s)

In [None]:
# Linear regression on the unscaled features, distribution-coefficient target.
lr = LinearRegression()
score_check(
    model=lr,
    X_train=X_train,
    X_val=X_val,
    y_train=y_train_dc,
    y_val=y_val_dc,
)

## Support Vector Machine Regressor on Distribution Coefficient

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Support-vector regressor preceded by feature standardisation.
regr = make_pipeline(
    StandardScaler(),
    SVR(C=1.0, epsilon=0.2),
)


In [None]:
# SVR pipeline on the distribution-coefficient target.
score_check(
    model=regr,
    X_train=X_train,
    X_val=X_val,
    y_train=y_train_dc,
    y_val=y_val_dc,
)

## Support Vector Machine Regressor on Selectivities

In [None]:
# SVR pipeline (`regr`) on the selectivity target, matching this section's heading.
score_check(regr, X_train, X_val, y_train_s, y_val_s)

In [None]:
# Unwrap the feature frames and the selectivity targets into plain NumPy arrays.
X_train, X_val = X_train.to_numpy(), X_val.to_numpy()
y_train_s, y_val_s = np.array(y_train_s), np.array(y_val_s)

## Artificial Neural Network (ANN)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from sklearn.metrics import r2_score
import tensorflow as tf

In [None]:
# Feed-forward regressor: two tanh hidden layers (264 and 128 units) and a
# single linear output unit.
# The input width is read from the feature matrix rather than hard-coded, so
# the network keeps matching the data if the number of feature columns changes.
n_features = X_train.shape[1]

model = Sequential()

model.add(Dense(units = 264, kernel_initializer = 'uniform', activation = 'tanh', input_dim = n_features))
model.add(Dense(units = 128, kernel_initializer = 'uniform', activation = 'tanh'))
model.add(Dense(units = 1, activation = 'linear'))
model.summary()

In [None]:
# Train on the selectivity target with MSE loss, tracking RMSE.
# `metrics` is passed as a list, the form Keras documents; newer Keras
# releases reject a bare metric object.
model.compile(optimizer = 'adam', loss = 'mse', metrics = [tf.keras.metrics.RootMeanSquaredError()])
model.fit(X_train, y_train_s, epochs = 200, batch_size = 8)

In [None]:
# Validation R2 of the network on the selectivity target.
y_pred_s = model.predict(X_val)
# Report R2 exactly as computed; flipping its sign would misstate the fit
# (a negative R2 means the model does worse than predicting the mean).
score = r2_score(y_val_s, y_pred_s)
print(round(score, 6))

In [None]:
# Actual (dotted) and predicted selectivities for the validation rows.
for series, line_kwargs in ((y_val_s, {'linestyle': 'dotted'}), (y_pred_s, {})):
    plt.plot(series, **line_kwargs)
plt.show()

In [None]:
# Fresh network with the same architecture: two tanh hidden layers (264 and
# 128 units) and a single linear output unit.
# The input width is read from the feature matrix rather than hard-coded, so
# the network keeps matching the data if the number of feature columns changes.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(units = 264, kernel_initializer = 'uniform', activation = 'tanh', input_dim = n_features))
model.add(Dense(units = 128, kernel_initializer = 'uniform', activation = 'tanh'))
model.add(Dense(units = 1, activation = 'linear'))
model.summary()

In [None]:
# Train on the distribution-coefficient target with MSE loss, tracking RMSE.
# `metrics` is passed as a list, the form Keras documents; newer Keras
# releases reject a bare metric object.
model.compile(optimizer = 'adam', loss = 'mse', metrics = [tf.keras.metrics.RootMeanSquaredError()])
# model.fit returns the training History object, kept here as model_saved.
model_saved = model.fit(X_train, y_train_dc, epochs = 300, batch_size = 8)

In [None]:
# Validation R2 of the network on the distribution-coefficient target.
y_pred_dc = model.predict(X_val)
score = r2_score(y_true=y_val_dc, y_pred=y_pred_dc)
print(round(score, 6))