In [2]:
import pandas as pd

In [3]:
dataset = pd.read_excel("./example_data_mucoadhesive_polymers.xlsx")

In [4]:
dataset.head()

Unnamed: 0,Polymer,Class,Thiolation,TG,TG SD,pH,Prep,AT,AT SD,AT CoV,WA,WA SD,WA CoV
0,Carbopol 971P,Carbopol,not thiolated,0.0,0.0,3.0,lyo,7.2,0.6,8.333333,110.1,23.1,20.980926
1,Carbopol 971P,Carbopol,not thiolated,0.0,0.0,3.0,pr,5.5,0.8,14.545455,130.9,16.7,12.75783
2,Carbopol 971P,Carbopol,not thiolated,0.0,0.0,5.37,lyo,2.6,0.5,19.230769,43.2,6.4,14.814815
3,Carbopol 971P,Carbopol,not thiolated,0.0,0.0,7.0,lyo,3.1,0.4,12.903226,118.7,17.2,14.490312
4,Carbopol 971P,Carbopol,not thiolated,0.0,0.0,7.0,pr,7.2,0.6,8.333333,156.1,35.2,22.549648


In [5]:
y = dataset["WA"]
X = dataset[["Class", "TG", "pH", "Prep", "AT"]]

In [6]:
X.head()

Unnamed: 0,Class,TG,pH,Prep,AT
0,Carbopol,0.0,3.0,lyo,7.2
1,Carbopol,0.0,3.0,pr,5.5
2,Carbopol,0.0,5.37,lyo,2.6
3,Carbopol,0.0,7.0,lyo,3.1
4,Carbopol,0.0,7.0,pr,7.2


In [7]:
y.head()

0    110.1
1    130.9
2     43.2
3    118.7
4    156.1
Name: WA, dtype: float64

## One Hot Encoding

In [14]:
# Show difference in column
pd.DataFrame(X["Class"])

Unnamed: 0,Class
0,Carbopol
1,Carbopol
2,Carbopol
3,Carbopol
4,Carbopol
...,...
74,PCP
75,PCP
76,PCP
77,PCP


In [16]:
# Show what get_dummies does
pd.get_dummies(X["Class"])

Unnamed: 0,Carbopol,PAA100,PAA2,PAA250,PAA45,PAA450,PCP
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
74,0,0,0,0,0,0,1
75,0,0,0,0,0,0,1
76,0,0,0,0,0,0,1
77,0,0,0,0,0,0,1


In [17]:
## Get dummies for each column and assign a name
prep_OHE = pd.get_dummies(X["Prep"])
class_OHE = pd.get_dummies(X["Class"])

In [18]:
## Drop the non-OHE columns
X_drop = X.drop(["Class", "Prep"], axis =1)

In [19]:
## Join the OHE columns to the original data
X = X_drop.join([prep_OHE, class_OHE])

In [20]:
X.head()

Unnamed: 0,TG,pH,AT,lyo,pr,Carbopol,PAA100,PAA2,PAA250,PAA45,PAA450,PCP
0,0.0,3.0,7.2,1,0,1,0,0,0,0,0,0
1,0.0,3.0,5.5,0,1,1,0,0,0,0,0,0
2,0.0,5.37,2.6,1,0,1,0,0,0,0,0,0
3,0.0,7.0,3.1,1,0,1,0,0,0,0,0,0
4,0.0,7.0,7.2,0,1,1,0,0,0,0,0,0


## Split our model into training and testing

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state=42)

In [24]:
X_train.head()

Unnamed: 0,TG,pH,AT,lyo,pr,Carbopol,PAA100,PAA2,PAA250,PAA45,PAA450,PCP
9,0.0,7.0,10.3,0,1,1,0,0,0,0,0,0
64,0.0,3.0,4.1,0,1,0,0,0,0,0,0,1
5,0.0,3.0,8.2,1,0,1,0,0,0,0,0,0
47,113.4,4.0,4.86,1,0,0,0,0,0,0,1,0
34,0.0,3.0,1.1,1,0,0,0,0,0,0,1,0


In [25]:
X_test.head()

Unnamed: 0,TG,pH,AT,lyo,pr,Carbopol,PAA100,PAA2,PAA250,PAA45,PAA450,PCP
30,404.1,5.0,14.91,1,0,0,0,0,1,0,0,0
0,0.0,3.0,7.2,1,0,1,0,0,0,0,0,0
22,0.0,7.0,0.0,1,0,0,1,0,0,0,0,0
31,22.7,7.5,1.17,1,0,0,0,0,1,0,0,0
18,12.15,5.37,1.45,1,0,1,0,0,0,0,0,0


## Define and Fit our model

In [26]:
## Import the relevant library
from sklearn.tree import DecisionTreeRegressor

## Define the model - give it a random state so that it is reproducible
polymer_model = DecisionTreeRegressor(random_state=42)

## Fit the model

In [27]:
# Fit model
polymer_model.fit(X_train, y_train)

## Testing the model

In [25]:
## Give the model the test data through the predict function
polymer_model.predict(X_test)

array([412.3 , 138.94,  25.7 , 152.2 ,  75.4 , 109.89, 107.3 , 242.91,
       219.3 ,  92.72, 179.3 ,  43.57,  53.9 , 191.3 , 103.82, 214.28,
        42.3 ,  53.9 , 707.79,  84.8 ])

In [26]:
## Assign the results of the prediction to a variable
test_results = polymer_model.predict(X_test)

## Get the real results
real_results = y_test

In [54]:
from sklearn.metrics import r2_score

In [55]:
r2_score(test_results, real_results)

0.7138957105690047

In [56]:
from sklearn.metrics import mean_absolute_error

In [57]:
mean_absolute_error(test_results, real_results)

60.72600000000001

## Plotting the results

In [30]:
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
sns.scatterplot(test_results, real_results)
plt.show()

NameError: name 'test_results' is not defined

In [None]:
## Make the plot nicer:
g = sns.scatterplot(test_results, real_results, s = 100, c = 'r')

plt.axis('square')


g.set_xlim(0,800)
g.set_ylim(0,800)

g.plot((0,1000), (0,1000), c = "k", linestyle = "--")

g.set_xlabel("Predicted Total Work of Adhesion", size = 12)
g.set_ylabel("Real Total Work of Adhesion", size = 12)

g.annotate("r2 score: 0.714", (500,100), size = 12)

plt.show()