In [1]:
# Shiwei SU
# Takai Lab, Department of Bioengineering, School of Engineering
# The University of Tokyo, Japan

Prediction Accuracy vs Data Size

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.preprocessing import MaxAbsScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, KFold
from statistics import mean

In [3]:
# Load the dataset; the user types the CSV file name without its extension.
dataset = pd.read_csv(input("Enter the name of the dataset file: ")+".csv")

# Remove the listed columns when present; errors="ignore" skips absent ones.
columns_to_drop = ["ref", "Sample", "Smiles"]
dataset = dataset.drop(columns=columns_to_drop, errors="ignore")

# MaxAbsScaler for descriptors
# The target is selected by its exact name (not by a "Contact" regex) so that
# the column kept unscaled is exactly the column removed from the descriptors.
target_column = "Contact angle (deg)"
CA = dataset[[target_column]]
df_raw = dataset.drop(target_column, axis = 1)
transformer = MaxAbsScaler()
df_scaled = transformer.fit_transform(df_raw)
# Reuse the original index so the column-wise concat below aligns row by row.
dataset_scaled = pd.DataFrame(df_scaled, columns=df_raw.columns, index=df_raw.index)
# Final layout: target in the first column, scaled descriptors after it.
dataset = pd.concat([CA, dataset_scaled], axis = 1)

# Distribution
y_name = "Contact Angle"
number_of_bins = 7
# The histogram is only drawn when the target column has a float dtype.
if pd.api.types.is_float_dtype(dataset[target_column]):
    plt.figure(figsize=(7.5, 3))
    plt.rcParams["font.size"] = 18
    plt.hist(dataset[target_column], bins=number_of_bins, color="dodgerblue")
    plt.xlabel(y_name, font="Arial")
    plt.ylabel("Frequency", font="Arial")
    plt.xticks([0, 30, 60, 90, 120])
    plt.yticks([0,5,10,15,20,25,30])

In [6]:
# Split the frame into a descriptor matrix X (every column except the target)
# and a target vector Y, both as NumPy arrays.
target_column = "Contact angle (deg)"
descriptor_frame = dataset.drop(columns=target_column)
X = np.array(descriptor_frame)
Y = np.array(dataset[target_column])

In [7]:
# Hyperparameter Tuning
# One search grid per model; each grid is handed to GridSearchCV together
# with the matching estimator from `models_param` below.
param_lasso = {"alpha": np.arange(0.05, 3., 0.01),
              "fit_intercept": [True, False]}

param_ridge = {"alpha": np.arange(0.05, 3., 0.01),
              "fit_intercept": [True, False]}

# Grid for plain linear regression. It was referenced in `models_param` but
# never defined, which made this cell fail with a NameError.
param_linear = {"fit_intercept": [True, False]}

param_tree = {"max_depth": np.arange(1, 20),
             "min_samples_leaf": np.arange(1, 10, 1),
             "random_state": [42]}

param_forest = {"max_depth": np.arange(1, 20),
             "min_samples_leaf": np.arange(1, 10, 1),
             "n_estimators": [50, 100, 200, 300],
             "random_state": [42]}

param_knn = {"n_neighbors": np.arange(1, 10),
              "weights": ["uniform", "distance"]}

param_svr = {"C": [0.1, 1, 10, 100, 1000],
              "epsilon": [0.01, 0.1, 1, 10],
              "kernel": ["linear", "poly", "rbf", "sigmoid"]}

param_mlp = {"hidden_layer_sizes": [(50,), (100,), (150,),
                                    (50, 50), (100, 100), (150, 150),
                                    (50, 100), (50, 150), (100, 150),
                                    (50,50,50), (100,100,100), (150, 150, 150),
                                    (50, 100, 50), (50, 150, 50), (100, 100, 150), (100, 150, 100)],
             "activation": ["relu", "logistic", "tanh"],
             "solver": ["adam", "sgd"],
             "alpha": [0.0001, 0.001, 0.01],
             "learning_rate_init": [0.0001, 0.001, 0.01],
             "shuffle": [False],
             "max_iter": [1000],
             # Seeded like the tree, forest and XGBoost grids so that the
             # weight initialisation is reproducible between runs.
             "random_state": [42]}

param_xgb = {"max_depth": np.arange(1, 20),
             "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
             "n_estimators": [50, 100, 150, 250, 500, 1000],
             "random_state": [42]}

# Registry: display name -> unfitted estimator and its search grid.
models_param = {
    "Lasso": {"model": Lasso(), "param": param_lasso},
    "Ridge": {"model": Ridge(), "param": param_ridge},
    "Linear Regression": {"model": LinearRegression(), "param": param_linear},
    "Decision Tree": {"model": DecisionTreeRegressor(), "param": param_tree},
    "Random Forest": {"model": RandomForestRegressor(), "param": param_forest},
    "kNN": {"model": KNeighborsRegressor(), "param": param_knn},
    "SVR": {"model": SVR(), "param": param_svr},
    "MLP": {"model": MLPRegressor(), "param": param_mlp},
    "XGBoost": {"model": XGBRegressor(), "param": param_xgb}
}

# Result container with one (initially empty) entry per registered model,
# derived from the registry so the two can never get out of sync.
optimized_models = {model_name: {} for model_name in models_param}

In [None]:
# Nested cross-validation as a function of training-set size.
# The outer KFold provides the held-out test folds; the inner KFold is used
# by GridSearchCV to tune hyperparameters on the (truncated) training part.
inner_splits = 5
outer_splits = 20
inner = KFold(n_splits=inner_splits, shuffle=True, random_state=42)
outer = KFold(n_splits=outer_splits, shuffle=True, random_state=42)

# Data Preparation
# Training sets grow in steps of `block_size` samples:
# block_size, 2 * block_size, ..., (outer_splits - 1) * block_size.
block_size = len(X) // outer_splits
training_sizes = [(step + 1) * block_size for step in range(outer_splits - 1)]
data_split = {}

for k, (train_idx, test_idx) in tqdm(enumerate(outer.split(X)), total=outer_splits):
    for training_size in training_sizes:
        # Keep only the first `training_size` indices of this outer training fold.
        train_indices = train_idx[: training_size]
        X_train, X_test = X[train_indices], X[test_idx]
        Y_train, Y_test = Y[train_indices], Y[test_idx]
        dict_key1 = f"training_size_{training_size}"
        dict_key2 = f"outer_cv_no_{k + 1}"
        data_split.setdefault(dict_key1, {})[dict_key2] = ((X_train, Y_train), (X_test, Y_test))

# Nested CV
for model_name, model_param in models_param.items():
    print("Training " + model_name + "...")
    # The registry estimator is never rebound inside the loops, so every grid
    # search starts from the same untuned base model.
    base_model = model_param["model"]
    param = model_param["param"]
    test_scores_vs_training_size = []
    for training_size in training_sizes:
        data_key1 = f"training_size_{training_size}"
        print(f"best parameter for training_size={training_size}")
        best_para = []
        train_scores = []
        test_scores = []
        # Visit every outer fold: data_split holds outer_cv_no_1 .. outer_cv_no_{outer_splits}.
        for i in range(outer_splits):
            print(str(data_key1))
            data_key2 = f"outer_cv_no_{i + 1}"
            print(data_key2)
            (X_train, y_train), (X_test, y_test) = data_split[data_key1][data_key2]
            grid = GridSearchCV(base_model, param, cv=inner, n_jobs=-1, verbose=0)
            grid.fit(X_train, y_train)
            best_para.append(grid.best_params_)
            train_scores.append(grid.best_score_)
            # With the default refit=True, best_estimator_ has already been
            # refitted on the full X_train, so no second fit is needed.
            y_pred = grid.best_estimator_.predict(X_test)
            r2_pred = r2_score(y_test, y_pred)
            test_scores.append(r2_pred)
        # Keep the per-fold details for this training size instead of discarding them.
        optimized_models[model_name][data_key1] = {
            "best_params": best_para,
            "train_scores": train_scores,
            "test_scores": test_scores,
        }
        test_scores_mean = mean(test_scores)
        test_scores_vs_training_size.append(test_scores_mean)
    # Mean outer-fold R² for each training size, in increasing size order.
    optimized_models[model_name]["test_scores"] = test_scores_vs_training_size
    print("-------------------------NEXT-------------------------")

In [4]:
# Unpack, for every model, the list of mean test R² scores (one entry per
# training size) stored under "test_scores" in `optimized_models`.
(
    lasso_score,
    ridge_score,
    linear_score,
    tree_score,
    forest_score,
    knn_score,
    svr_score,
    mlp_score,
    xgb_score,
) = (
    optimized_models[model_label]["test_scores"]
    for model_label in (
        "Lasso",
        "Ridge",
        "Linear Regression",
        "Decision Tree",
        "Random Forest",
        "kNN",
        "SVR",
        "MLP",
        "XGBoost",
    )
)


In [None]:
# One figure per model: R² score against training-set size, using only the
# first 16 training sizes. Negative scores are clamped to 0 for display.
scores = [lasso_score, ridge_score, linear_score, tree_score, forest_score, knn_score, svr_score, mlp_score, xgb_score]
# Training-set sizes on the x-axis (loop-invariant, so defined once).
x = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
for score in scores:
    fig, ax = plt.subplots(figsize=(6, 3))
    y = [max(0, value) for value in score[:16]]
    # Trim x to the number of available points so a shorter score list
    # does not raise a shape-mismatch error.
    ax.plot(x[:len(y)], y, linestyle="-")
    ax.set_xlim([0,85])
    ax.set_ylim([-0.05,1.05])
    x_ticks = np.linspace(0, 80, 9)
    plt.xticks(x_ticks, fontweight="bold")
    y_ticks = np.linspace(0, 1, 6)
    plt.yticks(y_ticks, fontweight="bold")
    plt.xlabel("Data Size for Training Set", fontsize=20, fontweight="bold", font="Arial")
    plt.ylabel("$R\u00b2$ Score", fontsize=20, fontweight='bold', font="Arial")
    plt.show()

In [None]:
# Combined figure: every model's R² curve over training-set size on shared axes.
def _clamp_at_zero(values):
    """Return a list copy of `values` with each entry below 0 replaced by 0."""
    return [max(0, value) for value in values]


fig, ax = plt.subplots(figsize=(6, 3))
x = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]

# Clamped curves, one per model.
y_lasso = _clamp_at_zero(lasso_score)
y_ridge = _clamp_at_zero(ridge_score)
y_linear = _clamp_at_zero(linear_score)
y_tree = _clamp_at_zero(tree_score)
y_forest = _clamp_at_zero(forest_score)
y_knn = _clamp_at_zero(knn_score)
y_svr = _clamp_at_zero(svr_score)
y_mlp = _clamp_at_zero(mlp_score)
y_xgb = _clamp_at_zero(xgb_score)

# Draw the curves in a fixed order so colours and legend entries stay stable.
labelled_curves = [
    ("Lasso", y_lasso),
    ("Ridge", y_ridge),
    ("Linear Regression", y_linear),
    ("Decision Tree", y_tree),
    ("Random Forest", y_forest),
    ("kNN", y_knn),
    ("SVR", y_svr),
    ("MLP", y_mlp),
    ("XGBoost", y_xgb),
]
for curve_label, curve_values in labelled_curves:
    ax.plot(x, curve_values, label=curve_label, linestyle="-")

# Axis limits, ticks, labels and legend.
ax.set_xlim([0,100])
ax.set_ylim([-0.05,1.05])
x_ticks = np.linspace(0, 100,11)
y_ticks = np.linspace(0, 1, 6)
plt.xticks(x_ticks, fontweight="bold")
plt.yticks(y_ticks, fontweight="bold")
plt.xlabel("Data Size for Training Set", fontsize=20, fontweight="bold", font="Arial")
plt.ylabel("$R\u00b2$ Score", fontsize=20, fontweight="bold", font="Arial")
plt.legend(loc="upper left", fontsize=10)

