In [None]:
# Smoke-test cell: writes a fixed greeting to stdout.
print("Hello")

Hello


In [None]:
import numpy as np
import pandas as pd

# Where the alloy training CSV lives.
# NOTE(review): this is an absolute path under /content — confirm it exists in
# the environment the notebook is run in.
file_path = '/content/alloy-confp-train-data_v2.csv'

# Parse the CSV into a DataFrame used by the following cells.
data = pd.read_csv(filepath_or_buffer=file_path)

# Show the leading five rows as the cell output.
data.head(n=5)

Unnamed: 0,ID,HV,C.al,C.co,C.cr,C.cu,C.fe,C.ni
0,7,139,0.0,0.25,0.25,0.0,0.25,0.25
1,48,295,0.153846,0.307692,0.153846,0.076923,0.153846,0.153846
2,108,545,0.222222,0.111111,0.0,0.222222,0.222222,0.222222
3,146,701,0.428571,0.214286,0.071429,0.071429,0.071429,0.142857
4,37,207,0.090909,0.181818,0.181818,0.181818,0.181818,0.181818


In [None]:
# Build the feature matrix / target vector and hold out a test partition.
from sklearn.model_selection import train_test_split

# Target is the 'HV' column; features are every column other than 'ID' and 'HV'.
# NOTE(review): the data.head() output lists a leading 'Unnamed: 0' field. If
# that is a real CSV column (rather than the rendered row index) it stays in X
# as a feature — confirm this is intended.
y = data['HV']
X = data.drop(['ID', 'HV'], axis=1)

# 80 % of the rows go to training and 20 % to testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the feature matrices produced by the train/test split.

from sklearn.preprocessing import StandardScaler

# The scaler's statistics are learned from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)

# Both partitions go through the same fitted scaler. X_train and X_test are
# rebound to the transformed output, so the unscaled versions are no longer
# reachable under these names.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Baseline decision-tree regressor: default hyperparameters, seed fixed at 42.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Test-set RMSE. The square root is applied explicitly because the `squared`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and later
# removed, so `squared=False` fails on current releases.
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_pred))

In [None]:
# Display the test-set RMSE computed for the baseline decision tree.
dt_rmse

70.98503363385834

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Baseline random-forest regressor: default hyperparameters, seed fixed at 42.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Test-set RMSE via an explicit square root; mean_squared_error's `squared`
# keyword was deprecated in scikit-learn 1.4 and later removed.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_rmse

50.64008585926108

In [None]:
from xgboost import XGBRegressor

# Baseline XGBoost regressor: default hyperparameters, seed fixed at 42.
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

# Test-set RMSE via an explicit square root; mean_squared_error's `squared`
# keyword was deprecated in scikit-learn 1.4 and later removed.
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_rmse

45.6848930425652

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the decision tree: 4 depths x 3 split thresholds.
dt_param_grid = dict(
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
)

# Exhaustive 5-fold search over the grid, scored by negative mean squared error.
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
dt_grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search.
best_dt_model = dt_grid_search.best_estimator_

In [None]:
# Test-set RMSE of the tuned decision tree.
best_dt_pred = best_dt_model.predict(X_test)

# Explicit square root: mean_squared_error's `squared` keyword was deprecated
# in scikit-learn 1.4 and later removed.
best_dt_rmse = np.sqrt(mean_squared_error(y_test, best_dt_pred))
best_dt_rmse

70.70567341673166

In [None]:
# Candidate settings for the random forest: 3 x 4 x 3 = 36 combinations.
rf_param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15, None], 'min_samples_split': [2, 5, 10]}

# Exhaustive 5-fold search over the grid, scored by negative mean squared error.
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_squared_error')
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_

# Predict the held-out rows with the best estimator.
best_rf_pred = best_rf_model.predict(X_test)

# Test-set RMSE via an explicit square root; mean_squared_error's `squared`
# keyword was deprecated in scikit-learn 1.4 and later removed.
best_rf_rmse = np.sqrt(mean_squared_error(y_test, best_rf_pred))

In [None]:
# Display the test-set RMSE computed for the tuned random forest.
best_rf_rmse

49.97395182951148

In [None]:
# XGBoost hyperparameter tuning: 3 x 3 x 3 = 27 candidate combinations.
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3]
}

# Exhaustive 5-fold search over the grid, scored by negative mean squared error.
xgb_grid_search = GridSearchCV(XGBRegressor(random_state=42), xgb_param_grid, cv=5, scoring='neg_mean_squared_error')
xgb_grid_search.fit(X_train, y_train)

# Evaluate the best estimator on the held-out rows.
best_xgb_model = xgb_grid_search.best_estimator_
best_xgb_pred = best_xgb_model.predict(X_test)

# Test-set RMSE via an explicit square root; mean_squared_error's `squared`
# keyword was deprecated in scikit-learn 1.4 and later removed.
best_xgb_rmse = np.sqrt(mean_squared_error(y_test, best_xgb_pred))

best_xgb_rmse


49.17626514168451

In [None]:
from sklearn.svm import SVR

# Baseline support-vector regressor with default hyperparameters.
svr_model = SVR()
svr_model = svr_model.fit(X_train, y_train)
y_predict = svr_model.predict(X_test)

# Test-set RMSE. Arguments follow the (y_true, y_pred) order used by every
# other cell, and the square root is explicit because mean_squared_error's
# `squared` keyword was deprecated in scikit-learn 1.4 and later removed.
svr_rmse = np.sqrt(mean_squared_error(y_test, y_predict))
svr_rmse

243.790140502042

In [None]:

# Hyperparameter grid for the SVR search: 4 x 6 x 3 = 72 candidates.
# NOTE(review): per the scikit-learn docs `gamma` only applies to the rbf, poly
# and sigmoid kernels, so the linear candidates are repeated once per gamma
# value — consider a list of per-kernel grids to avoid the redundant fits.
param_grid = {
    'C': [0.1, 1, 10, 100],    # Regularization parameter
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],  # Kernel coefficient
    'kernel': ['linear', 'rbf', 'poly']  # Kernel type
}

# Exhaustive 5-fold search scored by negative mean squared error; verbose=2
# logs one line per fit.
grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
print("Best Hyperparameters: ", grid_search.best_params_)

# Best estimator found by the search.
best_svr = grid_search.best_estimator_

# Predict the held-out rows.
y_pred = best_svr.predict(X_test)

# Test-set RMSE via an explicit square root; mean_squared_error's `squared`
# keyword was deprecated in scikit-learn 1.4 and later removed.
best_svr_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Legacy name kept in case a later cell reads it; the value is an RMSE, not an MSE.
mse = best_svr_rmse
print(f"Root Mean Squared Error: {best_svr_rmse}")



Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ....................C=0.1, gamma=scale

In [None]:
def _draw_parity_axis(ax, y_true, y_pred, captions, ylim, **scatter_kwargs):
    """Draw one parity panel on `ax`.

    Scatters `y_pred` against `y_true`, writes the three `captions` in the
    lower-right corner (top to bottom), draws the dashed diagonal between the
    two `ylim` values and applies `ylim` to both axes.
    """
    ax.scatter(y_true, y_pred, **scatter_kwargs)
    for height, caption in zip((0.26, 0.18, 0.1), captions):
        ax.text(0.95, height, caption, transform=ax.transAxes, ha='right', fontsize=14)
    ax.plot(ylim, ylim, '--k')
    ax.set_xlabel('True y', fontsize=14)
    ax.set_xlim(ylim[0], ylim[1])
    ax.set_ylim(ylim[0], ylim[1])


def polt_parity(y_cv_test,y_pred_test, y_cv_train=None,y_pred_train=None, label=None, ylim=[50,900]):
    """
    Show a predicted-vs-true parity plot, optionally with a training panel.

    Parameters
    ----------
    y_cv_test, y_pred_test : true and predicted values for the first panel.
    y_cv_train, y_pred_train : optional true and predicted values; when
        `y_cv_train` is given a second, magenta panel captioned "Train" is
        drawn beside the first.
    label : caption written on the first panel above its RMSE and R^2.
    ylim : two-element sequence used as both the x and y limits and as the
        end points of the dashed diagonal.

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    with_train = y_cv_train is not None

    # Metrics for the first panel are computed before any figure is created.
    test_rmse = np.sqrt(mean_squared_error(y_cv_test, y_pred_test))
    test_r2 = r2_score(y_cv_test, y_pred_test)

    if with_train:
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9,4), sharey=True, sharex=True)
    else:
        fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(5,4), sharey=True, sharex=True)

    _draw_parity_axis(
        ax1, y_cv_test, y_pred_test,
        (label, "RMSE: %.2f" % test_rmse, "R$^2$: %.2f" % test_r2),
        ylim,
    )
    # Only the first panel gets a y-axis label.
    ax1.set_ylabel('Pred y', fontsize=14)

    if with_train:
        train_rmse = np.sqrt(mean_squared_error(y_cv_train, y_pred_train))
        train_r2 = r2_score(y_cv_train, y_pred_train)
        _draw_parity_axis(
            ax2, y_cv_train, y_pred_train,
            ("Train", "RMSE: %.2f" % train_rmse, "R2: %.2f" % train_r2),
            ylim,
            c='m',
        )

    plt.tight_layout()
    plt.show()

In [None]:
# Five-fold cross-validation of the estimator bound to `lr` over the rows of X / y.
# NOTE(review): KFold, np, r2_score and `lr` are not imported or defined in the
# cells visible before this one — confirm they exist on a fresh kernel.
kf = KFold(n_splits=5)
errors = []

for idx, (train, val) in enumerate(kf.split(X)):
    # Positional row selection for this fold, taken from the underlying arrays.
    X_cv_train, y_cv_train = X.values[train], y.values[train]
    X_cv_val, y_cv_val = X.values[val], y.values[val]

    # Whatever `lr.fit` returns is used to predict both partitions.
    model = lr.fit(X_cv_train, y_cv_train)
    y_pred_val = model.predict(X_cv_val)
    y_pred_train = model.predict(X_cv_train)

    # RMSE is the square root of the MSE; R^2 comes from r2_score.
    mse_val = mean_squared_error(y_cv_val, y_pred_val)
    mse_train = mean_squared_error(y_cv_train, y_pred_train)
    rmse_val, rmse_train = np.sqrt(mse_val), np.sqrt(mse_train)
    r2_val, r2_train = r2_score(y_cv_val, y_pred_val), r2_score(y_cv_train, y_pred_train)

    # Two-panel parity plot: validation fold first, training fold second.
    polt_parity(y_cv_val, y_pred_val, y_cv_train, y_pred_train, label="Val")

    print("Root mean squared error: %.2f" % rmse_val)
    print("Coefficient of determination: %.2f" % r2_val)

    # One record per fold, numbered from 1.
    error = dict(
        kfold=idx + 1,
        rmse_train=rmse_train,
        r2_train=r2_train,
        rmse_val=rmse_val,
        r2_val=r2_val,
    )
    errors.append(error)

In [None]:
def make_new_feature(data, cols, powers, new_col_name):
    """Add a product-of-powers column to `data` and return the same object.

    The column `new_col_name` is first set to 1 and then multiplied, one pair
    at a time, by ``data[Xcols[col]] ** power``. Pairs are formed with
    ``zip(powers, cols)``, so extra entries in the longer sequence are ignored.

    Parameters
    ----------
    data : object supporting ``data[name]`` get/set (mutated in place).
    cols : indices into `Xcols`, a name resolved outside this function.
    powers : exponents paired positionally with `cols`.
    new_col_name : key under which the product is stored.

    Returns
    -------
    The same `data` object that was passed in.
    """
    data[new_col_name] = 1
    for exponent, position in zip(powers, cols):
        source_name = Xcols[position]
        factor = data[source_name] ** exponent
        data[new_col_name] = data[new_col_name] * factor

    return data

In [None]:
# Engineered feature 'n.1': product of the columns named by Xcols[0] and
# Xcols[1], each raised to the first power.
cols, powers = [0, 1], [1, 1]
new_data = make_new_feature(new_data, cols=cols, powers=powers, new_col_name='n.1')

In [None]:
# Engineered features 'n.2' to 'n.5'. Each row is
# (new column name, indices into Xcols, exponents paired with those indices).
feature_specs = [
    ('n.2', [1, 3], [1, 1]),
    ('n.3', [2, 4], [1, 2]),
    ('n.4', [3, 4, 5], [1, 1, 1]),
    ('n.5', [4, 5], [1, 1]),
]

# Apply the specs in order; `cols` and `powers` end up holding the last spec.
for new_col, cols, powers in feature_specs:
    new_data = make_new_feature(new_data, cols, powers, new_col)

In [None]:
# Five-fold cross-validation of `lr`, collected into the `errors_pr` DataFrame.
# NOTE(review): as written this fits the same `lr` on the same X / y as the
# earlier `errors` loop; the engineered 'n.*' columns of new_data only take
# part if X was rebuilt from new_data in a cell not shown here — confirm.
kf = KFold(n_splits=5)
errors_pr = []

for idx, (train, val) in enumerate(kf.split(X)):
    # Positional row selection for this fold, taken from the underlying arrays.
    X_cv_train, y_cv_train = X.values[train], y.values[train]
    X_cv_val, y_cv_val = X.values[val], y.values[val]

    # Whatever `lr.fit` returns is used to predict both partitions.
    model = lr.fit(X_cv_train, y_cv_train)
    y_pred_val = model.predict(X_cv_val)
    y_pred_train = model.predict(X_cv_train)

    # RMSE is the square root of the MSE; R^2 comes from r2_score.
    mse_val = mean_squared_error(y_cv_val, y_pred_val)
    mse_train = mean_squared_error(y_cv_train, y_pred_train)
    rmse_val, rmse_train = np.sqrt(mse_val), np.sqrt(mse_train)
    r2_val, r2_train = r2_score(y_cv_val, y_pred_val), r2_score(y_cv_train, y_pred_train)

    # Two-panel parity plot: validation fold first, training fold second.
    polt_parity(y_cv_val, y_pred_val, y_cv_train, y_pred_train, label="Val")

    print("Root mean squared error: %.2f" % rmse_val)
    print("Coefficient of determination: %.2f" % r2_val)

    # One record per fold; key order determines the DataFrame column order.
    error = dict(
        kfold=idx + 1,
        rmse_train=rmse_train,
        r2_train=r2_train,
        rmse_val=rmse_val,
        r2_val=r2_val,
    )
    errors_pr.append(error)

# Turn the list of per-fold records into a table.
errors_pr = pd.DataFrame(errors_pr)

In [None]:
# Five-fold cross-validation of the estimator bound to `pr_l2`, collected into
# the `errors_pr_l2` DataFrame. Reuses the `kf` splitter from an earlier cell.
errors_pr_l2 = []

for idx, (train, val) in enumerate(kf.split(X)):
    # Positional row selection for this fold.
    X_cv_train = X.values[train]
    X_cv_val = X.values[val]

    y_cv_train = y.values[train]
    y_cv_val = y.values[val]

    # Model fit and prediction on both partitions.
    model = pr_l2.fit(X_cv_train, y_cv_train)
    y_pred_val = model.predict(X_cv_val)
    y_pred_train = model.predict(X_cv_train)

    # RMSE (square root of the MSE) and R^2 for each partition.
    rmse_val = np.sqrt(mean_squared_error(y_cv_val, y_pred_val))
    rmse_train = np.sqrt(mean_squared_error(y_cv_train, y_pred_train))

    r2_val = r2_score(y_cv_val, y_pred_val)
    r2_train = r2_score(y_cv_train, y_pred_train)

    # Two-panel parity plot: validation fold first, training fold second.
    polt_parity(y_cv_val, y_pred_val, y_cv_train, y_pred_train, label="Val")

    # The printed value is an RMSE, so it is labelled as one (the label
    # previously read "Mean squared error"), matching the sibling CV loops.
    print("Root mean squared error: %.2f" % rmse_val)
    print("Coefficient of determination: %.2f" % r2_val)

    # One record per fold, numbered from 1.
    error = {'kfold': idx+1,
             'rmse_train': rmse_train,
             'rmse_val': rmse_val,
             'r2_val': r2_val,
             'r2_train': r2_train}

    errors_pr_l2.append(error)

# Turn the list of per-fold records into a table.
errors_pr_l2 = pd.DataFrame(errors_pr_l2)

In [None]:
# Five-fold cross-validation of Ridge(alpha=c_opt), collected into the
# `errors_l2_c_opt` DataFrame. `c_opt` and `kf` come from earlier cells.
pr_ridge_c_opt = Ridge(alpha=c_opt)
errors_l2_c_opt = []

for idx, (train, val) in enumerate(kf.split(X)):
    # Positional row selection for this fold, taken from the underlying arrays.
    X_cv_train, y_cv_train = X.values[train], y.values[train]
    X_cv_val, y_cv_val = X.values[val], y.values[val]

    # Fit on the training rows, then predict both partitions.
    model = pr_ridge_c_opt.fit(X_cv_train, y_cv_train)
    y_pred_val = model.predict(X_cv_val)
    y_pred_train = model.predict(X_cv_train)

    # RMSE is the square root of the MSE; R^2 comes from r2_score.
    mse_val = mean_squared_error(y_cv_val, y_pred_val)
    mse_train = mean_squared_error(y_cv_train, y_pred_train)
    rmse_val, rmse_train = np.sqrt(mse_val), np.sqrt(mse_train)
    r2_val, r2_train = r2_score(y_cv_val, y_pred_val), r2_score(y_cv_train, y_pred_train)

    # Two-panel parity plot: validation fold first, training fold second.
    polt_parity(y_cv_val, y_pred_val, y_cv_train, y_pred_train, label="Val")

    print("Root Mean squared error: %.2f" % rmse_val)
    print("Coefficient of determination: %.2f" % r2_val)

    # One record per fold; key order determines the DataFrame column order.
    error = dict(
        kfold=idx + 1,
        rmse_train=rmse_train,
        rmse_val=rmse_val,
        r2_val=r2_val,
        r2_train=r2_train,
    )
    errors_l2_c_opt.append(error)

# Turn the list of per-fold records into a table.
errors_l2_c_opt = pd.DataFrame(errors_l2_c_opt)

In [None]:
# Sweep the Ridge regularisation strength with per-fold standardisation and
# collect one metrics row per (alpha, fold) pair into `errors_norm`.
# An earlier, finer grid kept for reference:
#c_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 2, 3]

c_values = [1, 2, 3, 3.5, 4, 5, 6, 7, 8, 10]
errors_norm = []

for alp in c_values:
    pr_l2_norm = Ridge(alpha=alp)
    for idx, (train, val) in enumerate(kf.split(X)):
        _X_cv_train = X.values[train]
        _X_cv_val = X.values[val]

        # A fresh scaler per fold. Re-fitting the notebook-level `scaler` here
        # would overwrite the statistics it learned from X_train, leaving it
        # fitted on the last CV fold for any later cell that reuses it.
        fold_scaler = StandardScaler()
        X_cv_train = fold_scaler.fit_transform(_X_cv_train)
        X_cv_val = fold_scaler.transform(_X_cv_val)

        y_cv_train = y.values[train]
        y_cv_val = y.values[val]

        # Model fit and prediction on both partitions.
        model = pr_l2_norm.fit(X_cv_train, y_cv_train)
        y_pred_val = model.predict(X_cv_val)
        y_pred_train = model.predict(X_cv_train)

        # RMSE (square root of the MSE) and R^2 for each partition.
        rmse_val = np.sqrt(mean_squared_error(y_cv_val, y_pred_val))
        rmse_train = np.sqrt(mean_squared_error(y_cv_train, y_pred_train))

        r2_val = r2_score(y_cv_val, y_pred_val)
        r2_train = r2_score(y_cv_train, y_pred_train)

        # Two-panel parity plot; one figure per (alpha, fold) pair.
        polt_parity(y_cv_val, y_pred_val, y_cv_train, y_pred_train, label="Val")

        print("Root Mean squared error: %.2f" % rmse_val)
        print("Coefficient of determination: %.2f" % r2_val)

        # 'c' records the alpha under test; folds are numbered from 1.
        error = {'c':alp,
                 'kfold': idx+1,
                 'rmse_train': rmse_train,
                 'rmse_val': rmse_val,
                 'r2_val': r2_val,
                 'r2_train': r2_train}

        errors_norm.append(error)

# Turn the list of records into a table.
errors_norm = pd.DataFrame(errors_norm)

In [None]:
# Recursive feature elimination with 5-fold CV around Ridge(alpha=c_opt),
# removing one feature per step and scoring by negative mean squared error.
pr_l2_copt_rfe = Ridge(alpha=c_opt)
selector = RFECV(
    estimator=pr_l2_copt_rfe,
    step=1,
    scoring='neg_mean_squared_error',
    cv=5,
)
selector = selector.fit(X, y)

# Index Xcols with the boolean support mask to get the retained entries.
# NOTE(review): assumes Xcols supports boolean-mask indexing (e.g. a NumPy
# array or pandas Index) — it is defined outside the visible cells.
support_mask = selector.support_
sel_features = Xcols[support_mask]

In [None]:
# Cross-validated RMSE of the RFECV search as a function of feature count,
# with the mean validation RMSE from `errors_l2_c_opt` as a dashed reference.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(5,4))

# Scores are negative MSE; negate and take the square root to get an RMSE.
rfe_rmse = np.sqrt(selector.cv_results_['mean_test_score']*-1)

# Plot against the real feature counts. Entry i of the score array belongs to
# (min_features_to_select + i) features when step=1, as the selector is
# configured; plotting against the implicit 0-based index shifted every point
# one feature to the left.
n_features = selector.min_features_to_select + np.arange(len(rfe_rmse))
ax1.plot(n_features, rfe_rmse, '-bo')

# NOTE(review): the curve is sqrt(mean MSE) while this line is the mean of the
# per-fold RMSEs, so the two are not strictly the same statistic.
ax1.axhline(errors_l2_c_opt.mean()['rmse_val'], linestyle='--', color='k')
ax1.set_ylim(70,100)
ax1.set_xlabel('# of features', fontsize=14)
ax1.set_ylabel('RMSE', fontsize=14)
plt.tight_layout()

In [None]:
# Grid-search the Ridge regularisation strength with 5-fold CV, scored by
# negative mean squared error.
# NOTE(review): the variable names mention RFE, but the search is fitted on
# X as-is; confirm whether X was restricted to the selected features in a
# cell not shown here.
c_values = [0.0001, 0.0005, 0.00065, 0.0008, 0.001, 0.0012, 0.0014, 0.002, 0.01, 0.05, 0.1, 1, 2]
hyper_params = dict(alpha=c_values)

pr_rfe_ridge = Ridge()
clf = GridSearchCV(
    estimator=pr_rfe_ridge,
    param_grid=hyper_params,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Bind the return value to `_` so the cell does not display the estimator repr.
_ = clf.fit(X, y)