In [1]:
from sklearn.cluster import KMeans
from sklearn.datasets import load_diabetes,fetch_california_housing,load_breast_cancer,load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_squared_error, accuracy_score
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.ensemble import IsolationForest
import numpy as np
import pandas as pd

In [33]:
data, target = load_diabetes(return_X_y=True)
#data = PolynomialFeatures().fit_transform(data)

In [2]:
data = pd.read_csv("CASP.csv")
target = np.array(data["RMSD"])
data = data.drop(columns="RMSD")

In [70]:
data = PolynomialFeatures(degree=2).fit_transform(data)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(data,target,random_state=42)

In [66]:
def isolation(X_train,X_test):
    iso = IsolationForest()
    X_train_isolate = iso.fit_predict(X_train)
    X_test_isolate = iso.predict(X_test)
    return X_train_isolate, X_test_isolate

In [67]:
X_train_isolate, X_test_isolate = isolation(X_train,X_test)
unique, counts = np.unique(X_train_isolate, return_counts=True)
print(np.asarray((unique, counts)).T)

[[   -1  2984]
 [    1 31313]]


In [4]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
def k_meanscluster(X_train,X_test,n_clusters):
    cluster = KMeans(n_clusters=n_clusters,random_state=42).fit(X_train)
    train_labels = cluster.labels_
    test_labels = cluster.predict(X_test)
    return train_labels,test_labels
def data_spliter(X_train, X_test, y_train, y_test,train_labels,test_labels,index):
    train_index = np.where(train_labels==index)
    test_index = np.where(test_labels==index)
    X_train_loc = X_train[train_index]
    y_train_loc = y_train[train_index]
    X_test_loc = X_test[test_index]
    y_test_loc = y_test[test_index]
    return X_train_loc,X_test_loc,y_train_loc,y_test_loc

In [79]:
def predic_ridge(X_train, X_test, y_train, y_test):
    parameters = {'alpha':np.logspace(-4,1,50)}
    model = Ridge()
    Ridge_reg= GridSearchCV(model, parameters, scoring='neg_mean_squared_error',cv=5)
    Ridge_reg.fit(X_train,y_train)
    print("std",Ridge_reg.cv_results_['std_test_score'][Ridge_reg.best_index_])
    print("mean_error",Ridge_reg.cv_results_['mean_test_score'][Ridge_reg.best_index_])
    best_model = Ridge_reg.best_estimator_
    best_model.fit(X_train,y_train)
    y_pred = best_model.predict(X_test)
    error = mean_squared_error(y_test,y_pred) * len(y_test)
    return error

In [78]:
predic_ridge(X_train, X_test, y_train, y_test)

0.2854720976558183
-23.8419786409233


272688.3614439119

In [81]:
for cluster in range(1,11):
    try:
        train_labels,test_labels = k_meanscluster(X_train,X_test,n_clusters=cluster)
        total_error = 0
        for index in range(0,cluster):
            X_train_loc,X_test_loc,y_train_loc,y_test_loc = data_spliter(X_train, X_test, y_train, y_test,train_labels,test_labels,index=index)
            scaler = StandardScaler()
            X_train_loc = scaler.fit_transform(X_train_loc)
            X_test_loc = scaler.transform(X_test_loc)
            print("cluster ",cluster,".",index+1,sep="")
            error = predic_ridge(X_train_loc,X_test_loc,y_train_loc,y_test_loc)
            total_error += error
        print("cluster total", cluster)
        print("mse", total_error / len(y_test))
        print("*****")
    except:
        continue


cluster 1 . 0
std 0.28547209765224085
mean_error -23.84197864094498
cluster total 1
mse 23.850989368133163
*****
cluster 2 . 0
std 0.3005573243144621
mean_error -22.759683034687463
cluster 2 . 1
cluster 3 . 0
std 0.2027696764150684
mean_error -21.856788413149644
cluster 3 . 1
std 2.1099824307029644
mean_error -20.54092568710251
cluster 3 . 2
std 0.7294813455838982
mean_error -23.465005039412784
cluster total 3
mse 23.42863055802718
*****
cluster 4 . 0
std 0.28917032828427824
mean_error -20.67222556186487
cluster 4 . 1
std 1.720830399810003
mean_error -17.992733862087825
cluster 4 . 2
std 0.6183631909153756
mean_error -22.739020759958386
cluster 4 . 3
std 0.8127990095073181
mean_error -23.99947093998399
cluster total 4
mse 22.266799896784512
*****
cluster 5 . 0
std 0.41772539763550226
mean_error -22.626009484740056
cluster 5 . 1
std 0.28254899312269477
mean_error -20.578002265726653
cluster 5 . 2
std 1.3002429215189721
mean_error -18.12339633907049
cluster 5 . 3
std 1.038669736668566
me

In [9]:
def predic_svr(X_train, X_test, y_train, y_test):
    parameters = {"C":np.arange(1,10), "epsilon":np.logspace(-4,0,10)}
    model = SVR()
    Ridge_reg= GridSearchCV(model, parameters, scoring='neg_mean_squared_error',cv=5,verbose=3)
    Ridge_reg.fit(X_train,y_train)
    print("std",Ridge_reg.cv_results_['std_test_score'][Ridge_reg.best_index_])
    print("mean_error",Ridge_reg.cv_results_['mean_test_score'][Ridge_reg.best_index_])
    best_model = Ridge_reg.best_estimator_
    best_model.fit(X_train,y_train)
    y_pred = best_model.predict(X_test)
    error = mean_squared_error(y_test,y_pred) * len(y_test)
    return error

In [10]:
predic_svr(X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END .............C=1, epsilon=0.0001;, score=-24.146 total time=  45.2s


KeyboardInterrupt: 

In [84]:
for cluster in range(1,11):
    try:
        train_labels,test_labels = k_meanscluster(X_train,X_test,n_clusters=cluster)
        total_error = 0
        for index in range(0,cluster):
            X_train_loc,X_test_loc,y_train_loc,y_test_loc = data_spliter(X_train, X_test, y_train, y_test,train_labels,test_labels,index=index)
            scaler = StandardScaler()
            X_train_loc = scaler.fit_transform(X_train_loc)
            X_test_loc = scaler.transform(X_test_loc)
            print("cluster ",cluster,".",index+1,sep="")
            error = predic_svr(X_train_loc,X_test_loc,y_train_loc,y_test_loc)
            total_error += error
        print("cluster total", cluster)
        print("mse", total_error / len(y_test))
        print("*****")
    except:
        continue

cluster 1.1


In [21]:
def predic_lgbm(X_train, X_test, y_train, y_test):
    parameters = {"n_estimators":[100,150,200], "learning_rate":np.logspace(-4,0,5),"max_depth":[4,5,6]}
    model = LGBMRegressor()
    Ridge_reg= GridSearchCV(model, parameters, scoring='neg_mean_squared_error',cv=5)
    
    Ridge_reg.fit(X_train,y_train)
    print("std",Ridge_reg.cv_results_['std_test_score'][Ridge_reg.best_index_])
    print("mean_error",Ridge_reg.cv_results_['mean_test_score'][Ridge_reg.best_index_])
    best_model = Ridge_reg.best_estimator_
    best_model.fit(X_train,y_train)
    y_pred = best_model.predict(X_test)
    error = mean_squared_error(y_test,y_pred) * len(y_test)
    return error

In [22]:
for cluster in range(1,11):
    try:
        train_labels,test_labels = k_meanscluster(X_train,X_test,n_clusters=cluster)
        total_error = 0
        for index in range(0,cluster):
            X_train_loc,X_test_loc,y_train_loc,y_test_loc = data_spliter(X_train, X_test, y_train, y_test,train_labels,test_labels,index=index)
            #scaler = StandardScaler()
            #X_train_loc = scaler.fit_transform(X_train_loc)
            #X_test_loc = scaler.transform(X_test_loc)
            error = predic_lgbm(X_train_loc,X_test_loc,y_train_loc,y_test_loc)
            total_error += error
        print("cluster", cluster)
        print("mse", total_error / len(y_test))
        print("*****")
    except:
        continue

std 0.1985798004071326
mean_error -15.93134644059198
cluster 1
mse 15.64402837168209
*****
std 0.1567057420496744
mean_error -14.752340884204802
std 0.4891326062878382
mean_error -15.411960594391283
cluster 2
mse 14.60423293185712
*****
std 0.32062729738285684
mean_error -13.52399014517511
std 0.23941798166194728
mean_error -16.206072396363034
std 0.5867832325133582
mean_error -14.960731098313943
cluster 3
mse 14.27278816513029
*****
std 0.219738353200157
mean_error -11.932273824704225
std 1.0553795788640135
mean_error -14.213608273610927
std 0.560833270926266
mean_error -15.252351190252213
std 60.90095449268752
mean_error -47.62389398612481
std 0.2924743494677846
mean_error -16.003297039043527
cluster 5
mse 14.001515788966199
*****
std 0.2668177121133112
mean_error -13.452784058240315
std 1.1924298513352491
mean_error -14.366447665861582
std 0.27180740561442746
mean_error -12.147076600055748
std 0.2955669114970417
mean_error -16.46275655352468
std 0.8376044228917736
mean_error -16.152

In [7]:
def predic_logistic(X_train, X_test, y_train, y_test):
    parameters = {'C':np.logspace(-4,1,50)}
    model = LogisticRegression()
    Ridge_reg= GridSearchCV(model, parameters, scoring='accuracy',cv=5)
    Ridge_reg.fit(X_train,y_train)
    best_model = Ridge_reg.best_estimator_
    best_model.fit(X_train,y_train)
    y_pred = best_model.predict(X_test)
    error = accuracy_score(y_test,y_pred) * len(y_test)
    return error

In [8]:
for cluster in range(1,11):

    train_labels,test_labels = k_meanscluster(X_train,X_test,n_clusters=cluster)
    total_error = 0
    for index in range(0,cluster):
        X_train_loc,X_test_loc,y_train_loc,y_test_loc = data_spliter(X_train, X_test, y_train, y_test,train_labels,test_labels,index=index)
        error = predic_logistic(X_train_loc,X_test_loc,y_train_loc,y_test_loc)
        print(error)
        total_error += error
    print("cluster", cluster)
    print("mse", total_error / len(y_test))
    print("*****")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

2349.0
cluster 1
mse 0.6272363150867823
*****


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

2351.0


ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=1.