In [127]:
import glob
from os.path import exists

import numpy as np
import torch
from scipy.io import loadmat, savemat
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet
from sklearn.linear_model import MultiTaskElasticNet, Ridge, RidgeCV, ElasticNet, ElasticNetCV, Lasso, MultiTaskLassoCV
from sklearn.linear_model import LinearRegression, MultiTaskElasticNetCV, MultiTaskLasso
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score, explained_variance_score, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from torchvision import datasets, transforms

from big_sleep.clip import load
from big_sleep.big_spose_sleep import create_clip_img_transform
from ridge import ridge, ridge_corr, bootstrap_ridge

## Extract CLIP features

In [128]:
# Locations of the THINGS data and of the cached regression data.
thingsroot = "/Users/katja/Documents/Data/THINGS/"

#thingscats = [catdir.split("/")[-1] for catdir in glob.glob(thingsroot+"/images/*")]
thingsimgfns = thingsroot+"/images/{}/*.*"

regr_data_fn = "../data_spose_to_clip.mat"

# Switch read by the feature-extraction cell further down.
reextract_data = False

# One category id per non-blank line of the id file, in file order.
with open(thingsroot+"THINGS_unique_IDs.txt", 'r') as handle:
    stripped_ids = (raw_line.strip() for raw_line in handle)
    thingscats = [cat_id for cat_id in stripped_ids if len(cat_id) > 0]

assert len(thingscats) == 1854

# One embedding row per category; the row count must equal the number of ids.
spose_cat_emb = np.loadtxt(thingsroot+"spose_embedding_49d_sorted.txt")

assert spose_cat_emb.shape[0] == len(thingscats)

#num_imgs = 0
#for thingscat in thingscats:
#    thingscatimgfns = glob.glob(thingsimgfns.format(thingscat))
#    num_imgs += len(thingscatimgfns)
#print("Number of THINGS images found:", num_imgs)

# CLIP model plus the image preprocessing used for it (224 px input).
clip_perceptor, _ = load('ViT-B/32', jit = False)
clip_transform = create_clip_img_transform(224)

In [129]:
# Image dataset (one sub-folder per category) and a loader that yields one
# image per step, in a fixed (unshuffled) order.
thingsimgs = datasets.ImageFolder(thingsroot+'/images/', transform=clip_transform)
thingsloader = torch.utils.data.DataLoader(thingsimgs, batch_size=1, shuffle=False)

# Count images on the dataset itself: len(loader) is a number of batches and
# only coincides with the number of images while batch_size == 1.
num_imgs = len(thingsimgs)
print("Number of THINGS images found:", num_imgs)

print(clip_transform)

# The class index produced by ImageFolder is used later as a row index into
# `spose_cat_emb`, whose rows follow `thingscats`. ImageFolder derives its
# indices from the sorted folder names, so both orderings have to agree;
# otherwise images would silently be paired with the wrong SPoSE vector.
assert thingsimgs.classes == thingscats, \
    "ImageFolder class order differs from THINGS_unique_IDs.txt order"

Number of THINGS images found: 26107
Compose(
    Resize(size=224, interpolation=PIL.Image.BILINEAR)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
)


In [130]:
# Smoke-test the loader: iterate over the first `show_num` batches only.
# The limit is checked *before* the (currently commented-out) body, so exactly
# `show_num` batches are processed; the previous `i > show_num` check at the
# end of the body let show_num + 2 batches through.
show_num = 50
for i, (thingsimg, catID) in enumerate(thingsloader):
    if i >= show_num:
        break
    #print(thingscats[catID])
    #print(thingsimg.shape)

In [131]:
%%time

# Build, or load from cache, the paired regression data:
#   x_spose_vecs : (num_imgs, 49)  SPoSE vector of each image's category
#   y_clip_vecs  : (num_imgs, 512) CLIP image embedding of each image
#
# Extraction runs when it is explicitly requested (`reextract_data`) OR when
# no cache file exists yet. The previous condition (`not exists and
# reextract_data`) ignored the flag whenever a cache was present and fell
# through to `loadmat` on a missing file when the flag was off.
if reextract_data or not exists(regr_data_fn):
    # Timing of a full extraction run:
    # CPU times: user 1h 7min 42s, sys: 2min 53s, total: 1h 10min 36s
    # Wall time: 1h 8min 37s

    x_spose_vecs = np.zeros([num_imgs, 49])   # X: SPoSE vectors (same for each cat)
    y_clip_vecs = np.zeros([num_imgs, 512])   # Y: clip vectors

    # Inference only: skip autograd bookkeeping while encoding.
    with torch.no_grad():
        # The loader yields one image per step, so `i` is the image index and
        # `catID` holds a single class index.
        for i, (thingsimg, catID) in enumerate(thingsloader):
            x_spose_vecs[i, :] = spose_cat_emb[int(catID), :]
            y_clip_vecs[i, :] = (
                clip_perceptor.encode_image(thingsimg).detach().cpu().numpy().squeeze()
            )

    # Cache the result so later runs can skip the extraction.
    savemat(regr_data_fn, {"x_spose": x_spose_vecs, "y_clip": y_clip_vecs})
else:
    regr_data = loadmat(regr_data_fn)
    x_spose_vecs = regr_data["x_spose"]
    y_clip_vecs = regr_data["y_clip"]

CPU times: user 18.2 ms, sys: 36.1 ms, total: 54.3 ms
Wall time: 60.3 ms


In [132]:
print("X shape:", x_spose_vecs.shape)
print("Y shape:", y_clip_vecs.shape)

# Both directions use the same held-out fraction and the same seed.
split_opts = dict(test_size=0.10, random_state=42)

# SPoSE -> CLIP: SPoSE vectors are the inputs, CLIP vectors the targets.
xspose_train, xspose_test, yclip_train, yclip_test = train_test_split(
    x_spose_vecs, y_clip_vecs, **split_opts
)

# CLIP -> SPoSE: same arrays with the roles swapped.
xclip_train, xclip_test, yspose_train, yspose_test = train_test_split(
    y_clip_vecs, x_spose_vecs, **split_opts
)

X shape: (26107, 49)
Y shape: (26107, 512)


In [112]:
# Optional standardisation of the regression inputs (switched off by default).
# NOTE: the cell rebinds the split variables, so run it at most once per split.
apply_scaling = False
if apply_scaling: # scaling
    # One scaler per feature space. Each is fitted on the training split only
    # and then applied unchanged to the held-out split; refitting on the test
    # split would scale it with different statistics and leak test information.
    spose_scaler = StandardScaler()
    xspose_train = spose_scaler.fit_transform(xspose_train)
    xspose_test = spose_scaler.transform(xspose_test)

    clip_scaler = StandardScaler()
    xclip_train = clip_scaler.fit_transform(xclip_train)
    xclip_test = clip_scaler.transform(xclip_test)

In [104]:
# checking data for issues by doing CLIP-to-SPoSE (which is supposed to work)
# Default Ridge fitted on the CLIP training inputs, scored on the held-out split.
model = Ridge()
model.fit(xclip_train, yspose_train)
yspose_test_pred = model.predict(xclip_test)

# The model maps CLIP -> SPoSE, so the label says so (it used to read
# "SPoSE-to-CLIP", the opposite direction).
print("Default Ridge CLIP-to-SPoSE R2:", r2_score(yspose_test, yspose_test_pred))

# TODO: find best
# TODO: determine best parameters

Default Ridge SPoSE-to-CLIP R2: 0.48681972609016083


In [105]:
# Sanity check of the data in the CLIP-to-SPoSE direction (which is supposed
# to work), this time with RidgeCV.
model = RidgeCV()
model.fit(xclip_train, yspose_train)
clf = model  # same fitted estimator under the second name used in this cell
# NOTE: the displayed score is computed on the training split.
clf.score(xclip_train, yspose_train)

0.5152470793049417

In [113]:
# Bootstrapped ridge, CLIP -> SPoSE direction.
# 60 log-spaced regularisation strengths from 1e-100 to 1e100.
alpha_grid = np.logspace(-100, 100, 60)
wt, corr, valphas, bscorrs, valinds = bootstrap_ridge(
    xclip_train, yspose_train,
    xclip_test, yspose_test,
    alphas=alpha_grid,
    nboots=5,
    chunklen=10,
    nchunks=15,
    return_wt=True,
)

In [125]:
# Average of `corr` from the most recently executed bootstrap_ridge cell.
mean_corr = np.mean(corr)
print(mean_corr)  # 0.68

0.42757745111063405


In [126]:
# Dimensions of the weight matrix returned by the last bootstrap_ridge call.
np.shape(wt)

(49, 512)

In [116]:
# Bootstrapped ridge, SPoSE -> CLIP direction.
# 100 log-spaced regularisation strengths from 1e-2 to 1e100.
alpha_grid = np.logspace(-2, 100, 100)
wt, corr, valphas, bscorrs, valinds = bootstrap_ridge(
    xspose_train, yclip_train,
    xspose_test, yclip_test,
    alphas=alpha_grid,
    nboots=5,
    chunklen=10,
    nchunks=15,
    return_wt=True,
)

In [124]:
# Largest entry of `corr` from the most recently executed bootstrap_ridge cell.
best_corr = np.max(corr)
best_corr

0.795284011669666

In [133]:
# train on all data
# NOTE: xspose_test / yclip_test were split off from x_spose_vecs / y_clip_vecs,
# so the evaluation rows passed here are also part of the training rows; the
# resulting `corr` is therefore not a held-out estimate.
alpha_grid = np.logspace(-2, 100, 100)
wt, corr, valphas, bscorrs, valinds = bootstrap_ridge(
    x_spose_vecs, y_clip_vecs,
    xspose_test, yclip_test,
    alphas=alpha_grid,
    nboots=5,
    chunklen=10,
    nchunks=15,
    return_wt=True,
)

In [134]:
# Average of `corr` from the most recently executed bootstrap_ridge cell.
mean_corr = np.mean(corr)
print(mean_corr)  # 0.68

0.42767548855293525


In [135]:
# Store the current weight matrix under the key 'W' in a .mat file.
weights_out_fn = 'big_sleep/data/W_aridge_spose_to_clip.mat'
savemat(weights_out_fn, {'W': wt})

In [101]:
# Sanity check of the data in the CLIP-to-SPoSE direction (which is supposed
# to work), using MultiTaskLassoCV.
model = MultiTaskLassoCV()
model.fit(xclip_train, yspose_train)
clf = model  # same fitted estimator under the second name used in this cell
# NOTE: the displayed score is computed on the training split.
clf.score(xclip_train, yspose_train)

0.5128427207844467

In [102]:
# Sanity check of the data in the CLIP-to-SPoSE direction (which is supposed
# to work), using MultiTaskElasticNetCV.
model = MultiTaskElasticNetCV()
model.fit(xclip_train, yspose_train)
clf = model  # same fitted estimator under the second name used in this cell
# NOTE: the displayed score is computed on the training split.
clf.score(xclip_train, yspose_train)

0.5127160482374192

In [None]:
# TODO: set up modeling with CLIP-to-spose

# TODO: standard scaler
# TODO: compare regression methods: https://towardsdatascience.com/quickly-test-multiple-models-a98477476f0
# TODO: determine best parameters via CV
# TODO: try alexridge

In [None]:
# TODO: set up modeling with CLIP-to-spose



In [31]:
# SPoSE -> CLIP with a default MultiTaskElasticNet; predictions for the
# held-out SPoSE inputs are kept in `yclip_test_pred`.
model = MultiTaskElasticNet().fit(xspose_train, yclip_train)
yclip_test_pred = model.predict(xspose_test)

# TODO: determine best parameters

In [32]:
# SPoSE -> CLIP with a default MultiTaskLasso; predictions for the held-out
# SPoSE inputs are kept in `yclip_test_pred`.
model = MultiTaskLasso().fit(xspose_train, yclip_train)
yclip_test_pred = model.predict(xspose_test)

In [36]:
%%time
#yclip_test_pred = MultiOutputRegressor(GradientBoostingRegressor(random_state=0)).fit(xspose_train, yclip_train).predict(xspose_test)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [None]:
# R2 of the most recent SPoSE -> CLIP predictions against the held-out targets.
r2_score(y_true=yclip_test, y_pred=yclip_test_pred)

In [None]:
# Baseline: ordinary least squares, evaluated with repeated k-fold CV.
# NOTE(review): this cell originally used `X` and `y` without defining them.
# They are bound here to the CLIP -> SPoSE training split, matching the
# "CLIP-to-spose" TODO cells above -- confirm this is the intended direction.
X, y = xclip_train, yspose_train

model = LinearRegression()  # TODO: elasticNet
# fit model
model.fit(X, y)
# make a prediction

# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force the scores to be positive
# (the bare `absolute`, `mean` and `std` were never imported; use numpy's)
n_scores = np.absolute(n_scores)
# summarize performance
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler
from statistics import mean

# Scaled linear-regression baseline.
# NOTE(review): `df` and `target` are not defined in the visible part of the
# notebook -- they must be provided before this cell can run.
X_train, X_test, Y_train, Y_test = train_test_split(df,target,test_size=0.3)
scaler = StandardScaler()
features = scaler.fit_transform(X_train)
# Apply the training-set statistics to the test set; refitting the scaler on
# the test data would scale both sets differently and leak test information.
scaled_x_test = scaler.transform(X_test)
regression = LinearRegression()
model = regression.fit(features, Y_train)
preds = model.predict(scaled_x_test)

# NOTE(review): this cross-validates a regression from the predictions onto
# Y_test and needs single-column `preds` for the reshape -- confirm that this
# is the intended accuracy measure.
mean(cross_val_score(regression, preds.reshape(-1,1), Y_test, cv=10))


In [None]:
# TODO: ensemble randomforest
# TODO: test model accuracy

import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Gradient-boosted baseline on the scaled features from the previous cell.
regressor = xgb.XGBRegressor(
    learning_rate=0.01,
    colsample_bytree=0.8,
    n_estimators=430,
    reg_lambda=1,
    gamma=1,
    max_depth=3,
    subsample=0.55
)
model = regressor.fit(features,Y_train)
preds = regressor.predict(scaled_x_test)
# The value is a mean squared error, so store it as `mse`. Assigning it to
# `r2_score` (as before) replaced the imported sklearn function with a float
# and broke every later r2_score(...) call in the session.
mse = mean_squared_error(Y_test, preds)