# svm_preds 475

https://www.kaggle.com/maunish/clrp-roberta-svm?scriptVersionId=64598846

In [None]:
# Blend weights for the 475 section: one equal weight per svm_preds1..5.
W475 = [0.2 for _ in range(5)]

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
# Load the competition tables: training rows, test rows and the submission template.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Regression target as a plain array (used as y by the SVR step).
target = train_data['target'].to_numpy()

# Discretise the continuous target into floor(1 + log2(n_rows)) bins so that
# StratifiedKFold has class-like labels to stratify on.
n_rows = len(train_data)
num_bins = int(np.floor(np.log2(n_rows) + 1))
train_data.loc[:, 'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Run settings read by the dataset, the DataLoader and the SVR cross-validation.
config = {
    'batch_size':128,
    'max_len':256,
    'nfolds':5,
    'seed':42,
}

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and ask cuDNN for deterministic kernels.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Fixed: the variable is PYTHONHASHSEED (it was misspelled 'PYTHONASSEED',
    # which nothing reads). It is consulted at interpreter start-up, so it
    # only reaches processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Fixed: benchmark mode lets cuDNN auto-tune and pick different algorithms
    # between runs, which defeats the deterministic request above.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per index, on demand."""

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        # Keep only the raw text column as a NumPy array.
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        text = self.excerpt[idx]
        # Pad/truncate every excerpt to the global config['max_len'].
        return self.tokenizer(text,return_tensors='pt',
                              max_length=config['max_len'],
                              padding='max_length',truncation=True)

In [None]:
class AttentionHead(nn.Module):
    """Pool `features` over dim 1 with learned softmax weights.

    A score per position is computed as V(tanh(W(features))), normalised with
    softmax over dim 1, and used to form a weighted sum of `features`.
    `num_targets` is accepted but not used.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        # NOTE(review): set to hidden_dim, although forward() returns vectors
        # of size in_features; the two only agree when both are equal.
        self.out_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

In [None]:
class Model(nn.Module):
    """Pretrained encoder plus AttentionHead; forward() returns the pooled vector."""

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768,768,1)
        # dropout and linear are never applied in forward(); presumably they are
        # kept so a saved state_dict still matches key-for-key — TODO confirm.
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        # Element 0 of the encoder output is fed to the pooling head.
        encoded = self.roberta(**xb)[0]
        return self.head(encoded)

In [None]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Run the checkpoint at `path` over df['excerpt'] and return pooled embeddings.

    Args:
        df: DataFrame with an 'excerpt' column.
        path: file holding a state_dict for `Model`.
        plot_losses, verbose: accepted for compatibility; not used here.

    Returns:
        NumPy array with one row per excerpt, in the order of `df`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # map_location lets a checkpoint saved on GPU load on the CPU fallback
    # chosen above instead of failing while deserialising CUDA tensors.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,  # keep row order aligned with df
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # Iterate the loader directly so tqdm can show the batch total.
        for inputs in tqdm(dl):
            # Flatten everything after the batch dim and move it to the device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs.detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

In [None]:
# train_embeddings1 =  get_embeddings(train_data,'../input/clrprobertamodels/model0/model0.bin')
# test_embeddings1 = get_embeddings(test_data,'../input/clrprobertamodels/model0/model0.bin')

# train_embeddings2 =  get_embeddings(train_data,'../input/clrprobertamodels/model1/model1.bin')
# test_embeddings2 = get_embeddings(test_data,'../input/clrprobertamodels/model1/model1.bin')

# train_embeddings3 =  get_embeddings(train_data,'../input/clrprobertamodels/model2/model2.bin')
# test_embeddings3 = get_embeddings(test_data,'../input/clrprobertamodels/model2/model2.bin')

# train_embeddings4 =  get_embeddings(train_data,'../input/clrprobertamodels/model3/model3.bin')
# test_embeddings4 = get_embeddings(test_data,'../input/clrprobertamodels/model3/model3.bin')

# train_embeddings5 =  get_embeddings(train_data,'../input/clrprobertamodels/model4/model4.bin')
# test_embeddings5 = get_embeddings(test_data,'../input/clrprobertamodels/model4/model4.bin')

## svm

In [None]:
def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Fit one SVR per stratified fold and average their test predictions.

    Args:
        X: training feature matrix.
        y: training targets, aligned with X.
        X_test: test feature matrix.
        bins: per-row labels used to stratify the folds.
        nfolds: number of folds, used for both splitting and averaging.
        C, kernel: forwarded to sklearn's SVR.

    Returns:
        1-D array with the mean test prediction over the folds.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # Fixed: the split count used to come from config['nfolds'] while the
    # average divided by `nfolds`; both now use the same parameter so the mean
    # cannot be mis-scaled when the two differ.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [None]:
# svm_preds1 = get_preds_svm(train_embeddings1,target,test_embeddings1).mean(axis=1)
# svm_preds2 = get_preds_svm(train_embeddings2,target,test_embeddings2).mean(axis=1)
# svm_preds3 = get_preds_svm(train_embeddings3,target,test_embeddings3).mean(axis=1)
# svm_preds4 = get_preds_svm(train_embeddings4,target,test_embeddings4).mean(axis=1)
# svm_preds5 = get_preds_svm(train_embeddings5,target,test_embeddings5).mean(axis=1)

In [None]:
# train_embeddings1 =  get_embeddings(train_data,'../input/clrprobertamodels/model0/model0.bin')
# test_embeddings1 = get_embeddings(test_data,'../input/clrprobertamodels/model0/model0.bin')
# svm_preds1 = get_preds_svm(train_embeddings1,target,test_embeddings1)

In [None]:
# For each of the five checkpoints (model0..model4): embed train and test,
# fit the cross-validated SVR, then delete the embeddings and collect garbage
# before the next checkpoint is loaded.
train_embeddings1 =  get_embeddings(train_data,'../input/clrprobertamodels/model0/model0.bin')
test_embeddings1 = get_embeddings(test_data,'../input/clrprobertamodels/model0/model0.bin')
svm_preds1 = get_preds_svm(train_embeddings1,target,test_embeddings1)
del train_embeddings1,test_embeddings1
gc.collect()

train_embeddings2 =  get_embeddings(train_data,'../input/clrprobertamodels/model1/model1.bin')
test_embeddings2 = get_embeddings(test_data,'../input/clrprobertamodels/model1/model1.bin')
svm_preds2 = get_preds_svm(train_embeddings2,target,test_embeddings2)
del train_embeddings2,test_embeddings2
gc.collect()

train_embeddings3 =  get_embeddings(train_data,'../input/clrprobertamodels/model2/model2.bin')
test_embeddings3 = get_embeddings(test_data,'../input/clrprobertamodels/model2/model2.bin')
svm_preds3 = get_preds_svm(train_embeddings3,target,test_embeddings3)
del train_embeddings3,test_embeddings3
gc.collect()

train_embeddings4 =  get_embeddings(train_data,'../input/clrprobertamodels/model3/model3.bin')
test_embeddings4 = get_embeddings(test_data,'../input/clrprobertamodels/model3/model3.bin')
svm_preds4 = get_preds_svm(train_embeddings4,target,test_embeddings4)
del train_embeddings4,test_embeddings4
gc.collect()

train_embeddings5 =  get_embeddings(train_data,'../input/clrprobertamodels/model4/model4.bin')
test_embeddings5 = get_embeddings(test_data,'../input/clrprobertamodels/model4/model4.bin')
svm_preds5 = get_preds_svm(train_embeddings5,target,test_embeddings5)
del train_embeddings5,test_embeddings5
# The raw tables are dropped as well; only svm_preds1..5 survive this cell.
del train_data, test_data
gc.collect()

In [None]:
# svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5
# Weighted blend of the five per-checkpoint predictions using W475.
svm_preds475 = (
    svm_preds1*W475[0]
    + svm_preds2*W475[1]
    + svm_preds3*W475[2]
    + svm_preds4*W475[3]
    + svm_preds5*W475[4]
)

In [None]:
# svm_preds475 = svm_preds
# Notebook display: show the first ten blended predictions.
svm_preds475[:10]

# svm_preds 476

https://www.kaggle.com/maunish/clrp-roberta-svm?scriptVersionId=64600791

In [None]:
# Blend weights for the 476 section: one equal weight per svm_preds1..5.
W476 = [0.2 for _ in range(5)]

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
# Reload the competition tables for this section.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Targets as an array for the regressor.
target = train_data['target'].to_numpy()

# floor(1 + log2(n_rows)) bins over the target give StratifiedKFold
# discrete labels to balance across folds.
row_count = len(train_data)
num_bins = int(np.floor(np.log2(row_count) + 1))
train_data.loc[:, 'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Root-mean-squared error of `y_pred` against `y_true`."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

In [None]:
# Run settings read by the dataset, the DataLoader and the SVR cross-validation.
config = {
    'batch_size':128,
    'max_len':256,
    'nfolds':5,
    'seed':42,
}

def seed_everything(seed=42):
    """Seed every RNG in use (Python, NumPy, PyTorch) and make cuDNN deterministic.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    # Fixed typo: 'PYTHONASSEED' is not a variable Python reads. PYTHONHASHSEED
    # is read at interpreter start-up, so it affects processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Fixed: benchmark=True enables cuDNN auto-tuning, whose algorithm choice
    # can vary between runs and so contradicts the deterministic flag.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Dataset over df['excerpt'] that tokenizes each text lazily in __getitem__."""

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        passage = self.excerpt[idx]
        # Fixed-length encoding: pad or truncate to config['max_len'].
        return self.tokenizer(passage,return_tensors='pt',
                              max_length=config['max_len'],
                              padding='max_length',truncation=True)

In [None]:
class AttentionHead(nn.Module):
    """Softmax-weighted sum of `features` along dim 1.

    Scores come from V(tanh(W(features))). `num_targets` is accepted but unused.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        # NOTE(review): reported as hidden_dim even though forward() yields
        # in_features-sized vectors; harmless only while the two are equal.
        self.out_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        hidden = torch.tanh(self.W(features))
        weights = torch.softmax(self.V(hidden), dim=1)
        pooled = (weights * features).sum(dim=1)
        return pooled

In [None]:
class Model(nn.Module):
    """Encoder loaded from '../input/roberta-base' followed by AttentionHead pooling."""

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768,768,1)
        # Unused by forward(); presumably present so checkpoint keys line up
        # when load_state_dict is called — TODO confirm.
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        # Only the pooled vector is returned; the linear layer is not applied.
        sequence_output = self.roberta(**xb)[0]
        return self.head(sequence_output)

In [None]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed every excerpt in `df` with the `Model` checkpoint stored at `path`.

    Args:
        df: DataFrame with an 'excerpt' column.
        path: file holding a state_dict for `Model`.
        plot_losses, verbose: accepted for compatibility; not used here.

    Returns:
        NumPy array with one embedding row per excerpt, in `df` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # Without map_location a GPU-saved checkpoint cannot be loaded when the
    # CPU branch above is taken.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,  # output rows must stay aligned with df
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = list()
    with torch.no_grad():
        # tqdm wraps the loader itself so the progress bar knows its length.
        for inputs in tqdm(dl):
            # Collapse all trailing dims into one per sample, then move to device.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs.detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

In [None]:
def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=20,kernel='rbf'):
    """Cross-validated SVR: report per-fold RMSE and return the mean test prediction.

    Args:
        X: training feature matrix.
        y: training targets, aligned with X.
        X_test: test feature matrix.
        bins: per-row labels used to stratify the folds.
        nfolds: number of folds, used for both splitting and averaging.
        C, kernel: forwarded to sklearn's SVR.

    Returns:
        1-D array with the mean of the per-fold test predictions.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))

    # Fixed: n_splits previously read config['nfolds'] while the final average
    # divided by `nfolds`; one source of truth keeps the mean correctly scaled.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [None]:
# train_embeddings1 =  get_embeddings(train_data,'../input/clrprobertamodels/model0/model0.bin')
# test_embeddings1 = get_embeddings(test_data,'../input/clrprobertamodels/model0/model0.bin')
# svm_preds1 = get_preds_svm(train_embeddings1,target,test_embeddings1)
# del train_embeddings1,test_embeddings1
# gc.collect()

# train_embeddings2 =  get_embeddings(train_data,'../input/clrprobertamodels/model1/model1.bin')
# test_embeddings2 = get_embeddings(test_data,'../input/clrprobertamodels/model1/model1.bin')
# svm_preds2 = get_preds_svm(train_embeddings2,target,test_embeddings2)
# del train_embeddings2,test_embeddings2
# gc.collect()

# train_embeddings3 =  get_embeddings(train_data,'../input/clrprobertamodels/model2/model2.bin')
# test_embeddings3 = get_embeddings(test_data,'../input/clrprobertamodels/model2/model2.bin')
# svm_preds3 = get_preds_svm(train_embeddings3,target,test_embeddings3)
# del train_embeddings3,test_embeddings3
# gc.collect()

# train_embeddings4 =  get_embeddings(train_data,'../input/clrprobertamodels/model3/model3.bin')
# test_embeddings4 = get_embeddings(test_data,'../input/clrprobertamodels/model3/model3.bin')
# svm_preds4 = get_preds_svm(train_embeddings4,target,test_embeddings4)
# del train_embeddings4,test_embeddings4
# gc.collect()

# train_embeddings5 =  get_embeddings(train_data,'../input/clrprobertamodels/model4/model4.bin')
# test_embeddings5 = get_embeddings(test_data,'../input/clrprobertamodels/model4/model4.bin')
# svm_preds5 = get_preds_svm(train_embeddings5,target,test_embeddings5)
# del train_embeddings5,test_embeddings5
# del train_data, test_data
# gc.collect()

In [None]:
# svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5
# Weighted blend using W476.
# NOTE(review): the cell that would recompute svm_preds1..5 for this section is
# commented out, so this blends whatever those names currently hold —
# presumably the predictions from the 475 section. Confirm this is intended.
svm_preds476 = (
    svm_preds1*W476[0]
    + svm_preds2*W476[1]
    + svm_preds3*W476[2]
    + svm_preds4*W476[3]
    + svm_preds5*W476[4]
)

# SVM 4780
https://www.kaggle.com/duttadebadri/eda-basline-modeling-commonlit-passages?scriptVersionId=64615496

In [None]:
# Blend weights for the 4780 section: one equal weight per svm_preds1..5.
W4780 = [0.2 for _ in range(5)]

In [None]:
# Notebook setup for this section: imports interleaved with one-off settings.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
# Default seaborn colour palette, kept for later plotting.
color = sns.color_palette()
import plotly.graph_objs as go
# Enable inline plotly rendering in the notebook.
py.init_notebook_mode(connected=True)
import plotly.tools as tls
import warnings
warnings.filterwarnings('ignore')
import os
# The listing is not stored; mid-cell, the value is simply discarded.
os.listdir("../input/commonlitreadabilityprize")
from nltk.corpus import stopwords
import string
# English stop-word set built from the NLTK corpus.
eng_stopwords = set(stopwords.words("english"))
# Silence pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

In [None]:
from sklearn.svm import SVR
import random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from tqdm import tqdm

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

# Competition tables: training rows, test rows and the submission template.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Targets as an array for the regressor.
target = train_data['target'].to_numpy()

# Stratification labels for the k-fold split: the target is cut into
# floor(1 + log2(n_rows)) bins.
sample_count = len(train_data)
num_bins = int(np.floor(np.log2(sample_count) + 1))
train_data.loc[:, 'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Square root of the mean squared error between the two inputs."""
    mean_sq = mean_squared_error(y_true, y_pred)
    return np.sqrt(mean_sq)

# Settings read by get_embeddings (batch size, token length) plus the global seed.
config = {
    'batch_size':32,
    'max_len':512,
    'seed':23,
}

def seed_everything(seed=23):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Fixed: was 'PYTHONASSEED', which no interpreter reads. PYTHONHASHSEED is
    # read at start-up, so it only influences processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Fixed: benchmark mode auto-tunes kernels non-deterministically, so it
    # must be off for the flag above to mean anything.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per index.

    Fixed: this class used to inherit from nn.Module without ever calling
    Module.__init__, leaving a half-initialised module. It is a data
    container, so it now derives from torch's Dataset.

    Args:
        df: DataFrame with an 'excerpt' column.
        tokenizer: callable invoked as tokenizer(text, **encoding_kwargs).
        max_len: length every excerpt is padded or truncated to.
    """

    def __init__(self,df,tokenizer,max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return encode

    def __len__(self):
        return len(self.excerpt)
    
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Embed df['excerpt'] with the pretrained model and tokenizer stored at `path`.

    `plot_losses` and `verbose` are accepted but not used. Returns a NumPy
    array with one row per excerpt, in `df` order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    loader = DataLoader(CLRPDataset(df,tokenizer,config['max_len']),
                        batch_size=config["batch_size"],
                        shuffle=False,
                        num_workers=4,
                        pin_memory=True,
                        drop_last=False)

    collected = []
    with torch.no_grad():
        for _, batch in tqdm(enumerate(loader)):
            # Flatten everything after the batch dim and move it to the device.
            batch = {name: tensor.reshape(tensor.shape[0],-1).to(device)
                     for name, tensor in batch.items()}
            # Index 0 along dim 1 of the first model output — presumably the
            # hidden state of the first token; TODO confirm.
            first_position = model(**batch)[0][:,0]
            collected.extend(first_position.detach().cpu().numpy())
    return np.array(collected)



def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Fit a default SVR on each stratified fold and average the test predictions.

    NOTE(review): `C` and `kernel` are accepted but never forwarded — every
    fold uses SVR() with its library defaults. Left unchanged because
    honouring them would alter the produced predictions.

    Returns a 1-D array: the sum of per-fold test predictions divided by `nfolds`.
    """
    splitter = StratifiedKFold(n_splits=nfolds)
    fold_scores = []
    test_total = np.zeros((X_test.shape[0]))
    for k, (train_idx,valid_idx) in enumerate(splitter.split(X,bins)):
        regressor = SVR()
        regressor.fit(X[train_idx], y[train_idx])

        valid_pred = regressor.predict(X[valid_idx])
        score = rmse_score(valid_pred, y[valid_idx])
        print(f'Fold {k} , rmse score: {score}')
        fold_scores.append(score)

        test_total += regressor.predict(X_test)

    print("mean rmse",np.mean(fold_scores))
    return np.array(test_total)/nfolds

# Embed train and test with each of the five models (modelf1..modelf5).
# All ten embedding arrays stay bound; nothing is deleted in this section.
train_embeddings1 =  get_embeddings(train_data,'../input/modelf1')
test_embeddings1 = get_embeddings(test_data,'../input/modelf1')

train_embeddings2 =  get_embeddings(train_data,'../input/modelf2')
test_embeddings2 = get_embeddings(test_data,'../input/modelf2')

train_embeddings3 =  get_embeddings(train_data,'../input/modelf3')
test_embeddings3 = get_embeddings(test_data,'../input/modelf3')

train_embeddings4 =  get_embeddings(train_data,'../input/modelf4')
test_embeddings4 = get_embeddings(test_data,'../input/modelf4')

train_embeddings5 =  get_embeddings(train_data,'../input/modelf5')
test_embeddings5 = get_embeddings(test_data,'../input/modelf5')

# One cross-validated SVR run per embedding set; these rebind svm_preds1..5.
svm_preds1 = get_preds_svm(train_embeddings1,target,test_embeddings1)
svm_preds2 = get_preds_svm(train_embeddings2,target,test_embeddings2)
svm_preds3 = get_preds_svm(train_embeddings3,target,test_embeddings3)
svm_preds4 = get_preds_svm(train_embeddings4,target,test_embeddings4)
svm_preds5 = get_preds_svm(train_embeddings5,target,test_embeddings5)

# svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5

# sample.target = svm_preds
# sample.to_csv('submission.csv',index=False)

In [None]:
# svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5
# Weighted blend of the five per-model predictions using W4780.
svm_preds4780 = (
    svm_preds1*W4780[0]
    + svm_preds2*W4780[1]
    + svm_preds3*W4780[2]
    + svm_preds4*W4780[3]
    + svm_preds5*W4780[4]
)

# SVM 4781

https://www.kaggle.com/lars123/neural-tangent-kernel-2?scriptVersionId=63130079

In [None]:
# Equal blend weights for the 4781 section; the blend that reads them is
# commented out at the end of this section.
W4781 = [0.2 for _ in range(5)]

In [None]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool, CatBoost

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, 
                                      ReduceLROnPlateau)

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

In [None]:
# Competition tables: training rows, test rows and the submission template.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Regression target as a NumPy array.
target = train_data['target'].to_numpy()

# Labels for the stratified k-fold: the target cut into
# floor(1 + log2(n_rows)) bins.
n_train = len(train_data)
num_bins = int(np.floor(np.log2(n_train) + 1))
train_data.loc[:, 'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
bins = train_data['bins'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return sqrt(mean_squared_error(y_true, y_pred))."""
    err = mean_squared_error(y_true, y_pred)
    return np.sqrt(err)

In [None]:
# Settings read by get_embeddings (batch size, token length) plus the global seed.
config = {
    'batch_size':128,
    'max_len':256,
    'seed':42,
}

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Fixed: the key was misspelled 'PYTHONASSEED'. PYTHONHASHSEED is read at
    # interpreter start-up, so it takes effect in processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Fixed: cuDNN benchmark auto-tuning conflicts with deterministic mode.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset yielding the tokenizer encoding of one excerpt per index.

    Fixed: the base class was nn.Module, and Module.__init__ was never called.
    This object only serves items to a DataLoader, so Dataset is the correct
    base class.

    Args:
        df: DataFrame with an 'excerpt' column.
        tokenizer: callable invoked as tokenizer(text, **encoding_kwargs).
        max_len: length every excerpt is padded or truncated to.
    """

    def __init__(self,df,tokenizer,max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return encode

    def __len__(self):
        return len(self.excerpt)

In [None]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Return one embedding row per excerpt in `df`, using the model saved at `path`.

    The model and its tokenizer are both loaded from `path`. `plot_losses`
    and `verbose` are accepted but not used.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    dataset = CLRPDataset(df,tokenizer,config['max_len'])
    loader = DataLoader(dataset,
                        batch_size=config["batch_size"],
                        shuffle=False,
                        num_workers=4,
                        pin_memory=True,
                        drop_last=False)

    rows = []
    with torch.no_grad():
        for _, encoded in tqdm(enumerate(loader)):
            # Reshape each tensor to (batch, -1) before moving it to the device.
            encoded = {k: v.reshape(v.shape[0],-1).to(device) for k, v in encoded.items()}
            result = model(**encoded)
            # Index 0 along dim 1 of the first output — presumably the
            # first-token hidden state; TODO confirm.
            rows.extend(result[0][:,0].detach().cpu().numpy())
    return np.array(rows)

In [None]:
from jax import random
from neural_tangents import stax
import neural_tangents as nt

def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Per-fold kernel regression with a neural-tangents network, averaged over folds.

    Despite the name, no SVM is fitted here: each fold builds a stax network,
    passes its kernel_fn to nt.predict.gradient_descent_mse_ensemble and
    queries the resulting predictor with get='nngp'.

    Args:
        X: training feature matrix.
        y: training targets, aligned with X.
        X_test: test feature matrix.
        bins: per-row labels used to stratify the folds.
        nfolds: number of folds and the divisor of the summed test predictions.
        C, kernel: accepted but not used in this function.

    Returns:
        The accumulated test predictions divided by nfolds.
        NOTE(review): targets are passed as a column (y_train[:,np.newaxis]),
        so predictions presumably come back as a column too. Adding them into
        the 1-D `preds` buffer relies on broadcasting, and the commented-out
        callers apply .mean(axis=1) — verify the resulting shape.
    """
    # Unshuffled stratified folds over the binned target.
    kfold = StratifiedKFold(n_splits=nfolds)
    scores = list()
    preds = np.zeros((X_test.shape[0]))
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        # Residual block: a three-layer Erf/Dense branch summed with an identity branch.
        # The architecture does not depend on the fold; it is rebuilt each iteration.
        ResBlock = stax.serial(
                        stax.FanOut(2),
                        stax.parallel(
                            stax.serial(
                                stax.Erf(),
                                stax.Dense(1, W_std=1.25, b_std=0.0),
                                stax.Erf(),
                                stax.Dense(1, W_std=1.25, b_std=0.0),
                                stax.Erf(),
                                stax.Dense(1, W_std=1.25, b_std=0.0),
                            ),
                            stax.Identity(),
                        ),
                        stax.FanInSum()
                    )

        # Only kernel_fn is consumed below; apply_fn is never called.
        init_fn, apply_fn, kernel_fn = stax.serial(
                stax.Dense(1, W_std=1.0, b_std=0),
                ResBlock, ResBlock, stax.Erf(),
                stax.Dense(1, W_std=2.5, b_std=0.1)
        )

        # `random` is jax.random here (imported just above this function).
        key = random.PRNGKey(10)
        # params is computed but not used afterwards.
        _, params = init_fn(key, input_shape=X_train.shape)
        predict_fn = nt.predict.gradient_descent_mse_ensemble(kernel_fn,
                                                                  X_train,
                                                                  y_train[:,np.newaxis],
                                                                  diag_reg=1e-1,
                                                                  lr=1)
        # t=None presumably requests the fully-trained (infinite-time) predictor —
        # TODO confirm against the neural-tangents documentation.
        prediction = predict_fn(x_test=X_valid, get='nngp', t=None)#model.predict(X_valid)
        score = rmse_score(prediction,y_valid)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += predict_fn(x_test=X_test, get='nngp', t=None)#model.predict(X_test)
        
    print("mean rmse",np.mean(scores))
    return np.array(preds)/nfolds

In [None]:
# train_embeddings1 =  get_embeddings(train_data,'../input/modelf1')
# test_embeddings1 = get_embeddings(test_data,'../input/modelf1')
# svm_p1 = get_preds_svm(train_embeddings1,target,test_embeddings1).mean(axis=1)
# del train_embeddings1,test_embeddings1

# train_embeddings2 =  get_embeddings(train_data,'../input/modelf2')
# test_embeddings2 = get_embeddings(test_data,'../input/modelf2')
# svm_p2 = get_preds_svm(train_embeddings2,target,test_embeddings2).mean(axis=1)
# del train_embeddings2,test_embeddings2


# train_embeddings3 =  get_embeddings(train_data,'../input/modelf3')
# test_embeddings3 = get_embeddings(test_data,'../input/modelf3')
# svm_p3 = get_preds_svm(train_embeddings3,target,test_embeddings3).mean(axis=1)
# del train_embeddings3,test_embeddings3

# train_embeddings4 =  get_embeddings(train_data,'../input/modelf4')
# test_embeddings4 = get_embeddings(test_data,'../input/modelf4')
# svm_p4 = get_preds_svm(train_embeddings4,target,test_embeddings4).mean(axis=1)
# del train_embeddings4,test_embeddings4

# train_embeddings5 =  get_embeddings(train_data,'../input/modelf5')
# test_embeddings5 = get_embeddings(test_data,'../input/modelf5')
# svm_p5 = get_preds_svm(train_embeddings5,target,test_embeddings5).mean(axis=1)
# del train_embeddings5,test_embeddings5

# del train_data, test_data

In [None]:
# # svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5
# svm_preds4781 = svm_preds1*W4781[0] + svm_preds2*W4781[1] + \
#     svm_preds3*W4781[2] + svm_preds4*W4781[3] + svm_preds5*W4781[4]

# SVM 479

https://www.kaggle.com/sourabhy/commonlit-roberta-ensemble-multichannel-cnn?scriptVersionId=63751318

In [None]:
import numpy as np
import pandas as pd 
import os
import gc
import sys
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error

from transformers import AutoModel, AutoTokenizer
import json
from tensorflow.keras.models import load_model
import re
import pandas as pd
import string
import keras
from sklearn.svm import SVR

In [None]:
# Competition folder and its three tables.
data_dir = '../input/commonlitreadabilityprize/'
train = pd.read_csv(f'{data_dir}train.csv')
test = pd.read_csv(f'{data_dir}test.csv')
sample_submission = pd.read_csv(f'{data_dir}sample_submission.csv')

# Regression target as a NumPy array.
target = train['target'].to_numpy()


def rmse_score(y_true,y_pred):
    """Root-mean-squared error between targets and predictions."""
    mse_value = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse_value)

In [None]:
# source: https://www.kaggle.com/maunish/clrp-roberta-lgbm

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Fixed: 'PYTHONASSEED' was a typo for PYTHONHASHSEED. The variable is
    # read at interpreter start-up, so it affects processes launched later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Fixed: benchmark auto-tuning may select different kernels per run, so it
    # is disabled to honour the deterministic flag.
    torch.backends.cudnn.benchmark = False


class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per index.

    Fixed: the class derived from nn.Module without calling Module.__init__.
    It only feeds a DataLoader, so it now derives from torch.utils.data.Dataset,
    referenced through the already-imported `torch` package.

    Args:
        df: DataFrame with an 'excerpt' column.
        tokenizer: callable invoked as tokenizer(text, **encoding_kwargs).
        max_len: length every excerpt is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return encode

    def __len__(self):
        return len(self.excerpt)
    

def get_embeddings(df, path, plot_losses=True, verbose=True):
    """Encode every excerpt in `df` with the transformer stored at `path`.

    Args:
        df: DataFrame handed to `CLRPDataset` (which reads its 'excerpt'
            column).
        path: Location passed to `AutoModel.from_pretrained` and
            `AutoTokenizer.from_pretrained`.
        plot_losses: Unused; kept so existing callers keep working.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        np.ndarray with one row per excerpt, taken from position 0 of the
        model's first output (presumably the first-token hidden state --
        confirm against the checkpoint's architecture).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df, tokenizer, config['max_len'])
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,
                    num_workers=4,
                    # Pinned host memory only helps host-to-GPU copies.
                    pin_memory=device.type == "cuda",
                    drop_last=False)

    embeddings = []
    with torch.no_grad():
        # Wrap the loader itself (not enumerate(...)) so tqdm can read its
        # length and show a total; the batch index was never used.
        for inputs in tqdm(dl):
            # Collapse everything after the batch axis so each tensor is
            # (batch, tokens) before it reaches the model.
            inputs = {key: val.reshape(val.shape[0], -1).to(device)
                      for key, val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs[0][:, 0].detach().cpu().numpy())
    return np.array(embeddings)

# Settings read by the embedding extraction and the K-fold heads in this
# section; seeding happens right away so later cells start from a fixed state.
config = dict(batch_size=128, max_len=256, seed=42)
seed_everything(seed=config['seed'])

# train_embeddings =  get_embeddings(train,'../input/modelf1')
# test_embeddings = get_embeddings(test,'../input/modelf1')

# train_embeddings2 =  get_embeddings(train,'../input/modelf2')
# test_embeddings2 = get_embeddings(test,'../input/modelf2')

# train_embeddings3 =  get_embeddings(train,'../input/modelf3')
# test_embeddings3 = get_embeddings(test,'../input/modelf3')

# train_embeddings4 =  get_embeddings(train,'../input/modelf4')
# test_embeddings4 = get_embeddings(test,'../input/modelf4')

# train_embeddings5 =  get_embeddings(train,'../input/modelf5')
# test_embeddings5 = get_embeddings(test,'../input/modelf5')

In [None]:
from sklearn.model_selection import train_test_split 
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM,Dropout,concatenate
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow.python.keras.layers import Dense, Activation, Embedding, LSTM,Dropout,Bidirectional,GRU
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Flatten ,Embedding,Input,Conv1D,GlobalAveragePooling1D,GlobalMaxPooling1D,Dropout,MaxPooling1D,Bidirectional,GRU,Concatenate
from keras.models import Sequential,Model
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
def _conv_channel(input_dim):
    """Build one Conv1D tower; return its (input tensor, pooled output)."""
    channel_in = Input(shape=(input_dim, 1))
    x = Conv1D(64, 5, padding='valid', kernel_initializer='normal', activation='relu')(channel_in)
    x = MaxPooling1D(2)(x)
    x = Conv1D(128, 5, padding='valid', kernel_initializer='normal', activation='relu')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(256, 5, padding='valid', kernel_initializer='normal', activation='relu')(x)
    return channel_in, GlobalMaxPooling1D()(x)


def crt_model(n_channels=10, input_dim=768):
    """Create and compile the multichannel 1-D CNN regressor.

    Every channel is an independent tower (three Conv1D layers with two
    max-pools between them, then a global max-pool). The pooled outputs are
    concatenated and passed through two dense layers to a single linear
    output. The defaults reproduce the original ten-tower, 768-step network.

    Args:
        n_channels: Number of parallel towers (and therefore model inputs).
        input_dim: Length of each input sequence; every input has shape
            (input_dim, 1).

    Returns:
        A compiled Keras `Model` taking a list of `n_channels` inputs,
        trained with mean squared error and the Adam optimizer.

    Raises:
        ValueError: If `n_channels` is smaller than 1.
    """
    if n_channels < 1:
        raise ValueError("n_channels must be at least 1")

    # Towers are created one after another, in the same order as the
    # original hand-written copies.
    towers = [_conv_channel(input_dim) for _ in range(n_channels)]
    inputs = [channel_in for channel_in, _ in towers]
    pooled = [channel_out for _, channel_out in towers]

    # With a single tower there is nothing to merge.
    merged = concatenate(pooled) if n_channels > 1 else pooled[0]

    hidden = Dense(120, kernel_initializer='normal', activation='relu')(merged)
    hidden = Dense(240, kernel_initializer='normal', activation='relu')(hidden)
    output = Dense(1, kernel_initializer='normal')(hidden)

    model = Model(inputs=inputs, outputs=output)
    model.compile(loss='mean_squared_error', optimizer='adam',
                  metrics=[keras.metrics.MeanSquaredError()])
    return model

In [None]:
# Build one instance of the network and draw its layer graph. The plot call
# is the last expression of the cell, so presumably its return value is
# what the notebook displays.
model=crt_model()
keras.utils.plot_model(model)

In [None]:
# model.summary()

In [None]:
def get_res(train_embedd, target, test_embedd, nfolds=5, epochs=7, batch_size=8):
    """Train the multichannel CNN with K-fold CV and average test predictions.

    A fresh model from `crt_model()` is fitted per fold. The same embedding
    matrix is fed to every input of the model.

    Args:
        train_embedd: 2-D NumPy array of training embeddings, one row per
            sample.
        target: 1-D NumPy array of regression targets aligned with
            `train_embedd`.
        test_embedd: 2-D NumPy array of test embeddings.
        nfolds: Number of cross-validation folds.
        epochs: Training epochs per fold.
        batch_size: Mini-batch size passed to `model.fit`.

    Returns:
        1-D np.ndarray of test predictions averaged over the folds.
    """
    scores = []
    preds = np.zeros(test_embedd.shape[0])

    # Append the trailing channel axis once, for every split. Previously
    # only the training slice received it, so validation and test inputs
    # did not match the declared input shape.
    train_feats = train_embedd.reshape(train_embedd.shape + (1,))
    test_feats = test_embedd.reshape(test_embedd.shape + (1,))

    kf = KFold(n_splits=nfolds, shuffle=True, random_state=config['seed'])
    # Split on the array actually passed in, not on the global `train` frame.
    for k, (train_idx, valid_idx) in enumerate(kf.split(train_embedd)):
        model = crt_model()
        # Feed the same features to however many inputs the model declares.
        n_channels = len(model.inputs)

        fit_x = [train_feats[train_idx]] * n_channels
        val_x = [train_feats[valid_idx]] * n_channels
        fit_y, val_y = target[train_idx], target[valid_idx]

        model.fit(fit_x, fit_y, epochs=epochs,
                  validation_data=(val_x, val_y), batch_size=batch_size)

        # Flatten the predictions so both operands of the metric are 1-D.
        val_pred = model.predict(val_x).reshape(-1)
        score = rmse_score(val_pred, val_y)
        scores.append(score)
        print(f'Fold {k} , rmse score: {score}')

        preds += model.predict([test_feats] * n_channels).reshape(-1)

    print("mean rmse",np.mean(scores))
    return preds / nfolds

In [None]:
def get_preds_svm(X,y,X_test,nfolds=5,C=10,kernel='rbf'):
    """Fit an SVR per cross-validation fold and average its test predictions.

    Args:
        X: 2-D NumPy array of training features.
        y: 1-D NumPy array of targets aligned with `X`.
        X_test: 2-D NumPy array of test features.
        nfolds: Number of cross-validation folds.
        C: Regularisation strength forwarded to `SVR`.
        kernel: Kernel name forwarded to `SVR`.

    Returns:
        1-D np.ndarray of test predictions averaged over the folds.
    """
    scores = []
    preds = np.zeros(X_test.shape[0])
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=config['seed'])
    # Split on the features passed in rather than the global `train` frame,
    # so the function also works when X is not row-aligned with it.
    for k, (train_idx, valid_idx) in enumerate(kf.split(X)):
        model = SVR(C=C, kernel=kernel, gamma='auto')
        model.fit(X[train_idx], y[train_idx])

        prediction = model.predict(X[valid_idx])
        score = rmse_score(prediction, y[valid_idx])
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)

        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return preds / nfolds

In [None]:
def _predict_with_backbone(model_dir):
    """Return (CNN predictions, SVR predictions) for one backbone directory.

    Embeds the train and test frames with the model at `model_dir`, fits
    both K-fold heads on the train embeddings, and frees the embeddings
    before returning.
    """
    train_emb = get_embeddings(train, model_dir)
    test_emb = get_embeddings(test, model_dir)
    cnn_pred = get_res(train_emb, target, test_emb)
    svm_pred = get_preds_svm(train_emb, target, test_emb)
    # Drop the embedding matrices before the next backbone is loaded.
    del train_emb, test_emb
    gc.collect()
    return cnn_pred, svm_pred


# One call per backbone; the result names match what later cells expect.
pred1, svm_preds1 = _predict_with_backbone('../input/modelf1')
pred2, svm_preds2 = _predict_with_backbone('../input/modelf2')
pred3, svm_preds3 = _predict_with_backbone('../input/modelf3')
pred4, svm_preds4 = _predict_with_backbone('../input/modelf4')
pred5, svm_preds5 = _predict_with_backbone('../input/modelf5')

In [None]:
# Equal-weight average over the five backbones, separately for each head.
# Seeding sum() with the first array keeps the left-to-right addition order.
cnn_head_preds = (pred1, pred2, pred3, pred4, pred5)
svr_head_preds = (svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5)
preds = sum(cnn_head_preds[1:], cnn_head_preds[0]) / 5
svm_preds = sum(svr_head_preds[1:], svr_head_preds[0]) / 5

In [None]:
# Midpoint of the CNN-head and SVR-head averages.
svm_preds479 = 0.5 * (preds + svm_preds)

# SVM 480

In [None]:
# Five equal weights of 0.2 each (presumably one per model in a blend).
W480 = [0.2 for _ in range(5)]

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
class Dataset:
    """Indexable collection that tokenizes excerpts on demand.

    Each item is a dict with 'input_ids' and 'attention_mask' tensors of
    dtype long, built from the tokenizer output for that excerpt.
    """

    # Tokenizer output fields that are converted to tensors, in output order.
    _FIELDS = ("input_ids", "attention_mask")

    def __init__(self, excerpt, tokenizer, max_len):
        self.excerpt = excerpt
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, item):
        # str() guards against non-string entries in the excerpt sequence.
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in self._FIELDS
        }

In [None]:
def generate_predictions(model_path, max_len):
    """Predict a score for every test excerpt with one fine-tuned model.

    Args:
        model_path: Location passed to
            `AutoModelForSequenceClassification.from_pretrained` and
            `AutoTokenizer.from_pretrained`.
        max_len: Token length each excerpt is padded/truncated to.

    Returns:
        1-D np.ndarray of the flattened logits in test-file order
        (one value per excerpt if the model has a single output -- confirm
        against the checkpoint).
    """
    # Fall back to CPU instead of crashing when no GPU is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=32,
        num_workers=4,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=device.type == "cuda",
        shuffle=False,
    )

    final_output = []
    # One no_grad context for the whole pass instead of one per batch.
    with torch.no_grad():
        for data in data_loader:
            batch = {key: value.to(device) for key, value in data.items()}
            logits = model(**batch).logits
            final_output.extend(logits.detach().cpu().numpy().ravel().tolist())

    if device.type == "cuda":
        torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# preds1 = generate_predictions("../input/a81653/", max_len=256)
# preds2 = generate_predictions("../input/a81656/", max_len=256)
# preds3 = generate_predictions("../input/a81657/", max_len=256)
# preds4 = generate_predictions("../input/a81660/", max_len=256)
# preds5 = generate_predictions("../input/a81675/", max_len=192)
# preds6 = generate_predictions("../input/a87832/", max_len=256)
gc.collect()

# Equal-weight average of six prediction arrays.
# NOTE(review): preds1..preds6 are only assigned in the commented-out
# generate_predictions calls above; unless they are defined earlier in the
# notebook this line raises NameError -- confirm before running.
preds480 = (preds1 + preds2 + preds3 + preds4 + preds5 + preds6) / 6

In [None]:
# # svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5
# svm_preds480 = svm_preds1*W480[0] + svm_preds2*W480[1] + \
#     svm_preds3*W480[2] + svm_preds4*W480[3] + svm_preds5*W480[4]

# Make sub

In [None]:
# Bare references to each prediction array: any name that is undefined
# raises NameError here. Presumably a sanity check before blending; in a
# notebook only the final expression would be echoed.
svm_preds475
svm_preds476
svm_preds4780
svm_preds4781
svm_preds479
preds480

In [None]:
# Start from the sample submission and write it out as submission.csv.
# NOTE(review): every `sub['target'] = ...` line below is commented out, so
# the file is written with the sample submission's values unchanged --
# re-enable the intended blend before submitting.
sub = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
# sub['target'] = ((RBWRTbase_pred_df.mean(axis=1) + RBWRTlarge_pred_df.mean(axis=1))/2).values.tolist()
# sub['target'] = pds1*W_M['RoBL'] + pds2*W_M['RoBB'] + pds3*W_M['SVM']
# sub['target'] = (svm_preds475 + svm_preds476)/2
# sub['target'] = (svm_preds475 + svm_preds476 + svm_preds4780 + svm_preds479)/4

# sub['target'] = (svm_preds475 + svm_preds4780 + svm_preds479)/3
sub.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head()