## Скачивание файлов, деление на фолды

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, math
from tqdm.notebook import tqdm

URL = 'https://storage.googleapis.com/oleg-zyablov/misc/VoiceMOS'
!wget -q {URL}/data.csv

data = pd.read_csv('data.csv')
data = data[data.subset.isin(['train', 'val']) & data.file_exists]
#data = data[data.subset.isin(['train']) & data.file_exists] #TODO заменить

n_folds = 5
fold_size = math.ceil(len(data) / n_folds)
cumsum = data.groupby('system').file.count().cumsum()
system_to_fold = {}
for i in range(n_folds):
  systems = cumsum[(cumsum > fold_size*i) & (cumsum <= fold_size*(i+1))].index.tolist()
  for s in systems:
    system_to_fold[s] = i

data['fold'] = None
for i in data.index:
  data.loc[i, 'fold'] = system_to_fold[data.loc[i, 'system']]

folds = data.fold.to_numpy()
is_val = data.subset == 'val'
y = data.score_mean.to_numpy()

def get_split(X, y, fold_idx):
  if fold_idx == 'default':
    train_idx = ~is_val
    val_idx = is_val
  else:
    train_idx = folds != i
    val_idx = folds == i
  return (X[train_idx], y[train_idx]), (X[val_idx], y[val_idx])

data.sample(5)

Unnamed: 0,subset,system,utterance,file,file_exists,score_mean,score_std,n_votes,votes1,votes2,votes3,votes4,votes5,fold
4302,train,sysb526f,uttfa3d263,sysb526f-uttfa3d263.wav,True,3.375,0.744,8,0,1,3,4,0,3
2695,train,sys56eb0,utt186960c,sys56eb0-utt186960c.wav,True,3.0,1.069,8,0,3,3,1,1,1
5575,train,sysf53fb,utt65c20f7,sysf53fb-utt65c20f7.wav,True,1.0,0.0,8,8,0,0,0,0,4
3322,train,sys83090,utt6258cf1,sys83090-utt6258cf1.wav,True,3.125,0.991,8,0,2,4,1,1,2
3334,train,sys83090,uttceea72e,sys83090-uttceea72e.wav,True,3.25,1.0351,8,0,2,3,2,1,2


In [2]:
data_sources = [
    'wave2vec2.feature_extractor.conv_layers.3',
    'wave2vec2.feature_extractor.conv_layers.5',
    'wave2vec2.feature_extractor.conv_layers.6',
    'wave2vec2.transformer.layers.0',
    'wave2vec2.transformer.layers.2',
    'wave2vec2.transformer.layers.4',
    'wave2vec2.transformer.layers.6',
    'wave2vec2.transformer.layers.8',
    'wave2vec2.transformer.layers.10',
    'wave2vec2.mix1',
    'wave2vec2.mix2',
    'wave2vec2.mix3',
]

data_processing_and_reduction = {
    'none': {'mean', 'mean_std', 'max'},
    'random_projection': {'mean', 'mean_std', 'max'},
    'random_rnn': {'last', 'mean', 'mean_std', 'max'},
    'random_esn': {'last', 'mean', 'mean_std', 'max'},
}

def get_filename(src_name, proc_name, reduction_name):
  return f'{src_name}__{proc_name}__{reduction_name}.npz'

filenames = []
for source in data_sources:
  for proc, reductions in data_processing_and_reduction.items():
    for reduction in reductions:
      filenames.append(get_filename(source, proc, reduction))

for filename in tqdm(filenames):
  !wget -q {URL}/datasets/{filename}

def load_dataset(source, proc, reduction):
  filename = get_filename(source, proc, reduction)
  return np.load(filename)['arr_0']

all_procs_and_reductions = []
for proc, reductions in data_processing_and_reduction.items():
    for reduction in reductions:
      all_procs_and_reductions.append(f'{proc}__{reduction}')

  0%|          | 0/168 [00:00<?, ?it/s]

In [5]:
import sklearn.linear_model, sklearn.ensemble
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler

sum_mse = 0
for i in range(5):
  (_, y_train), (_, y_val) = get_split(y, y, i)
  prediction = y_val.mean()
  sum_mse += mse(y_val, [prediction] * len(y_val))
print(sum_mse / 5)

0.890975555542332


In [None]:
results = pd.DataFrame(columns=all_procs_and_reductions, index=data_sources)

MIN = 1000

for source in data_sources:
  for proc, reductions in data_processing_and_reduction.items():
    for reduction in reductions:
      X = load_dataset(source, proc, reduction)
      X = StandardScaler().fit_transform(X)
      val_mse_sum = 0
      for i in range(5):
        (X_train, y_train), (X_val, y_val) = get_split(X, y, i)
        model = sklearn.linear_model.Ridge()
        model.fit(X_train, y_train)
        train_mse = mse(y_train, model.predict(X_train))
        val_mse = mse(y_val, model.predict(X_val))
        val_mse_sum += val_mse
      val_mse_mean = val_mse_sum / 5
      print(source, f'{proc}__{reduction}', val_mse_mean)
      results.loc[source, f'{proc}__{reduction}'] = val_mse_mean
      if MIN > val_mse_mean:
        print('MIN')
        MIN = val_mse_mean

In [7]:
f = get_filename('wave2vec2.feature_extractor.conv_layers.3', 'none', 'mean')
print(f)
np.load(f)['arr_0'].shape

wave2vec2.feature_extractor.conv_layers.3__none__mean.npz


(2641, 512)

In [8]:
f = get_filename('wave2vec2.transformer.layers.10', 'none', 'mean')
print(f)
np.load(f)['arr_0'].shape

wave2vec2.transformer.layers.10__none__mean.npz


(2641, 768)

In [None]:
from IPython.display import HTML, display

pd.set_option('display.float_format', '{:.3f}'.format)

def format_vertical_headers(df):
    """Display a dataframe with vertical column headers"""
    styles = [dict(selector="th", props=[('width', '40px')]),
              dict(selector="th.col_heading",
                   props=[("writing-mode", "vertical-rl"),
                          ('transform', 'rotateZ(180deg)'), 
                          ('height', '290px'),
                          ('vertical-align', 'top')])]
    return (df.round(3).style.set_table_styles(styles))

format_vertical_headers(results)

In [18]:
results.to_csv('1.csv')