In [101]:
from scipy.stats import pearsonr
import numpy
import pandas

def ccc(y_true, y_pred):
    """Concordance correlation coefficient between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (1-D).
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    tuple
        ``(ccc, rho)`` where ``rho`` is the Pearson correlation returned by
        ``scipy.stats.pearsonr`` and ``ccc`` is
        ``2 * rho * std_true * std_pred /
        (std_true**2 + std_pred**2 + (mean_pred - mean_true)**2)``.
    """
    # Only the `numpy` name is imported in this cell; the `np` alias is bound
    # later in the notebook, so using it here breaks on a fresh kernel.
    true_mean = numpy.mean(y_true)
    pred_mean = numpy.mean(y_pred)

    rho, _ = pearsonr(y_pred, y_true)

    std_predictions = numpy.std(y_pred)
    std_gt = numpy.std(y_true)

    concordance = 2 * rho * std_gt * std_predictions / (
        std_predictions ** 2 + std_gt ** 2 +
        (pred_mean - true_mean) ** 2)

    return concordance, rho

def ccc_scorer(model, x, y):
    """Score ``model`` on ``(x, y)`` with the concordance coefficient.

    Calls ``model.predict(x)``, evaluates ``ccc`` against ``y`` and returns
    only the first element of its result (the concordance value).
    """
    predictions = model.predict(x)
    scores = ccc(y, predictions)
    return scores[0]


In [102]:
import pandas as pd
import numpy as np
import nltk
import os

def _read_transcripts(csv_path):
    """Return the ``transcript`` column of ``csv_path`` as a NumPy string array.

    The builtin ``str`` is used for the cast: the ``np.str`` alias was
    deprecated in NumPy 1.20 and removed in 1.24. Missing values end up as
    the literal string ``'nan'`` after the cast, which the masks below rely on.
    """
    return pd.read_csv(csv_path).transcript.values.astype(str)


tr_X = _read_transcripts('omg_TrainTranscripts.csv')
tr_y = pd.read_csv('omg_TrainVideos.csv')
val_X = _read_transcripts('omg_ValidationTranscripts.csv')
val_y = pd.read_csv('omg_ValidationVideos.csv')
ts_X = _read_transcripts('omg_TestTranscripts.csv')
ts_y = pd.read_csv('omg_TestVideos_WithoutLabels.csv')

# True where a transcript is present (i.e. not the stringified NaN).
tr_mask = tr_X != 'nan'
val_mask = val_X != 'nan'
ts_mask = ts_X != 'nan'

# Drop rows without a transcript from the train and validation splits.
# The test split is left unfiltered here; only its mask is computed.
tr_X = tr_X[tr_mask]
tr_y = tr_y.loc[tr_mask]

val_X = val_X[val_mask]
val_y = val_y.loc[val_mask]

In [103]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

# Module-level stemmer. A SnowballStemmer("english") used to be created first
# and was immediately overwritten, so only the Porter stemmer is built.
stemmer = PorterStemmer()

def get_pos(X):
    """Append part-of-speech pseudo-tokens to every transcript.

    Each text is split on whitespace and tagged with ``nltk.pos_tag``. The
    result for a text is the original text, a space, and one ``POS::<tag>``
    token per word (space separated).

    Parameters
    ----------
    X : iterable of str
        Raw transcripts.

    Returns
    -------
    numpy.ndarray
        One augmented string per input transcript, in input order.
    """
    augmented = []
    for text in X:
        tokens = text.split()
        # Tagging an empty token list yields no pairs, which cannot be
        # unzipped; blank transcripts therefore simply get no POS tokens.
        tags = [tag for _, tag in nltk.pos_tag(tokens)] if tokens else []
        pos_tokens = ' '.join('POS::' + tag for tag in tags)
        augmented.append(text + ' ' + pos_tokens)
    return np.asarray(augmented)

# POS-augmented transcripts for the train, validation and test splits.
tr_X_POS, val_X_POS, ts_X_POS = [
    get_pos(split) for split in (tr_X, val_X, ts_X)
]

In [104]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import demo_sent_subjectivity
from nltk.corpus import stopwords
from textblob import TextBlob

# Negation tokens counted as a feature by the feature extractor.
negs = {'not', 'no', 'doesn\'t', 'don\'t', 'didn\'t', 'won\'t', 'wouldn\'t'}
# English stop words, held in a set for constant-time membership tests.
english_sw = set(stopwords.words('english'))
# Shared VADER analyser instance.
sid = SentimentIntensityAnalyzer()

def get_extra(X):
    """Compute hand-crafted sentiment and surface features per transcript.

    Columns, in order:
      0-3  VADER ``neg``, ``neu``, ``pos``, ``compound`` scores
      4    ``(pos + 1) / (neg + 1)``
      5    number of whitespace-separated tokens
      6    fraction of lower-cased tokens that are English stop words
           (0.0 for a blank transcript)
      7-8  ``TextBlob(text).sentiment[0]`` and ``[1]``
           (polarity and subjectivity per TextBlob's documentation)
      9    number of ``'*'`` characters in the text
      10   whether the text contains any ``'*'``
      11   number of tokens found in ``negs``

    Parameters
    ----------
    X : iterable of str
        Raw transcripts.

    Returns
    -------
    numpy.ndarray
        One row of the twelve features above per transcript.
    """
    rows = []
    for text in X:
        tokens = text.lower().split()
        n_tokens = len(tokens)
        n_stopwords = sum(1 for tok in tokens if tok in english_sw)
        # Guard against blank transcripts, which used to divide by zero.
        stopword_ratio = float(n_stopwords) / n_tokens if n_tokens else 0.0

        blob = TextBlob(text).sentiment
        pol = sid.polarity_scores(text)
        n_stars = text.count('*')

        rows.append([
            pol['neg'], pol['neu'], pol['pos'], pol['compound'],
            (pol['pos'] + 1) / (pol['neg'] + 1),
            n_tokens,
            stopword_ratio,
            blob[0], blob[1],
            n_stars, n_stars != 0,
            sum(1 for tok in tokens if tok in negs),
        ])
    return np.asarray(rows)

tr_vader = get_extra(tr_X)
val_vader = get_extra(val_X)
ts_vader = get_extra(ts_X)

In [105]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV
from sklearn.dummy import DummyRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn import metrics
from sklearn.neural_network import MLPRegressor
import tensorflow as tf

from keras.layers import Dense, Input, Concatenate, Dropout, GaussianDropout
from keras.models import Model
from keras.regularizers import l2
from sklearn.metrics import make_scorer
from keras.callbacks import ModelCheckpoint

from keras import backend as K

def cov_keras(a, b):
    """Sum of products of the mean-centred tensors ``a`` and ``b``.

    This is the un-normalised covariance: the sum is not divided by the
    number of elements. The unreachable ``K.dot`` variant that followed the
    ``return`` has been removed.
    """
    return K.sum((a - K.mean(a)) * (b - K.mean(b)))

# Small constant added to the denominators in my_ccc so that constant
# predictions or targets do not cause a division by zero.
EPS = 1e-20

def my_ccc(y_true, y_pred):
    """Concordance correlation coefficient written with Keras backend ops.

    Computes the Pearson correlation ``rho`` from the mean-centred tensors
    and returns
    ``2 * rho * std_true * std_pred /
    (std_pred**2 + std_true**2 + (mean_pred - mean_true)**2 + EPS)``.
    ``EPS`` is also added to the denominator of ``rho``.

    All statistics are taken over every element of the tensors passed in,
    so when used as a loss or metric the value is per batch.

    Parameters
    ----------
    y_true, y_pred : tensors
        Targets and predictions of matching shape.

    Returns
    -------
    tensor
        Scalar concordance value.
    """
    true_mean = K.mean(y_true)
    pred_mean = K.mean(y_pred)
    std_gt = K.std(y_true)
    std_predictions = K.std(y_pred)

    true_centered = y_true - true_mean
    pred_centered = y_pred - pred_mean

    rho = K.sum(true_centered * pred_centered) / (
        K.sqrt(K.sum(K.square(true_centered))) *
        K.sqrt(K.sum(K.square(pred_centered))) + EPS)

    return (2 * rho * std_gt * std_predictions) / (
        K.square(std_predictions) + K.square(std_gt) +
        K.square(pred_mean - true_mean) + EPS)

def my_ccc_loss(y_true, y_pred):
    """Loss form of the concordance: ``1 - my_ccc(y_true, y_pred)``."""
    concordance = my_ccc(y_true, y_pred)
    return 1 - concordance

labels = ['arousal', 'valence']

# Predictions of the best checkpoint for each label, keyed by label name.
val_preds_per_label = {}
ts_preds_per_label = {}


def _build_model(n_bow, n_fts, label, activation):
    """Build and compile the two-branch regression network for one label.

    The ``bow`` input goes through 10% dropout and a 10-unit dense layer
    (ReLU for arousal, otherwise ``activation``). The ``fts`` input goes
    through a 5-unit ReLU dense layer. Both branches are concatenated and fed
    to a single output unit using ``activation``. The model is trained with
    Adam on ``my_ccc_loss`` and reports ``my_ccc`` and ``my_ccc_loss``.
    """
    input_bow = Input((n_bow,), name='bow')
    input_fts = Input((n_fts,), name='fts')

    bow_branch = Dropout(0.10)(input_bow)
    bow_branch = Dense(
        10, activation='relu' if label == 'arousal' else activation)(bow_branch)

    fts_branch = Dense(5, activation='relu')(input_fts)

    merged = Concatenate()([bow_branch, fts_branch])
    output = Dense(1, activation=activation)(merged)

    model = Model(inputs=[input_bow, input_fts], outputs=[output])
    model.compile(optimizer='adam', metrics=[my_ccc, my_ccc_loss],
                  loss=my_ccc_loss)
    return model


for label in labels[::-1]:
    # presumably arousal lies in [0, 1] and valence in [-1, 1], hence the
    # sigmoid / tanh output activations -- TODO confirm against the dataset.
    activation = 'sigmoid' if label == 'arousal' else 'tanh'

    tr_y_l = tr_y[label].values
    val_y_l = val_y[label].values

    vec = TfidfVectorizer(ngram_range=(1, 3), min_df=2,
                          sublinear_tf=(label == 'arousal'))
    # NOTE(review): the vocabulary and IDF weights are fitted on the
    # validation + test transcripts rather than on the training text. No
    # labels are involved, but validation scores are optimistic as a
    # result -- confirm this is intentional.
    vec.fit(list(val_X_POS) + list(ts_X_POS))

    # Vectorise every split exactly once. toarray() yields a plain ndarray
    # (todense() returns the legacy np.matrix type), and feeding dense arrays
    # for both training and validation keeps the Keras inputs consistent.
    tr_bow = vec.transform(tr_X_POS).toarray()
    val_bow = vec.transform(val_X_POS).toarray()
    ts_bow = vec.transform(ts_X_POS).toarray()

    model = _build_model(tr_bow.shape[1], tr_vader.shape[1], label, activation)
    model.summary()

    checkpoint_fn = 'model-%s.h5' % label
    model.fit({'bow': tr_bow, 'fts': tr_vader}, tr_y_l,
              epochs=50,
              validation_data=({'bow': val_bow, 'fts': val_vader}, val_y_l),
              callbacks=[ModelCheckpoint(checkpoint_fn,
                                         monitor='val_my_ccc_loss',
                                         save_best_only=True, verbose=1)],
              verbose=0)

    # Restore the weights of the epoch with the lowest validation loss.
    model.load_weights(checkpoint_fn)

    # The model outputs shape (n, 1); flatten to (n,) for metrics and export.
    tr_preds = model.predict({'bow': tr_bow, 'fts': tr_vader}).reshape(-1)
    val_preds = model.predict({'bow': val_bow, 'fts': val_vader}).reshape(-1)
    ts_preds = model.predict({'bow': ts_bow, 'fts': ts_vader}).reshape(-1)

    val_preds_per_label[label] = val_preds
    ts_preds_per_label[label] = ts_preds

    # label, train CCC, train MSE, validation CCC, validation MSE
    print(label,
          ccc(tr_y_l, tr_preds)[0],
          metrics.mean_squared_error(tr_y_l, tr_preds),
          ccc(val_y_l, val_preds)[0],
          metrics.mean_squared_error(val_y_l, val_preds),
          )
    print()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
bow (InputLayer)                (None, 8977)         0                                            
__________________________________________________________________________________________________
dropout_107 (Dropout)           (None, 8977)         0           bow[0][0]                        
__________________________________________________________________________________________________
fts (InputLayer)                (None, 12)           0                                            
__________________________________________________________________________________________________
dense_316 (Dense)               (None, 10)           89780       dropout_107[0][0]                
__________________________________________________________________________________________________
dense_317 


Epoch 00029: val_my_ccc_loss did not improve

Epoch 00030: val_my_ccc_loss did not improve

Epoch 00031: val_my_ccc_loss did not improve

Epoch 00032: val_my_ccc_loss did not improve

Epoch 00033: val_my_ccc_loss did not improve

Epoch 00034: val_my_ccc_loss did not improve

Epoch 00035: val_my_ccc_loss did not improve

Epoch 00036: val_my_ccc_loss did not improve

Epoch 00037: val_my_ccc_loss did not improve

Epoch 00038: val_my_ccc_loss did not improve

Epoch 00039: val_my_ccc_loss did not improve

Epoch 00040: val_my_ccc_loss did not improve

Epoch 00041: val_my_ccc_loss did not improve

Epoch 00042: val_my_ccc_loss did not improve

Epoch 00043: val_my_ccc_loss did not improve

Epoch 00044: val_my_ccc_loss did not improve

Epoch 00045: val_my_ccc_loss did not improve

Epoch 00046: val_my_ccc_loss did not improve

Epoch 00047: val_my_ccc_loss did not improve

Epoch 00048: val_my_ccc_loss did not improve

Epoch 00049: val_my_ccc_loss did not improve

Epoch 00050: val_my_ccc_loss did 

In [115]:
def preds2csv(csv_fn, gt_file_fn, preds_arousal, preds_valence):
    """Write predictions alongside the ids taken from a reference CSV.

    Reads ``gt_file_fn``, copies its ``video`` and ``utterance`` columns,
    attaches the two prediction sequences as ``arousal`` and ``valence`` and
    writes the four columns, in that order, to ``csv_fn`` as comma-separated
    values without the index.
    """
    reference = pd.read_csv(gt_file_fn)
    frame = pd.DataFrame(
        {
            'video': reference['video'],
            'utterance': reference['utterance'],
            'arousal': preds_arousal,
            'valence': preds_valence,
        },
        columns=['video', 'utterance', 'arousal', 'valence'],
    )
    frame.to_csv(csv_fn, sep=',', index=False)

def _expand_to_mask(mask, preds):
    """Scatter ``preds`` into an array as long as ``mask``.

    Positions where ``mask`` is True receive the predictions in order; all
    other positions are NaN. The builtin ``float`` is used for the dtype
    because the ``np.float`` alias was removed in NumPy 1.24.
    """
    expanded = np.full(mask.shape[0], np.nan, dtype=float)
    expanded[mask] = preds
    return expanded


# Validation predictions exist only for rows that had a transcript; put them
# back at their original row positions.
valence_val_preds = _expand_to_mask(val_mask, val_preds_per_label['valence'])
arousal_val_preds = _expand_to_mask(val_mask, val_preds_per_label['arousal'])

# Test predictions are indexed directly with ts_mask, so they already have one
# entry per test row. Work on copies so that blanking the rows without a
# transcript does not mutate the arrays stored in ts_preds_per_label.
valence_ts_preds = ts_preds_per_label['valence'].copy()
valence_ts_preds[~ts_mask] = np.nan

arousal_ts_preds = ts_preds_per_label['arousal'].copy()
arousal_ts_preds[~ts_mask] = np.nan

preds2csv('val_preds_kelwin.csv', 'omg_ValidationVideos.csv', arousal_val_preds, valence_val_preds)
preds2csv('ts_preds_kelwin.csv', 'omg_TestVideos_WithoutLabels.csv', arousal_ts_preds, valence_ts_preds)