# Video analysis on the MOUD dataset

This notebook contains models that evaluate facial behaviors in videos from the MOUD dataset by processing features extracted with the OpenFace toolkit. LINK: https://github.com/TadasBaltrusaitis/OpenFace

In [1]:
# Glob patterns matching the train and test transcription CSV files.
# The data is separated in an 80-20 ratio and the test directory is untouched.
# NOTE(review): these are machine-specific absolute paths; adjust them before running elsewhere.
train_path = r"C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Datasets\MOUD\VideoReviews\transcriptions\train\*.csv"
test_path = r"C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Datasets\MOUD\VideoReviews\transcriptions\test\*.csv"

In [2]:
import glob
import os

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import sklearn

In [3]:
# function to merge the differently named speech / annotation columns into the canonical ones
def clean_moud(df_name):
    '''
    Consolidate the transcript and sentiment columns of a MOUD dataframe.

    Text found in the 'speech' and 'transcription' columns is concatenated onto
    'Speech'; values found in 'sentimentAnnotations' and 'sentimentannotations'
    are added onto 'sentimentAnnotation'. Missing text counts as '' and missing
    annotations count as 0. The canonical columns are created when absent.

    The frame is modified in place and is also returned. The alternate columns
    are left in the frame; callers select the columns they need afterwards.
    '''
    # TEXT: make sure the canonical column exists, then fold every alias into it
    if 'Speech' not in df_name.columns:
        df_name['Speech'] = ''
    for text_alias in ('speech', 'transcription'):
        if text_alias in df_name.columns:
            text_pair = df_name[['Speech', text_alias]].fillna('')
            df_name['Speech'] = text_pair.sum(axis=1)

    # ANNOTATION: same idea, with 0 standing in for a missing annotation
    if 'sentimentAnnotation' not in df_name.columns:
        df_name['sentimentAnnotation'] = 0
    for label_alias in ('sentimentAnnotations', 'sentimentannotations'):
        if label_alias in df_name.columns:
            label_pair = df_name[['sentimentAnnotation', label_alias]].fillna(0)
            df_name['sentimentAnnotation'] = label_pair.sum(axis=1)

    return df_name

In [4]:
# function to append all utterances to dataframe
def create_data_df(df_name, data_path, openface_dir=None):
    '''
    Build the utterance-level text dataframe and the matching video feature matrix.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame the transcription rows are appended to (normally empty).
    data_path : str
        Glob pattern matching the ';'-separated transcription CSV files.
    openface_dir : str, optional
        Directory containing the OpenFace output '<name>.mp4.csv' for each
        transcription file '<name>.csv'. Defaults to the folder that used to be
        hard-coded in this function.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        * Text dataframe with the two columns 'Speech' and 'sentimentAnnotation';
          utterances annotated 0 (neutral) are removed.
        * 2-D array with one row per remaining utterance, in the same order as
          the text dataframe. Each row holds the per-column means followed by
          the per-column standard deviations of the OpenFace rows whose
          timestamp lies within [#starttime, #endtime] of the utterance.

    Notes
    -----
    The 'frame', 'timestamp', 'confidence' and 'success' columns are excluded
    from the features. Earlier code called drop() without keeping its result,
    so these bookkeeping columns ended up in the feature vector. Rows are now
    collected in a list and stacked once, so the pre-made skeleton CSV (whose
    width had to match the feature vector) is no longer needed.
    '''
    if openface_dir is None:
        openface_dir = r"C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Datasets\MOUD\OpenFaceFeatures"

    # Bookkeeping columns of the OpenFace output that must not become features
    non_feature_cols = ['frame', 'timestamp', 'confidence', 'success']

    text_frames = [df_name]   # raw per-file transcription frames, concatenated once below
    video_rows = []           # one feature vector per non-neutral utterance

    for f in glob.glob(data_path):

        # TEXT
        # read the transcription once; the raw frame is kept for the text dataframe
        df_file = pd.read_csv(f, sep=';')
        text_frames.append(df_file)

        # VIDEO
        # The utterance times are only available here, before the text frame is
        # reduced to its two output columns. Clean a COPY: clean_moud works in
        # place, and cleaning the raw frame here and again after concatenation
        # would merge the alias columns into 'Speech' twice.
        df_utterances = clean_moud(df_file.copy())

        # Remove neutral annotations (same filter as applied to the text below,
        # which keeps text rows and video rows aligned)
        df_utterances = df_utterances.query('sentimentAnnotation != 0')

        # OpenFace features file corresponding to this transcription
        stem = os.path.basename(f).split(".")[0]
        v_name = os.path.join(openface_dir, stem + ".mp4.csv")
        df_v_name = pd.read_csv(v_name, sep=", ", engine="python")

        # Splitting the video data by utterances
        for starttime, endtime in zip(df_utterances['#starttime'], df_utterances['#endtime']):
            # timestamp is needed to select the window, so it is dropped only afterwards
            in_utterance = df_v_name['timestamp'].between(starttime, endtime)
            stats = (df_v_name[in_utterance]
                     .drop(non_feature_cols, axis=1)
                     .agg(['mean', 'std']))
            # NOTE(review): a window containing 0 or 1 video rows presumably yields
            # NaN statistics; confirm whether downstream models need these handled.
            # flatten to [mean of every column ..., std of every column ...]
            video_rows.append(stats.values.ravel())

    # TEXT
    # combine multiple speech, annotation columns to one
    df_name = pd.concat(text_frames, ignore_index=True)
    df_name = clean_moud(df_name)

    # Remove neutral annotations
    df_name = df_name.query('sentimentAnnotation != 0')

    df_name = df_name[['Speech', 'sentimentAnnotation']].reset_index(drop=True)

    video_features = np.vstack(video_rows) if video_rows else np.empty((0, 0))

    return df_name, video_features

In [5]:
# Build the cleaned text dataframe and the per-utterance video feature matrix
# for each split, starting each one from an empty dataframe
df, v_train = create_data_df(pd.DataFrame(), train_path)
df_t, v_test = create_data_df(pd.DataFrame(), test_path)

# CSR versions of the video features (the form that is later stacked with the tf-idf matrix)
v_train_sparse = scipy.sparse.csr_matrix(v_train)
v_test_sparse = scipy.sparse.csr_matrix(v_test)
df.head()

Unnamed: 0,Speech,sentimentAnnotation
0,yo habia visto resenas que decian que picaba c...,-1.0
1,y la verdad es que si la use una vez y t- y te...,-1.0
2,y dije no: puede ser posible tanto la deseaba ...,-1.0
3,esta tambien tira un poquito de pelo pero haga...,-1.0
4,pero igual con las lavadas se ha dejado de tir...,1.0


### Data cleaning and text preprocessing

This section preprocesses the utterance text so that it can be used in the text+video analysis.

In [None]:
# from https://www.kaggle.com/c/word2vec-nlp-tutorial/
import re
from bs4 import BeautifulSoup
import nltk

# execute the following commented step to install the data packages if you don't already have it  
# nltk.download()

from nltk.corpus import stopwords

#using text translation API
from watson_developer_cloud import LanguageTranslatorV2
# Credentials are read from the environment so they never have to be typed into
# (and saved with) the notebook. When the variables are not set, the empty
# placeholders used before are passed unchanged.
language_translator = LanguageTranslatorV2(
    username=os.environ.get("WATSON_TRANSLATOR_USERNAME", ""),
    password=os.environ.get("WATSON_TRANSLATOR_PASSWORD", ""))

# reusable function to convert raw speech to preprocessed
def utterance_to_words(raw_utterance):
    '''
    Turn one raw Spanish utterance into a cleaned, space-separated English word string.

    Steps: strip HTML, translate es -> en, keep letters only, lower-case,
    split into words, remove English stop words, join with single spaces.

    Earlier code requested the translation but then cleaned the untranslated
    text, so English stop words were removed from Spanish input and the
    translation was thrown away. The translated text is now what gets cleaned.
    '''
    # 1. Removing any HTML elements
    utterance_text = BeautifulSoup(raw_utterance, "lxml").get_text()
    # TRANSLATION
    response = language_translator.translate(utterance_text, source='es', target='en')
    # NOTE(review): the type returned by translate() depends on the installed
    # watson_developer_cloud version. The branches below cover an object exposing
    # get_result(), a dict carrying a 'translations' list, and a plain string.
    # Confirm against the SDK version actually in use.
    if hasattr(response, 'get_result'):
        response = response.get_result()
    if isinstance(response, dict):
        translated_utterance = " ".join(
            item['translation'] for item in response.get('translations', []))
    else:
        translated_utterance = str(response)
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", translated_utterance)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. convert the stop words to a set (fast membership tests)
    stops = set(stopwords.words("english"))
    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    # 6. Join the words back into one string separated by space, and return the result.
    return " ".join(meaningful_words)

# preprocess every utterance of both splits; the 'Speech' column is overwritten,
# so this cell must not be run twice on the same dataframes
df['Speech'] = df['Speech'].apply(utterance_to_words)
df_t['Speech'] = df_t['Speech'].apply(utterance_to_words)

In [None]:
# The train/test division comes from the separate train and test transcription
# directories, so no random split is made here. (An earlier stratified 80% - 20%
# train_test_split of `df` with random_state=42 is no longer used.)
X_trn = df[['Speech']]
y_trn = df[['sentimentAnnotation']]
X_tst = df_t[['Speech']]
y_tst = df_t[['sentimentAnnotation']]

### Utterance level video-ONLY analysis

The following 'video-only analysis' code is placed here because the video features are split into utterances using the timestamps from the text dataset.
This section analyses only the extracted video features.

The next section, 'Machine learning', combines the video and text features by stacking them (the 'early fusion' method). (See section 6.1 of https://arxiv.org/pdf/1705.09406.pdf)

In [None]:
from sklearn import svm

# Linear-kernel SVM on the video-only features: build the estimator,
# then fit it on the training annotations
model_tf_v = svm.SVC(kernel='linear', C=1, gamma=1)
model_tf_v.fit(v_train_sparse, y_trn['sentimentAnnotation'].values)

# predict the sentiment of the test utterances
predicted_tf_v = model_tf_v.predict(v_test_sparse)

In [None]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_tf_v))

# create df to show results: true annotation, prediction, and whether they agree
disp = y_tst.reset_index(drop=True).join(pd.DataFrame(predicted_tf_v, columns=['Prediction']))
disp['Right/Wrong'] = disp['sentimentAnnotation'] == disp['Prediction']

# score() yields a single accuracy value for the test set, so there is no
# spread to report; calling .mean()/.std() on it printed a meaningless
# "+/- 0.00" and fails outright when the value is a plain Python float.
scores = model_tf_v.score(v_test_sparse, y_tst['sentimentAnnotation'].values)
print("Accuracy: %0.2f" % scores)
print("Mean sentiment: {!r}. Predicted mean sentiment: {!r}.".format(
    'Positive' if disp['sentimentAnnotation'].mean()>=0 else 'Negative',
    'Positive' if disp['Prediction'].mean()>=0 else 'Negative'))
disp.head()

In [None]:
# 10-fold cross validation of a fresh linear SVM on the training video features
from sklearn.model_selection import cross_val_score

clf_cv = svm.SVC(kernel='linear', C=1, gamma=1)
scores = cross_val_score(
    clf_cv,
    v_train_sparse,
    y_trn['sentimentAnnotation'].values,
    cv=10,
)
# mean fold accuracy and two standard deviations across the folds
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the video-only features
model_lr = LogisticRegression()
model_lr.fit(v_train_sparse, y_trn['sentimentAnnotation'].values)

# predictions for the test utterances, followed by the classification report
predicted_lr = model_lr.predict(v_test_sparse)
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_lr))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree on the video-only features. A fixed random_state makes the
# fitted tree (and therefore the report below) reproducible between runs.
model_dt = DecisionTreeClassifier(random_state=42).fit(v_train_sparse,y_trn['sentimentAnnotation'].values)
# generate predictions
predicted_dt = model_dt.predict(v_test_sparse)
# Classification report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_dt))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the video-only features. A fixed random_state makes the
# fitted forest (and therefore the report below) reproducible between runs.
model_rf = RandomForestClassifier(random_state=42).fit(v_train_sparse,y_trn['sentimentAnnotation'].values)
# generate predictions
predicted_rf = model_rf.predict(v_test_sparse)
# Classification report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_rf))

In [None]:
# Classification reports of the four video-only models, each printed under the
# model's name so the reports can be told apart
from sklearn.metrics import classification_report
for model_label, model_predictions in [
        ("SVM (linear kernel)", predicted_tf_v),
        ("Logistic regression", predicted_lr),
        ("Decision tree", predicted_dt),
        ("Random forest", predicted_rf)]:
    print(model_label)
    print(classification_report(y_tst['sentimentAnnotation'].values, model_predictions))

### Machine learning

In [None]:
# Bag-of-words vectorizer: word-level tokens, lower-cased, vocabulary capped at
# 5000 entries, no custom tokenizer / preprocessor / stop-word list
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    lowercase=True,
    max_features=5000,
)

# learn the vocabulary from the training utterances and build their count matrix
train_data_features = vectorizer.fit_transform(X_trn['Speech'].values)

In [None]:
# Count matrix of the test utterances, built with the vectorizer fitted on the
# training utterances (transform only; the vectorizer is not refitted here)
test_data_features = vectorizer.transform(X_tst['Speech'].values)

In [None]:
# tf-idf weighting of the bag-of-words counts
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

# fit the weighting on the training counts and transform them in one step ...
X_train_tfidf = tfidf_transformer.fit_transform(train_data_features)
# ... then apply the already fitted weighting to the test counts
X_test_tfidf = tfidf_transformer.transform(test_data_features)

In [None]:
# Early fusion: place the video feature columns to the right of the text
# tf-idf columns, utterance by utterance
train_data_features_v = scipy.sparse.hstack((X_train_tfidf, v_train_sparse))
test_data_features_v = scipy.sparse.hstack((X_test_tfidf, v_test_sparse))

In [None]:
# SVM model creation and fitting train vector (text + video) to annotations
from sklearn import svm
model_tf = svm.SVC(kernel='linear', C=1, gamma=1).fit(train_data_features_v,y_trn['sentimentAnnotation'].values)

# generate predictions
predicted_tf = model_tf.predict(test_data_features_v)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_tf))

# create df to show results: utterance, true annotation, prediction, and whether they agree
disp = X_tst.join(y_tst).reset_index(drop=True).join(pd.DataFrame(predicted_tf,columns=['Prediction']))
disp['Right/Wrong'] = disp['sentimentAnnotation'] == disp['Prediction']

# score() yields a single accuracy value for the test set, so there is no
# spread to report; calling .mean()/.std() on it printed a meaningless
# "+/- 0.00" and fails outright when the value is a plain Python float.
scores = model_tf.score(test_data_features_v,y_tst['sentimentAnnotation'].values)
print("Accuracy: %0.2f" % scores)
print("Mean sentiment: {!r}. Predicted mean sentiment: {!r}.".format(
    'Positive' if disp['sentimentAnnotation'].mean()>=0 else 'Negative',
    'Positive' if disp['Prediction'].mean()>=0 else 'Negative'))
disp.head()

In [None]:
# 10-fold cross validation of a fresh linear SVM on the stacked text + video training features
from sklearn.model_selection import cross_val_score

clf_cv = svm.SVC(kernel='linear', C=1, gamma=1)
scores = cross_val_score(
    clf_cv,
    train_data_features_v,
    y_trn['sentimentAnnotation'].values,
    cv=10,
)
# mean fold accuracy and two standard deviations across the folds
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Logistic Regression

In [None]:
# LR model creation and fitting train vector (text + video) to annotations
# (imported here as well so the cell does not depend on the video-only section's import)
from sklearn.linear_model import LogisticRegression
model_tf = LogisticRegression().fit(train_data_features_v,y_trn['sentimentAnnotation'].values)
# generate predictions
predicted_tf = model_tf.predict(test_data_features_v)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_tf))

# create df to show results: utterance, true annotation, prediction, and whether they agree
disp = X_tst.join(y_tst).reset_index(drop=True).join(pd.DataFrame(predicted_tf,columns=['Prediction']))
disp['Right/Wrong'] = disp['sentimentAnnotation'] == disp['Prediction']

# score() yields a single accuracy value for the test set, so there is no
# spread to report; calling .mean()/.std() on it printed a meaningless
# "+/- 0.00" and fails outright when the value is a plain Python float.
scores = model_tf.score(test_data_features_v,y_tst['sentimentAnnotation'].values)
print("Accuracy: %0.2f" % scores)
print("Mean sentiment: {!r}. Predicted mean sentiment: {!r}.".format(
    'Positive' if disp['sentimentAnnotation'].mean()>=0 else 'Negative',
    'Positive' if disp['Prediction'].mean()>=0 else 'Negative'))
disp.head()

In [None]:
# 10-fold cross validation of a linear SVM on the text-only tf-idf training features
# NOTE(review): this cell sits in the "Logistic Regression" part of the notebook but
# evaluates an SVC on X_train_tfidf (no video columns). Confirm whether a text-only
# SVM baseline is intended or whether it should cross-validate the logistic
# regression on train_data_features_v.
from sklearn.model_selection import cross_val_score

clf_cv = svm.SVC(kernel='linear', C=1, gamma=1)
scores = cross_val_score(
    clf_cv,
    X_train_tfidf,
    y_trn['sentimentAnnotation'].values,
    cv=10,
)
# mean fold accuracy and two standard deviations across the folds
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))