In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
from fastai.vision import *
from IPython.display import display
pd.set_option('display.max_rows', 70)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', 1000)

Using the exported, trained model, I will predict the emotions based on the Audio clips where only Sanyam is the speaker, from most of the Episodes.

During extraction of the Audio clips, I made some changes compared to the images used for training the model. I removed any axes, ticks, and labels. And removed any possible margins around the image. I also saved these Mel Spectograms as `png`s instead of `jpeg`s like for the training dataset. 

Minor changes, but for now, this is just piecing things together as quickly as possible and learning along the way.

### Load Test Images

In [3]:
test_images = ImageList.from_folder("/notebooks/storage/ctds_data/audio_files_segments/")

In [11]:
print(test_images.items[2])

/notebooks/storage/ctds_data/audio_files_segments/44_49.png


### Load model

In [4]:
learn = load_learner("/notebooks", "model_export", test=test_images)

### Predictions

In [5]:
preds, y = learn.get_preds(ds_type=DatasetType.Test)

print(preds.shape)
print(preds[:3])

torch.Size([2523, 8])
tensor([[0.0403, 0.0248, 0.0032, 0.0102, 0.7061, 0.1849, 0.0248, 0.0056],
        [0.0450, 0.0080, 0.0035, 0.0071, 0.8305, 0.0239, 0.0754, 0.0065],
        [0.0368, 0.0711, 0.0043, 0.0189, 0.3675, 0.3903, 0.1078, 0.0032]])


In [6]:
labels = np.argmax(preds, 1)
print(labels.shape)
print(labels[:3])

torch.Size([2523])
tensor([4, 4, 5])


### Create DataFrame

I will create a new csv that stores the predictions corresponding to the clips. Columns for this csv -

- episode_num
- clip_num
- pred_num
- pred_label
- speaker

Using the existing transcripts - 

- I will also mark the missing values for the clips I didn't include for this as null. 
- Include timestamps for the clip

I am also adding `speaker` so that I can extend this csv file later when I use the rest of the audio clips.

In [60]:
episode_check = dict()
pattern = r"segments/(.*).png"
exp = re.compile(pattern)
for idx, item in enumerate(test_images.items):
    op = exp.findall(str(item))[0]
    ep = int(op.split("_")[0])
    clip = float(op.split("_")[1])
    if ep not in episode_check:
        episode_check[ep] = [[clip], [labels[idx]]]
    else:
        episode_check[ep][0].append(clip)
        episode_check[ep][1].append(labels[idx])

In [30]:
input_path = "/notebooks/storage/ctds_data/audio_files/"
path = os.listdir(input_path)
data_path = "/notebooks/storage/ctds_data/Cleaned Subtitles/"
episode_num = []
episode_clip = []
clip_start = []
for file in path:
    ep_num = int(file.split('.mp3')[0])

    # account for non-interview episodes and missing E4 subtitles and E46
    if (ep_num > 45 and ep_num < 55):
        continue
    elif (ep_num == 4):
        episode_num.append(ep_num)
        episode_clip.append(np.nan)
        clip_start.append(np.nan)
        continue
    elif ep_num > 54:
        ep_num = ep_num - 9
    episode_transcript = pd.read_csv(data_path + "E" + str(ep_num) + ".csv")
    
    count = 0
    for idx, time2 in enumerate(episode_transcript["Time"]):
        episode_num.append(ep_num)
        episode_clip.append(idx)
        clip_start.append(time2)

In [33]:
print(len(episode_num), len(episode_clip), len(clip_start))

7650 7650 7650


In [35]:
data = {"episode_num": episode_num, "episode_clip": episode_clip, "clip_start": clip_start}
predictions = pd.DataFrame(data=data)

In [38]:
predictions.sample(5)

Unnamed: 0,episode_num,episode_clip,clip_start
3304,75,68.0,32:35
2096,62,60.0,54:21
4506,5,52.0,27:18
5523,37,69.0,1:00:25
6815,35,52.0,21:29


In [61]:
def speaker(df):
    for key, val in episode_check.items():
        if (df["episode_num"] == key) and (df["episode_clip"] in val[0]):
            return "Sanyam Bhutani"
        
    return np.nan
            
predictions["speaker"] = predictions.apply(speaker, axis=1)

In [64]:
predictions['speaker'].sample(5)

5188               NaN
3350               NaN
6088    Sanyam Bhutani
5829    Sanyam Bhutani
1091               NaN
Name: speaker, dtype: object

In [95]:
def pred_num(df):
    for key, val in episode_check.items():
        if (df["episode_num"] == key):
            for idx, c in enumerate(val[0]):
                if df["episode_clip"] == c:
                    return int(val[1][idx].item())
        
    return np.nan

predictions["pred_num"] = predictions.apply(pred_num, axis=1)

In [97]:
predictions["pred_num"].sample(5)

1121    6.0
4178    NaN
331     NaN
1578    NaN
2711    4.0
Name: pred_num, dtype: float64

In [99]:
data_dict = {0.0: 'angry', 1.0: 'calm', 2.0: 'disgust', 3.0: 'fearful', 4.0: 'happy', 5.0: 'neutral', 6.0: 'sad', 7.0: 'surprised'}

def pred_label(row):
    if row and row in data_dict:
        return data_dict[row]
    return np.nan
        
predictions["pred_label"] = predictions["pred_num"].apply(pred_label)

In [104]:
predictions.sample(5)

Unnamed: 0,episode_num,episode_clip,clip_start,speaker,pred_num,pred_label
308,66,51.0,55:12,,,
1526,15,64.0,47:06,Sanyam Bhutani,4.0,happy
5535,18,0.0,0:13,,,
4070,36,30.0,13:20,Sanyam Bhutani,4.0,happy
2530,48,30.0,11:31,Sanyam Bhutani,6.0,sad


I think `predictions` is as ready as it can be for the time being. We can move onto analysis.

In [106]:
predictions.to_csv("/notebooks/storage/ctds_data/predictions.csv")

## Prediction for Guests

I managed to extract the data corresponding to the guests as well. I will extend the above for them.

### Load data

In [2]:
guest_images = ImageList.from_folder("/notebooks/storage/ctds_data/audio_files_segments_guest/")
print(guest_images.items[2])

/notebooks/storage/ctds_data/audio_files_segments_guest/25_136.png


### Load Model

In [3]:
learn = load_learner("/notebooks", "model_export", test=guest_images)

### Predictions

In [4]:
preds, y = learn.get_preds(ds_type=DatasetType.Test)

print(preds.shape)
print(preds[:3])

torch.Size([3366, 8])
tensor([[4.4624e-02, 1.0234e-02, 1.4024e-02, 2.4906e-02, 2.7836e-01, 1.0106e-01,
         5.1932e-01, 7.4751e-03],
        [2.0837e-01, 2.5329e-03, 2.1087e-03, 2.2924e-03, 6.6199e-01, 1.1084e-01,
         1.1328e-02, 5.4082e-04],
        [2.9459e-02, 4.8490e-03, 1.5461e-03, 6.3262e-03, 5.2660e-01, 3.3569e-01,
         8.7672e-02, 7.8569e-03]])


In [5]:
labels = np.argmax(preds, 1)
print(labels.shape)
print(labels[:3])

torch.Size([3366])
tensor([6, 4, 4])


### Modify Dataframe

I already have the data corresponding to Sanyam's audio clips, so I will append these new values to that same dataframe.

In [18]:
predictions = pd.read_csv("/notebooks/storage/ctds_data/predictions.csv")
predictions.sample(5)

Unnamed: 0.1,Unnamed: 0,episode_num,episode_clip,clip_start,speaker,pred_num,pred_label
3580,3580,24,66.0,55:44,Sanyam Bhutani,0.0,
4120,4120,36,80.0,26:59,,,
983,983,33,103.0,52:31,,,
2975,2975,19,39.0,11:23,,,
6175,6175,72,56.0,46:35,Sanyam Bhutani,1.0,calm


Since I didn't future-proof my previous functions, I will modify and use them again for this.

In [67]:
input_path = "/notebooks/storage/ctds_data/audio_files/"
path = os.listdir(input_path)
data_path = "/notebooks/storage/ctds_data/Cleaned Subtitles/"
episode_check = dict()
for file in path:
    ep_num = int(file.split('.mp3')[0])

    # account for non-interview episodes and missing E4 subtitles and E46
    if (ep_num > 45 and ep_num < 55):
        continue
    elif (ep_num == 4):
        if ep_num not in episode_check:
            episode_check[ep_num] = [np.nan, np.nan]
        else:
            episode_check[ep_num].append([np.nan, np.nan])
        continue
    elif ep_num > 54:
        ep_num = ep_num - 9
    episode_transcript = pd.read_csv(data_path + "E" + str(ep_num) + ".csv")
    
    count = 0
    for idx, row in episode_transcript.iterrows(): 
        if ep_num not in episode_check:
            episode_check[ep_num] = [[idx, row["Speaker"]]]
        else:
            episode_check[ep_num].append([idx, row["Speaker"]])

In [100]:
episode_check_labels = dict()
pattern = r"segments_guest/(.*).png"
exp = re.compile(pattern)
for idx, item in enumerate(guest_images.items):
    op = exp.findall(str(item))[0]
    ep = int(op.split("_")[0])
    clip = float(op.split("_")[1])
    if ep not in episode_check_labels:
        episode_check_labels[ep] = [[clip], [labels[idx]]]
    else:
        episode_check_labels[ep][0].append(clip)
        episode_check_labels[ep][1].append(labels[idx])

In [95]:
def speaker(df):
    if df["episode_num"] == 4:
        return np.nan
    
    for key, val in episode_check.items():
        if (int(df["episode_num"]) == key):
                for v in val:
                    if v[0] == df["episode_clip"]:
                        return v[1]
            
predictions["speaker"] = predictions.apply(speaker, axis=1)

In [96]:
predictions.sample(5)

Unnamed: 0.1,Unnamed: 0,episode_num,episode_clip,clip_start,speaker,pred_num,pred_label
6154,6154,72,35.0,33:19,Andreas Mueller,,
633,633,45,55.0,40:43,Marios,,
1672,1672,53,22.0,21:20,Sanyam Bhutani,4.0,happy
1339,1339,47,33.0,25:29,John Miller,,
2880,2880,60,68.0,51:09,Sanyam Bhutani,6.0,sad


In [132]:
def pred_num(df):
    if np.isnan(df['pred_num']):
        for key, val in episode_check_labels.items():
            if (df["episode_num"] == key):
                for idx, c in enumerate(val[0]):
                    if df["episode_clip"] == c:
                        return int(val[1][idx].item())
    return df["pred_num"]


predictions["pred_num"] = predictions.apply(pred_num, axis=1)
predictions.head()

Unnamed: 0.1,Unnamed: 0,episode_num,episode_clip,clip_start,speaker,pred_num,pred_label
0,0,29,0.0,0:13,Sanyam Bhutani,,
1,1,29,1.0,3:42,Sanyam Bhutani,4.0,happy
2,2,29,2.0,3:49,Eugene Khvedchenya,4.0,
3,3,29,3.0,3:53,Sanyam Bhutani,4.0,happy
4,4,29,4.0,4:08,Eugene Khvedchenya,4.0,


In [133]:
data_dict = {0.0: 'angry', 1.0: 'calm', 2.0: 'disgust', 3.0: 'fearful', 4.0: 'happy', 5.0: 'neutral', 6.0: 'sad', 7.0: 'surprised'}

def pred_label(row):
    if row and row in data_dict:
        return data_dict[row]
    return np.nan
        
predictions["pred_label"] = predictions["pred_num"].apply(pred_label)
predictions.sample(5)

Unnamed: 0.1,Unnamed: 0,episode_num,episode_clip,clip_start,speaker,pred_num,pred_label
3371,3371,43,58.0,27:33,Sanyam Bhutani,4.0,happy
2910,2910,69,27.0,38:53,Sanyam Bhutani,4.0,happy
4168,4168,36,128.0,43:16,Jason Antic,6.0,sad
6279,6279,14,76.0,28:58,Pierre Stock,6.0,sad
1437,1437,22,72.0,48:26,Sanyam Bhutani,,


There are some discrepancies with the predictions where it seems prediction should NOT be NaN. But we will look into that later. We will clean up `predictions` before saving it.

- Remove Unnamed: 0 column
- Sort by episode_num and episode_clip

In [136]:
predictions = predictions.drop(columns=["Unnamed: 0"])
predictions.head(1)

Unnamed: 0,episode_num,episode_clip,clip_start,speaker,pred_num,pred_label
0,29,0.0,0:13,Sanyam Bhutani,,


In [142]:
predictions = predictions.sort_values(by=["episode_num", "episode_clip"]).reset_index(drop=True)
predictions.head()

Unnamed: 0,episode_num,episode_clip,clip_start,speaker,pred_num,pred_label
0,1,0.0,0:13,Sanyam Bhutani,,
1,1,1.0,1:49,Abhishek Thakur,7.0,surprised
2,1,2.0,1:53,Sanyam Bhutani,4.0,happy
3,1,3.0,2:12,Abhishek Thakur,5.0,neutral
4,1,4.0,2:41,Sanyam Bhutani,4.0,happy


In [143]:
predictions.to_csv("/notebooks/storage/ctds_data/predictions.csv")