In [12]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
from fastai.vision import *
from IPython.display import display
pd.set_option('display.max_rows', 70)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', 1000)

Using the exported, trained model, I will predict the emotions based on the Audio clips where only Sanyam is the speaker, from most of the Episodes.

During extraction of the Audio clips, I made some changes compared to the images used for training the model. I removed any axes, ticks, and labels. And removed any possible margins around the image. I also saved these Mel Spectograms as `png`s instead of `jpeg`s like for the training dataset. 

Minor changes, but for now, this is just piecing things together as quickly as possible and learning along the way.

### Load Test Images

In [3]:
test_images = ImageList.from_folder("/notebooks/storage/ctds_data/audio_files_segments/")

In [11]:
print(test_images.items[2])

/notebooks/storage/ctds_data/audio_files_segments/44_49.png


### Load model

In [4]:
learn = load_learner("/notebooks", "model_export", test=test_images)

### Predictions

In [5]:
preds, y = learn.get_preds(ds_type=DatasetType.Test)

print(preds.shape)
print(preds[:3])

torch.Size([2523, 8])
tensor([[0.0403, 0.0248, 0.0032, 0.0102, 0.7061, 0.1849, 0.0248, 0.0056],
        [0.0450, 0.0080, 0.0035, 0.0071, 0.8305, 0.0239, 0.0754, 0.0065],
        [0.0368, 0.0711, 0.0043, 0.0189, 0.3675, 0.3903, 0.1078, 0.0032]])


In [6]:
labels = np.argmax(preds, 1)
print(labels.shape)
print(labels[:3])

torch.Size([2523])
tensor([4, 4, 5])


### Create DataFrame

I will create a new csv that stores the predictions corresponding to the clips. Columns for this csv -

- episode_num
- clip_num
- pred_num
- pred_label
- speaker

Using the existing transcripts - 

- I will also mark the missing values for the clips I didn't include for this as null. 
- Include timestamps for the clip

I am also adding `speaker` so that I can extend this csv file later when I use the rest of the audio clips.

In [60]:
episode_check = dict()
pattern = r"segments/(.*).png"
exp = re.compile(pattern)
for idx, item in enumerate(test_images.items):
    op = exp.findall(str(item))[0]
    ep = int(op.split("_")[0])
    clip = float(op.split("_")[1])
    if ep not in episode_check:
        episode_check[ep] = [[clip], [labels[idx]]]
    else:
        episode_check[ep][0].append(clip)
        episode_check[ep][1].append(labels[idx])

In [30]:
input_path = "/notebooks/storage/ctds_data/audio_files/"
path = os.listdir(input_path)
data_path = "/notebooks/storage/ctds_data/Cleaned Subtitles/"
episode_num = []
episode_clip = []
clip_start = []
for file in path:
    ep_num = int(file.split('.mp3')[0])

    # account for non-interview episodes and missing E4 subtitles and E46
    if (ep_num > 45 and ep_num < 55):
        continue
    elif (ep_num == 4):
        episode_num.append(ep_num)
        episode_clip.append(np.nan)
        clip_start.append(np.nan)
        continue
    elif ep_num > 54:
        ep_num = ep_num - 9
    episode_transcript = pd.read_csv(data_path + "E" + str(ep_num) + ".csv")
    
    count = 0
    for idx, time2 in enumerate(episode_transcript["Time"]):
        episode_num.append(ep_num)
        episode_clip.append(idx)
        clip_start.append(time2)

In [33]:
print(len(episode_num), len(episode_clip), len(clip_start))

7650 7650 7650


In [35]:
data = {"episode_num": episode_num, "episode_clip": episode_clip, "clip_start": clip_start}
predictions = pd.DataFrame(data=data)

In [38]:
predictions.sample(5)

Unnamed: 0,episode_num,episode_clip,clip_start
3304,75,68.0,32:35
2096,62,60.0,54:21
4506,5,52.0,27:18
5523,37,69.0,1:00:25
6815,35,52.0,21:29


In [61]:
def speaker(df):
    for key, val in episode_check.items():
        if (df["episode_num"] == key) and (df["episode_clip"] in val[0]):
            return "Sanyam Bhutani"
        
    return np.nan
            
predictions["speaker"] = predictions.apply(speaker, axis=1)

In [64]:
predictions['speaker'].sample(5)

5188               NaN
3350               NaN
6088    Sanyam Bhutani
5829    Sanyam Bhutani
1091               NaN
Name: speaker, dtype: object

In [95]:
def pred_num(df):
    for key, val in episode_check.items():
        if (df["episode_num"] == key):
            for idx, c in enumerate(val[0]):
                if df["episode_clip"] == c:
                    return int(val[1][idx].item())
        
    return np.nan

predictions["pred_num"] = predictions.apply(pred_num, axis=1)

In [97]:
predictions["pred_num"].sample(5)

1121    6.0
4178    NaN
331     NaN
1578    NaN
2711    4.0
Name: pred_num, dtype: float64

In [99]:
data_dict = {0.0: 'angry', 1.0: 'calm', 2.0: 'disgust', 3.0: 'fearful', 4.0: 'happy', 5.0: 'neutral', 6.0: 'sad', 7.0: 'surprised'}

def pred_label(row):
    if row and row in data_dict:
        return data_dict[row]
    return np.nan
        
predictions["pred_label"] = predictions["pred_num"].apply(pred_label)

In [104]:
predictions.sample(5)

Unnamed: 0,episode_num,episode_clip,clip_start,speaker,pred_num,pred_label
308,66,51.0,55:12,,,
1526,15,64.0,47:06,Sanyam Bhutani,4.0,happy
5535,18,0.0,0:13,,,
4070,36,30.0,13:20,Sanyam Bhutani,4.0,happy
2530,48,30.0,11:31,Sanyam Bhutani,6.0,sad


I think `predictions` is as ready as it can be for the time being. We can move onto analysis.

In [106]:
predictions.to_csv("/notebooks/storage/ctds_data/predictions.csv")