## Data Preprocessing

In [1]:
import numpy as np
from glob import glob
import librosa 
import matplotlib.pyplot as plt
import pandas as pd
import pylab
import librosa.display

In [2]:
# Mount Google Drive into the Colab runtime; every path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Training data preprocessing

In [3]:
# Folder on the mounted Drive that is searched for speech .wav files in the next cell.
speech_directory = '/content/drive/MyDrive/Mignot Lab Research/Experiments/one_sample/one-second-splits/speech'

In [4]:
# Collect the path of every .wav file directly inside the speech folder.
# NOTE(review): glob does not sort its results, so the slices taken from this
# list later depend on the order the filesystem happens to return.
speech_wav_pattern = speech_directory + '/*.wav'
speech_subset = glob(speech_wav_pattern)

In [5]:
# Sanity check: how many speech files were found.
print(len(speech_subset))

2487


In [6]:
# Class balancing: keep only the first 500 speech files so the speech class
# is the same size as the snore class (500 files, see the snore count below).
speech_subset = speech_subset[:500]

In [8]:
# Number of speech files that go into the training split (90 % of the subset).
train_num_speech = round(0.90 * len(speech_subset))
print(train_num_speech)

450


In [9]:
# Training split for speech: the leading train_num_speech paths of the subset.
speech_train = speech_subset[:train_num_speech]
print(len(speech_train))

450


In [10]:
# One label per speech training clip; class 0 = speech.
temp_speech_labels = [0] * len(speech_train)

Now repeat the same steps for the snore samples.

In [12]:
# Folder on the mounted Drive that is searched for snore .wav files in the next cell.
snore_directory = '/content/drive/MyDrive/Mignot Lab Research/Experiments/speech-vs-snore/1'

In [13]:
# Collect the path of every .wav file directly inside the snore folder.
# NOTE(review): glob does not sort its results, so the slices taken from this
# list later depend on the order the filesystem happens to return.
snore_wav_pattern = snore_directory + '/*.wav'
snore_subset = glob(snore_wav_pattern)

In [14]:
# Sanity check: how many snore files were found.
print(len(snore_subset))

500


In [15]:
# Number of snore files that go into the training split (90 % of the folder).
train_num_snore = round(0.90 * len(snore_subset))
print(train_num_snore)

450


In [16]:
# Training split for snore: the leading train_num_snore paths.
snore_train = snore_subset[:train_num_snore]
print(len(snore_train))

450


In [17]:
# One label per snore training clip; class 1 = snore (0 = speech).
temp_snore_labels = [1] * len(snore_train)

In [18]:
# All training file paths: speech clips first, then snore clips.
train_data_points_raw = [*speech_train, *snore_train]

In [19]:
# Labels in the same speech-then-snore order as the training paths, as a NumPy array.
train_labels = np.array(temp_speech_labels + temp_snore_labels)

In [20]:
# Sanity check: total number of training file paths (speech + snore).
len(train_data_points_raw)

900

In [21]:
# Sanity check: number of training labels; should equal the number of training paths.
len(train_labels)

900

### Pickle Training Labels

In [24]:
import pickle

# Persist the training label array to Drive.
# The context manager closes the file even if pickle.dump raises.
with open("/content/drive/MyDrive/Mignot Lab Research/Experiments/speech-vs-snore/train_labels.pkl", "wb") as out_file:
    pickle.dump(train_labels, out_file)

### Generate Mel Spectrograms

In [25]:
# Accumulator for the training mel spectrograms (filled by the loop in the next cell).
train = list()

In [26]:
# Compute one mel spectrogram per training clip and append it to `train`.
for wav_path in train_data_points_raw:
    # No `sr` is passed, so librosa.load uses its own default target sample rate.
    y, sr = librosa.load(wav_path)
    # 128 mel bands, capped at 8 kHz; the result is kept as returned
    # (no dB conversion is applied here).
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    train.append(S)

In [27]:
# Convert the list of per-clip spectrograms into a single NumPy array.
# NOTE(review): presumably relies on every clip producing a spectrogram of the
# same shape (i.e. all clips having equal length) — TODO confirm.
train = np.array(train)

In [28]:
# Persist the training spectrogram array to Drive.
# The context manager closes the file even if pickle.dump raises.
with open("/content/drive/MyDrive/Mignot Lab Research/Experiments/speech-vs-snore/train_samples.pkl", "wb") as out_file:
    pickle.dump(train, out_file)

---

### Test data preprocessing

In [40]:
# Same speech folder as used for the training split; the test clips are drawn
# from a different slice of its file list further down.
speech_directory = '/content/drive/MyDrive/Mignot Lab Research/Experiments/one_sample/one-second-splits/speech'

In [41]:
# List every .wav file in the speech folder again; the test slice is cut from this list.
# NOTE(review): glob does not sort its results, so this list is only guaranteed
# to line up with the one used for training if the filesystem returns the same order.
speech_test_pattern = speech_directory + '/*.wav'
speech_testset = glob(speech_test_pattern)

In [42]:
# Sanity check: number of speech files found (the full folder, before slicing).
print(len(speech_testset))

2487


In [43]:
# Hold out 50 speech clips for testing. Indices 500-549 lie outside the [:500]
# subset the training split was taken from.
# NOTE(review): this only avoids train/test overlap if glob returned the files
# in the same order both times — TODO confirm.
speech_testset = speech_testset[500:550]

In [44]:
# Sanity check: size of the speech test split.
print(len(speech_testset))

50


In [45]:
# One label per speech test clip; class 0 = speech.
test_speech_labels = [0] * len(speech_testset)

In [48]:
# Labels for the snore test clips; class 1 = snore.
# NOTE(review): the count is taken from speech_testset, not from the snore
# hold-out itself (which is only built further down), so this is correct only
# while both test splits contain the same number of clips.
test_snore_labels = [1] * len(speech_testset)

In [49]:
# Sanity check: number of snore test labels.
len(test_snore_labels)

50

In [50]:
# All test labels: speech labels first, then snore labels.
test_labels_official = [*test_speech_labels, *test_snore_labels]

In [51]:
# Convert the combined test label list to a NumPy array, mirroring train_labels.
test_labels = np.array(test_labels_official)

In [52]:
import pickle

# Persist the test label array to Drive.
# The context manager closes the file even if pickle.dump raises.
with open("/content/drive/MyDrive/Mignot Lab Research/Experiments/speech-vs-snore/test_labels.pkl", "wb") as out_file:
    pickle.dump(test_labels, out_file)

Now repeat the same steps for the snore test samples.

In [54]:
# Same snore folder as used for the training split.
snore_directory = '/content/drive/MyDrive/Mignot Lab Research/Experiments/speech-vs-snore/1'

In [55]:
# List every .wav file in the snore folder again; the test slice is cut from this list.
# NOTE(review): glob does not sort its results, so this list is only guaranteed
# to line up with the one used for training if the filesystem returns the same order.
snore_test_pattern = snore_directory + '/*.wav'
snore_subset = glob(snore_test_pattern)

In [56]:
# Snore test split: every file after the first 450.
# NOTE(review): 450 is hard-coded; it equals the printed train_num_snore only
# while the snore folder holds 500 files.
snore_test = snore_subset[450:]

In [58]:
# All test file paths: speech clips first, then snore clips — the same order
# in which the test labels were concatenated.
final_test = speech_testset + snore_test

# The labels were built (and pickled) before the snore hold-out existed, so
# verify here that there is exactly one label per test clip.
assert len(final_test) == len(test_labels), (
    f"{len(final_test)} test clips but {len(test_labels)} test labels"
)

In [60]:
# Accumulator for the test mel spectrograms (filled by the loop in the next cell).
test = list()

In [61]:
# Compute one mel spectrogram per test clip and append it to `test`,
# using the same parameters as for the training clips.
for wav_path in final_test:
    # No `sr` is passed, so librosa.load uses its own default target sample rate.
    y, sr = librosa.load(wav_path)
    # 128 mel bands, capped at 8 kHz; the result is kept as returned
    # (no dB conversion is applied here).
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    test.append(S)

In [62]:
# Convert the list of per-clip spectrograms into a single NumPy array.
# NOTE(review): presumably relies on every clip producing a spectrogram of the
# same shape (i.e. all clips having equal length) — TODO confirm.
test = np.array(test)

In [63]:
# Persist the test spectrogram array to Drive.
# The context manager closes the file even if pickle.dump raises.
with open("/content/drive/MyDrive/Mignot Lab Research/Experiments/speech-vs-snore/test_samples.pkl", "wb") as out_file:
    pickle.dump(test, out_file)