Details and Resources of the project in the doc:                                
https://docs.google.com/document/d/1F-KYQ5nRnDAUVDUrEbTFYGn7TEVsT8EByuM9xJLT4gA/edit

In [1]:
import warnings                        # To ignore any warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%pylab inline
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display
import glob 
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

import seaborn as sns; sns.set()
import scipy.io as sio
# descriptive statistics
import scipy as sp
import scipy.stats as stats
import pywt

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Mount Google Drive (Colab only) so the files under /content/drive used below are reachable.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:

# Root folder of the dataset on the mounted Google Drive.
INPUT_DIR = '/content/drive/MyDrive/AI ML Things/University of Turku Research Internship'
# NOTE: the string literal below is disabled legacy path setup. It is evaluated and
# discarded, so it has no effect (`base_path` is not defined in the visible cells).
'''
set_a_path=base_path+'/set_a'
set_a_metadata_path=base_path+'/set_a.csv'
set_b_path=base_path+'/set_b'
set_b_metadata_path=base_path+'/set_b.csv'
set_a_metadata = pd.read_csv(set_a_metadata_path)
'''
# Folder and metadata table of set A.
dataset_path = INPUT_DIR+'/set_a'
metadata = pd.read_csv(INPUT_DIR+'/set_a.csv')

# Sample rate in samples per second; same value as the `sr` default of load_file_data.
SAMPLE_RATE = 16000
# Maximum clip length, in seconds (passed as `duration` when loading audio).
MAX_SOUND_CLIP_DURATION=12  

#Explore data

In [4]:
# Metadata of set A: one row per recording (dataset, fname, label, sublabel).
set_a=pd.read_csv(INPUT_DIR+"/set_a.csv")
set_a.head()

Unnamed: 0,dataset,fname,label,sublabel
0,a,set_a/artifact__201012172012.wav,artifact,
1,a,set_a/artifact__201105040918.wav,artifact,
2,a,set_a/artifact__201105041959.wav,artifact,
3,a,set_a/artifact__201105051017.wav,artifact,
4,a,set_a/artifact__201105060108.wav,artifact,


In [5]:
# Timing annotations of set A: per file and cycle, the sound name (S1/S2) and its location.
set_a_timing=pd.read_csv(INPUT_DIR+"/set_a_timing.csv")
set_a_timing.head()

Unnamed: 0,fname,cycle,sound,location
0,set_a/normal__201102081321.wav,1,S1,10021
1,set_a/normal__201102081321.wav,1,S2,20759
2,set_a/normal__201102081321.wav,2,S1,35075
3,set_a/normal__201102081321.wav,2,S2,47244
4,set_a/normal__201102081321.wav,3,S1,62992


In [6]:
# Metadata of set B, same columns as set A.
set_b=pd.read_csv(INPUT_DIR+"/set_b.csv")
set_b.head()

Unnamed: 0,dataset,fname,label,sublabel
0,b,set_b/Btraining_extrastole_127_1306764300147_C...,extrastole,
1,b,set_b/Btraining_extrastole_128_1306344005749_A...,extrastole,
2,b,set_b/Btraining_extrastole_130_1306347376079_D...,extrastole,
3,b,set_b/Btraining_extrastole_134_1306428161797_C...,extrastole,
4,b,set_b/Btraining_extrastole_138_1306762146980_B...,extrastole,


In [7]:
# Stack the set A and set B metadata into one table.
# pd.concat keeps each frame's own row index, so index values repeat across the two sets.
frames = [set_a, set_b]
train_ab=pd.concat(frames)
train_ab.describe()

Unnamed: 0,dataset,fname,label,sublabel
count,832,832,585,149
unique,2,832,5,2
top,b,set_b/Btraining_normal_180_1307990956284_C.wav,normal,noisynormal
freq,656,1,351,120


In [8]:
# Distinct values of the label column. NaN is included in this array (and in the
# printed count); the name is historical: it holds the labels, not their number.
nb_classes = train_ab.label.unique()

print("Number of training examples=", train_ab.shape[0], "  Number of classes=", len(nb_classes))
print(nb_classes)

Number of training examples= 832   Number of classes= 6
['artifact' 'extrahls' 'murmur' 'normal' nan 'extrastole']


Note: 'nan' indicates unclassified, unlabelled test files.

#Extracting Features of Data in Audio Domain

#Loading Data

Note: murmur_noisymurmur_293_1311680805936_D.wav in set_b is corrupted, so it was removed from the dataset.

In [29]:
# Re-print the number of rows and of distinct label values (NaN counts as one value).
print("Number of training examples=", train_ab.shape[0], "  Number of classes=", len(train_ab.label.unique()))

Number of training examples= 832   Number of classes= 6


In [30]:
# get audio data with a fixed length: short clips are zero-padded, longer ones are cut
def load_file_data (folder,file_names, duration=12, sr=16000):
    """Load audio clips and force every clip to exactly ``sr * duration`` samples.

    Parameters
    ----------
    folder : str
        Directory prefix. It is concatenated with each file name as-is, so it
        must end with a path separator.
    file_names : iterable of str
        Names of the files (relative to ``folder``) to load.
    duration : int or float, default 12
        Target clip length in seconds; at most this much audio is read.
    sr : int, default 16000
        Sample rate the audio is resampled to while loading.

    Returns
    -------
    list
        One fixed-length signal per file that could be loaded, in input order.
        Files that raise while loading are reported on stdout and skipped, so
        the result may be shorter than ``file_names``.
    """
    input_length = int(sr * duration)
    data = []
    for file_name in file_names:
        try:
            sound_file = folder + file_name
            print ("load file ",sound_file)
            # use kaiser_fast technique for faster extraction
            X, _ = librosa.load(sound_file, sr=sr, duration=duration, res_type='kaiser_fast')

            if len(X) < input_length:
                print ("fixing audio lenght :", file_name)
            # Always normalise the length (pad or trim). The previous version only
            # assigned the padded signal for short clips, so a full-length clip
            # re-appended the preceding clip (or was dropped on the first file).
            fixed = librosa.util.fix_length(X, size=input_length)
        except Exception as e:
            # Best-effort loading: report the failure with its cause and move on.
            print("Error encountered while parsing file: ", file_name, "-", e)
            continue
        data.append(fixed)

    return data
                        


In [31]:
def wpd_features(data,no_wpd_features):
  """Compute statistical features from a db1 wavelet packet decomposition.

  For every signal (row of ``data``) a ``pywt.WaveletPacket`` with wavelet
  'db1', mode 'symmetric' and maxlevel 6 is built, and the coefficients of the
  12 nodes 'a', 'aa', ..., 'aaaaaa', 'd', 'dd', ..., 'dddddd' are summarised.

  Column layout of the result (node order as listed above):
     0-11  mean of absolute values
    12-23  standard deviation
    24-35  median
    36-47  skewness (scipy.stats.skew)
    48-59  kurtosis (scipy.stats.kurtosis)
    60-71  root mean square
    72-82  ratio of the mean absolute value of each node to that of the next
           node in the list (so column 77 is 'aaaaaa' / 'd')

  Parameters
  ----------
  data : sequence of equal-length 1-D signals (anything ``np.asarray`` turns
      into a 2-D array); an empty sequence is allowed.
  no_wpd_features : int
      Number of columns of the returned array. 83 yields the full feature
      set; a larger value leaves the extra columns at 0, a smaller value keeps
      only the first ``no_wpd_features`` features.

  Returns
  -------
  np.ndarray of float, shape (len(data), no_wpd_features), Fortran order.
  """
  arr = np.asarray(data)
  db1 = pywt.Wavelet('db1')

  # Approximation-only and detail-only paths, level 1 to 6.
  node_paths = ['a' * level for level in range(1, 7)] + ['d' * level for level in range(1, 7)]

  numrows = arr.shape[0]
  numcols = no_wpd_features

  # Zero-initialised so that columns beyond the computed features are defined.
  Extracted_Features = np.zeros((numrows, numcols), dtype=float, order='F')
  for i in range(numrows):
    wp = pywt.WaveletPacket(arr[i,:], db1, mode='symmetric', maxlevel=6)
    coeffs = [np.asarray(wp[path].data) for path in node_paths]

    # NumPy functions are used directly; the scipy-namespace aliases
    # (sp.mean, sp.std, sp.median) are deprecated.
    mean_abs = [np.mean(np.abs(c)) for c in coeffs]
    row = list(mean_abs)
    row += [np.std(c) for c in coeffs]
    row += [np.median(c) for c in coeffs]
    row += [stats.skew(c) for c in coeffs]
    row += [stats.kurtosis(c) for c in coeffs]
    row += [np.sqrt(np.mean(c**2)) for c in coeffs]   #RMS Value
    # Ratios of consecutive mean absolute values. An all-zero node gives
    # inf/nan here, exactly as the plain division always did.
    row += [mean_abs[k] / mean_abs[k + 1] for k in range(len(mean_abs) - 1)]

    used = min(numcols, len(row))
    Extracted_Features[i, :used] = row[:used]

  return Extracted_Features


In [16]:
# Disabled, kept for reference: this whole cell is one string literal, so none of the
# code inside it runs. It holds the feature-extraction steps for set_a, set_b_1 and
# set_b_2; only the set_b_2 part is repeated as live code in a later cell.
'''
# load dataset-a, keep them separate for testing purpose
import os, fnmatch

A_folder=INPUT_DIR+'/set_a/'
# set-a
A_artifact_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'artifact*.wav')
A_artifact_sounds = wpd_features(load_file_data(folder=A_folder,file_names=A_artifact_files, duration=MAX_SOUND_CLIP_DURATION), 83)
A_artifact_labels = [1 for items in A_artifact_files]

A_normal_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'normal*.wav')
A_normal_sounds = wpd_features(load_file_data(folder=A_folder,file_names=A_normal_files, duration=MAX_SOUND_CLIP_DURATION), 83)
A_normal_labels = [2 for items in A_normal_sounds]

A_extrahls_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'extrahls*.wav')
A_extrahls_sounds = wpd_features(load_file_data(folder=A_folder,file_names=A_extrahls_files, duration=MAX_SOUND_CLIP_DURATION), 83)
A_extrahls_labels = [3 for items in A_extrahls_sounds]

A_murmur_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'murmur*.wav')
A_murmur_sounds = wpd_features(load_file_data(folder=A_folder,file_names=A_murmur_files, duration=MAX_SOUND_CLIP_DURATION), 83)
A_murmur_labels = [4 for items in A_murmur_files]

# test files
A_unlabelledtest_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_a'), 'Aunlabelledtest*.wav')
A_unlabelledtest_sounds = wpd_features(load_file_data(folder=A_folder,file_names=A_unlabelledtest_files, duration=MAX_SOUND_CLIP_DURATION), 83)
A_unlabelledtest_labels = [-1 for items in A_unlabelledtest_sounds]

print ("loaded dataset-a with WPD features")





%%time
import os, fnmatch
# load dataset-b, keep them separate for testing purpose 
B_folder=INPUT_DIR+'/set_b_1/'
# set-b
B_normal_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_1'), 'normal*.wav')  # include noisy files
B_normal_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_normal_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_normal_labels = [2 for items in B_normal_sounds]

B_murmur_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_1'), 'murmur*.wav')  # include noisy files
B_murmur_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_murmur_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_murmur_labels = [4 for items in B_murmur_files]

B_extrastole_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_1'), 'extrastole*.wav')
B_extrastole_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_extrastole_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_extrastole_labels = [5 for items in B_extrastole_files]

#test files
B_unlabelledtest_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_1'), 'Bunlabelledtest*.wav')
B_unlabelledtest_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_unlabelledtest_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_unlabelledtest_labels = [-1 for items in B_unlabelledtest_sounds]
print ("loaded dataset-b_1 with WPD features")




%%time
import os, fnmatch
# load dataset-b, keep them separate for testing purpose 
B_folder=INPUT_DIR+'/set_b_2/'
# set-b
B_normal_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_2'), 'normal*.wav')  # include noisy files
B_normal_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_normal_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_normal_labels = [2 for items in B_normal_sounds]

B_murmur_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_2'), 'murmur*.wav')  # include noisy files
B_murmur_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_murmur_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_murmur_labels = [4 for items in B_murmur_files]

B_extrastole_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_2'), 'extrastole*.wav')
B_extrastole_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_extrastole_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_extrastole_labels = [5 for items in B_extrastole_files]

#test files
B_unlabelledtest_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_2'), 'Bunlabelledtest*.wav')
B_unlabelledtest_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_unlabelledtest_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_unlabelledtest_labels = [-1 for items in B_unlabelledtest_sounds]
print ("loaded dataset-b_2 with WPD features")


'''

load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201106031558.wav
fixing audio lenght : artifact__201106031558.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201106010559.wav
fixing audio lenght : artifact__201106010559.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201105280851.wav
fixing audio lenght : artifact__201105280851.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201106040933.wav
fixing audio lenght : artifact__201106040933.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201106030612.wav
fixing audio lenght : artifact__201106030612.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201105051017.wav
fixing audio lenght : artifact__201

In [32]:

%%time
import os, fnmatch
# load dataset-b, keep them separate for testing purpose 
B_folder=INPUT_DIR+'/set_b_2/'
# set-b
B_normal_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_2'), 'normal*.wav')  # include noisy files
B_normal_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_normal_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_normal_labels = [2 for items in B_normal_sounds]

B_murmur_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_2'), 'murmur*.wav')  # include noisy files
B_murmur_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_murmur_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_murmur_labels = [4 for items in B_murmur_files]

B_extrastole_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_2'), 'extrastole*.wav')
B_extrastole_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_extrastole_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_extrastole_labels = [5 for items in B_extrastole_files]

#test files
B_unlabelledtest_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b_2'), 'Bunlabelledtest*.wav')
B_unlabelledtest_sounds = wpd_features(load_file_data(folder=B_folder,file_names=B_unlabelledtest_files, duration=MAX_SOUND_CLIP_DURATION), 83)
B_unlabelledtest_labels = [-1 for items in B_unlabelledtest_sounds]
print ("loaded dataset-b_2 with WPD features")



load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b_2/normal__181_1308052613891_D.wav
fixing audio lenght : normal__181_1308052613891_D.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b_2/normal__106_1306776721273_D1.wav
fixing audio lenght : normal__106_1306776721273_D1.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b_2/normal__143_1306763822290_C.wav
fixing audio lenght : normal__143_1306763822290_C.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b_2/normal_noisynormal_109_1305653972028_A.wav
fixing audio lenght : normal_noisynormal_109_1305653972028_A.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b_2/normal__177_1307989650056_B.wav
fixing audio lenght : normal__177_1307989650056_B.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Rese

In [33]:
# Disabled, kept for reference: the string literal below is not executed. It holds the
# combine steps for set_a, set_b_2 and set_b_1; only the set_b_2 step runs, as live
# code further down in this cell.
'''
#combine set-a 
x_data_a = np.concatenate((A_artifact_sounds, A_normal_sounds,A_extrahls_sounds,A_murmur_sounds))

y_data_a = np.concatenate((A_artifact_labels, A_normal_labels,A_extrahls_labels,A_murmur_labels))

test_x_a = np.copy((A_unlabelledtest_sounds))
test_y_a = np.copy((A_unlabelledtest_labels))

print ("combined training data record: ",len(y_data_a), len(test_y_a))
print ("combined training data record: ",len(x_data_a), len(test_x_a))





#combine set-b_2 
x_data_b2 = np.concatenate((B_normal_sounds,B_murmur_sounds,B_extrastole_sounds))

y_data_b2 = np.concatenate((B_normal_labels,B_murmur_labels,B_extrastole_labels))

test_x_b2 = np.copy((B_unlabelledtest_sounds))
test_y_b2 = np.copy((B_unlabelledtest_labels))

print ("combined training data record: ",len(y_data_b2), len(test_y_b2))





#combine set-b_1
x_data_b1 = np.concatenate((B_normal_sounds,B_murmur_sounds,B_extrastole_sounds))
 
y_data_b1 = np.concatenate((B_normal_labels,B_murmur_labels,B_extrastole_labels))
 
test_x_b1 = np.copy((B_unlabelledtest_sounds))
test_y_b1 = np.copy((B_unlabelledtest_labels))
 
print ("combined training data record: ",len(y_data_b1), len(test_y_b1))
print ("combined training data record: ",len(x_data_b1), len(test_x_b1))
'''



# Combine set-b_2: stack the per-class feature rows and labels in the same class
# order (normal, murmur, extrastole), and take copies of the unlabelled test data.
x_data_b2 = np.concatenate([B_normal_sounds, B_murmur_sounds, B_extrastole_sounds])
y_data_b2 = np.concatenate([B_normal_labels, B_murmur_labels, B_extrastole_labels])

test_x_b2 = np.array(B_unlabelledtest_sounds, copy=True)
test_y_b2 = np.array(B_unlabelledtest_labels, copy=True)

# Same counts reported from the label side and from the feature side.
print ("combined training data record: ",len(y_data_b2), len(test_y_b2))
print ("combined training data record: ",len(x_data_b2), len(test_x_b2))

combined training data record:  327 0
combined training data record:  327 0


In [34]:
import os 

# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will be saved in your Google Drive!

# the base Google Drive directory
# NOTE(review): spelled "My Drive" here but "MyDrive" in INPUT_DIR; confirm both
# resolve to the same mounted folder.
root_dir = "/content/drive/My Drive/"

# choose where you want your project files to be saved (relative to root_dir)
project_folder = "AI ML Things/University of Turku Research Internship/WPD_features/"

def create_and_set_working_directory(project_folder):
  """Make ``root_dir + project_folder`` the current working directory.

  The folder is created first when it does not exist, including any missing
  parent folders. After changing into it, an empty marker file
  'new_file_in_working_directory.txt' is created (or its timestamp refreshed)
  so the location can be checked from the Drive UI.

  Parameters
  ----------
  project_folder : str
      Path relative to the module-level ``root_dir``. The two are joined by
      plain string concatenation, so ``root_dir`` must end with a separator.

  Side effects: changes the process working directory, may create folders
  and one file, and prints status messages.
  """
  target_dir = root_dir + project_folder

  # check if your project folder exists. if not, it will be created.
  # makedirs (unlike mkdir) also creates missing intermediate folders.
  if not os.path.isdir(target_dir):
    os.makedirs(target_dir, exist_ok=True)
    print(root_dir + project_folder + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(target_dir)

  # create a test file to make sure it shows up in the right place
  # (pure-Python equivalent of the shell command `touch`)
  marker_name = 'new_file_in_working_directory.txt'
  with open(marker_name, 'a'):
    os.utime(marker_name, None)
  print('\nYour working directory was changed to ' + root_dir + project_folder + \
        "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

# Switch to the project folder now: the np.save / np.load calls in later cells use relative file names.
create_and_set_working_directory(project_folder)


Your working directory was changed to /content/drive/My Drive/AI ML Things/University of Turku Research Internship/WPD_features/

An empty text file was created there. You can also run !pwd to confirm the current working directory.


In [35]:
# The two string literals below are disabled save steps (set_a, then set_b_1 and
# set_b_2); they are not executed. Only the set_b_2 arrays are written by this cell.
'''
np.save('x_data_a.npy',x_data_a)
np.save('y_data_a.npy',y_data_a)
np.save('test_x_a.npy',test_x_a)
np.save('test_y_a.npy',test_y_a)
'''
'''
np.save('x_data_b1.npy',x_data_b1)
np.save('y_data_b1.npy',y_data_b1)
np.save('test_x_b1.npy',test_x_b1)
np.save('test_y_b1.npy',test_y_b1)



np.save('x_data_b2.npy',x_data_b2)
np.save('y_data_b2.npy',y_data_b2)
np.save('test_x_b2.npy',test_x_b2)
np.save('test_y_b2.npy',test_y_b2)

'''

# Persist the set_b_2 features/labels into the current working directory.
np.save('x_data_b2.npy',x_data_b2)
np.save('y_data_b2.npy',y_data_b2)
np.save('test_x_b2.npy',test_x_b2)
np.save('test_y_b2.npy',test_y_b2)

In [36]:
# NOTE(review): this cell repeats the previous working-directory setup cell verbatim.
import os 

# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will be saved in your Google Drive!

# the base Google Drive directory
# NOTE(review): spelled "My Drive" here but "MyDrive" in INPUT_DIR; confirm both
# resolve to the same mounted folder.
root_dir = "/content/drive/My Drive/"

# choose where you want your project files to be saved (relative to root_dir)
project_folder = "AI ML Things/University of Turku Research Internship/WPD_features/"

def create_and_set_working_directory(project_folder):
  """Make ``root_dir + project_folder`` the current working directory.

  NOTE(review): the same function is already defined in an earlier cell; this
  definition replaces it when the cell runs.

  The folder is created first when it does not exist, including any missing
  parent folders. After changing into it, an empty marker file
  'new_file_in_working_directory.txt' is created (or its timestamp refreshed)
  so the location can be checked from the Drive UI.

  Parameters
  ----------
  project_folder : str
      Path relative to the module-level ``root_dir``. The two are joined by
      plain string concatenation, so ``root_dir`` must end with a separator.

  Side effects: changes the process working directory, may create folders
  and one file, and prints status messages.
  """
  target_dir = root_dir + project_folder

  # check if your project folder exists. if not, it will be created.
  # makedirs (unlike mkdir) also creates missing intermediate folders.
  if not os.path.isdir(target_dir):
    os.makedirs(target_dir, exist_ok=True)
    print(root_dir + project_folder + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(target_dir)

  # create a test file to make sure it shows up in the right place
  # (pure-Python equivalent of the shell command `touch`)
  marker_name = 'new_file_in_working_directory.txt'
  with open(marker_name, 'a'):
    os.utime(marker_name, None)
  print('\nYour working directory was changed to ' + root_dir + project_folder + \
        "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

# Switch to the project folder: the np.load / np.save calls in the following cells use relative file names.
create_and_set_working_directory(project_folder)


Your working directory was changed to /content/drive/My Drive/AI ML Things/University of Turku Research Internship/WPD_features/

An empty text file was created there. You can also run !pwd to confirm the current working directory.


In [37]:
# Read the cached set_a arrays from the working directory.
# NOTE: no live cell in this notebook writes these four files (the matching np.save
# calls are disabled), so they must already exist from an earlier run.
x_data_a, y_data_a, test_x_a, test_y_a = (
    np.load(npy_name)
    for npy_name in ('x_data_a.npy', 'y_data_a.npy', 'test_x_a.npy', 'test_y_a.npy')
)

In [38]:
# Read the cached set_b_1 arrays from the working directory.
# NOTE: no live cell in this notebook writes these four files (the matching np.save
# calls are disabled), so they must already exist from an earlier run.
x_data_b1, y_data_b1, test_x_b1, test_y_b1 = (
    np.load(npy_name)
    for npy_name in ('x_data_b1.npy', 'y_data_b1.npy', 'test_x_b1.npy', 'test_y_b1.npy')
)

In [39]:
# Read the set_b_2 arrays back from the .npy files in the working directory.
x_data_b2, y_data_b2, test_x_b2, test_y_b2 = (
    np.load(npy_name)
    for npy_name in ('x_data_b2.npy', 'y_data_b2.npy', 'test_x_b2.npy', 'test_y_b2.npy')
)

In [40]:
# Sanity check: number of set_a labels (should equal the row count of x_data_a).
y_data_a.shape

(124,)

In [41]:
# Sanity check: set_a feature matrix, expected (n_samples, 83).
x_data_a.shape

(124, 83)

In [42]:
# Sanity check: set_b_1 feature matrix, expected (n_samples, 83).
x_data_b1.shape

(133, 83)

In [43]:
# Sanity check: number of set_b_1 labels (should equal the row count of x_data_b1).
y_data_b1.shape

(133,)

In [44]:
# Sanity check: set_b_2 feature matrix, expected (n_samples, 83).
x_data_b2.shape

(327, 83)

In [45]:
# Sanity check: number of set_b_2 labels (should equal the row count of x_data_b2).
y_data_b2.shape

(327,)

In [49]:
# Combine set-a, set-b_1 and set-b_2 (always in that order, so that feature rows
# and labels stay aligned) into one training set and one test set.
train_x = np.concatenate([x_data_a, x_data_b1, x_data_b2], axis=0)
train_y = np.concatenate([y_data_a, y_data_b1, y_data_b2], axis=0)

test_x = np.concatenate([test_x_a, test_x_b1, test_x_b2], axis=0)
test_y = np.concatenate([test_y_a, test_y_b1, test_y_b2], axis=0)

print ("combined training data record: ",len(train_y), len(test_y))

combined training data record:  584 246


In [50]:
# Persist the combined train/test arrays into the current working directory.
np.save('train_x.npy',train_x)
np.save('train_y.npy',train_y)
np.save('test_x.npy',test_x)
np.save('test_y.npy',test_y)

In [51]:
# Append the test rows after the training rows to get one array over all recordings.
# NOTE(review): the set_b_2 test labels are -1 placeholders (see the loading cell);
# presumably the same holds for the cached set_a / set_b_1 test labels; confirm
# before using y_data_wpd as training targets.
x_data_wpd = np.concatenate((train_x, test_x))
y_data_wpd = np.concatenate((train_y, test_y))

In [52]:
# Sanity check: rows = training rows + test rows, 83 feature columns.
x_data_wpd.shape

(830, 83)

In [53]:
# Sanity check: one label per row of x_data_wpd.
y_data_wpd.shape

(830,)

In [54]:
# Persist the full feature matrix and label vector.
# NOTE(review): the first file name contains a space before ".npy" ('x_data_wpd .npy'),
# unlike 'y_data_wpd.npy'. This looks like a typo; check what downstream loaders
# expect before renaming.
np.save('x_data_wpd .npy',x_data_wpd )
np.save('y_data_wpd.npy',y_data_wpd)