In [17]:
import json
import numpy as np 
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
import pickle
import time

DATASET_PATH = "json_wav_data"

In [2]:
def load_data(dataset_path):
    """
        Load the data from json file
        return:
            X represents the mffc in data
            y represents the labels in data
    """
    with open(dataset_path, "r") as f:
        data = json.load(f)
    
    X = np.array(data["mfcc"])
    y = np.array(data["label"])
    y_label = np.array(data["name"])

    return X,y,y_label

In [7]:
def make_english_non_data(X_original):
    """
    given an X_original data which contains all segments of all audios, we want to extract those of english speakers
    and those of non-english speakers.
    (the indices used below are found by examining X_original)
    
    """
    
    #extracting english segments
    X_original_english = X_original[1015:2470] #at y_original[1015==>2469]=37 => y_label[37] = english

    #extracting non-english segments
    X_original_non_english = np.concatenate([X_original[:1015], X_original[2470:]]) 

    #randomly sampling same number of non-english examples from english ones
    """this is because there are a lot of non-english samples than english ones, 
    so I tried to constract the same number as english "randomly" from non-english (it's not guaranteed
    that the sampling will  be homogeneous).
    I'll try other methods of sampling in futur"""
    
    random_indices = np.random.randint(X_original_non_english.shape[0], size=X_original_english.shape[0])
    X_original_non_english = X_original_non_english[random_indices,:,:]

    #reshaping from (n,m,q) to (n*m, q)
    X_original_english = X_original_english.reshape((X_original_english.shape[0]*X_original_english.shape[1],X_original_english.shape[2]))
    X_original_non_english = X_original_non_english.reshape((X_original_non_english.shape[0]*X_original_non_english.shape[1],X_original_non_english.shape[2]))

    #creating output of english and non-english data
    #output y is encoded as 0 for non-english speakers and 1 for english speakers
    y_english = np.ones((X_original_english.shape[0],))
    y_non_english = np.zeros((X_original_non_english.shape[0],))

    #concatenating english and non english into X and y
    X = np.concatenate((X_original_english, X_original_non_english))
    y = np.concatenate((y_english, y_non_english))

    return (X, y)



X_train shape (751944, 13) 
y_train shape (751944,)
X_test shape (187986, 13) 
y_test shape (187986,)


In [None]:
X_original, y_original, y_label = load_data(DATASET_PATH)
print("X_original shape {} \ny_original shape {}".format(X.shape, y.shape))

X, y = ake_english_non_data(X_original)
print("X shape {} \ny shape {}".format(X.shape, y.shape))

In [5]:
"""
for computational reasons the data is reduced by 80%
"""
#randomly sampling 20% from X and y
n_points = len(X)
X_s = X[np.random.choice(n_points, int(0.2*n_points)).tolist()]
y_s = y[np.random.choice(n_points, int(0.2*n_points)).tolist()]


#spliting the data into 80% for training and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size = 0.30)
print("X_train shape {} \ny_train shape {}".format(X_train.shape, y_train.shape))
print("X_test shape {} \ny_test shape {}".format(X_test.shape, y_test.shape))



X_train shape (131590, 13) 
y_train shape (131590,)
X_test shape (56396, 13) 
y_test shape (56396,)


In [6]:
tstart = time.time()

#training svm (with rdbf kernel) model while tunning the C parameter.
param_grid = {'C': [0.1,100], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid,refit=True, verbose=10)

grid.fit(X_train,y_train)


tend = time.time()
print("Execution time :{}".format(tend-tstart))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] C=0.1, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=0.1, kernel=rbf, score=0.502, total= 9.7min
[CV] C=0.1, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.7min remaining:    0.0s


[CV] ................... C=0.1, kernel=rbf, score=0.504, total= 9.6min
[CV] C=0.1, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 19.3min remaining:    0.0s


[CV] ................... C=0.1, kernel=rbf, score=0.503, total= 9.9min
[CV] C=0.1, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 29.2min remaining:    0.0s


[CV] ................... C=0.1, kernel=rbf, score=0.505, total=17.3min
[CV] C=0.1, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 46.5min remaining:    0.0s


[CV] ................... C=0.1, kernel=rbf, score=0.505, total=11.9min
[CV] C=100, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 58.4min remaining:    0.0s


[CV] ................... C=100, kernel=rbf, score=0.501, total=18.1min
[CV] C=100, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 76.5min remaining:    0.0s


[CV] ................... C=100, kernel=rbf, score=0.498, total=18.6min
[CV] C=100, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 95.1min remaining:    0.0s


[CV] ................... C=100, kernel=rbf, score=0.500, total=18.1min
[CV] C=100, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 113.2min remaining:    0.0s


[CV] ................... C=100, kernel=rbf, score=0.504, total=18.2min
[CV] C=100, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 131.4min remaining:    0.0s


[CV] ................... C=100, kernel=rbf, score=0.502, total=18.4min


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 149.8min finished


Execution time :9736.80957365036


<b>Execution time : 2 hours and 42 minutes</b>

In [None]:
#evaluation
print(grid.best_params_) 
predictions = grid.predict(X_test) 

In [13]:
confusion_matrix(y_test, predictions)

array([[49196, 44704],
       [47312, 46774]])

In [16]:
accuracy_score(y_test, predictions)

0.5105167406083432

In [12]:
# print classification report 
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.51      0.52      0.52     93900
         1.0       0.51      0.50      0.50     94086

    accuracy                           0.51    187986
   macro avg       0.51      0.51      0.51    187986
weighted avg       0.51      0.51      0.51    187986



In [18]:
# save the model
filename = 'models/SVM_rbf.sav'
pickle.dump(grid, open(filename, 'wb'))

In [19]:
# load the model
model = pickle.load(open('models/SVM_rbf.sav', 'rb'))