In [6]:
import glob, python_speech_features, json
import numpy as np
from sklearn import svm, linear_model
from sklearn.model_selection import cross_val_score, train_test_split, KFold, LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
import librosa

In [25]:
import subprocess

sample_rate = 8000
byte_rate = sample_rate * 2 

def open_mp3(filename):
    process = subprocess.Popen(
            ['ffmpeg', '-loglevel', 'quiet', '-i',
            filename,
            '-ar', str(sample_rate) , '-ac', '1', '-f', 's16le', '-'],
            stdout=subprocess.PIPE)
    return process.stdout

def bytes_to_array(x, width=2, dtype=np.float16):
    # Invert the scale of the data
    scale = 1.0 / float(1 << ((8 * width) - 1))
    # Construct the format string
    fmt = "<i{:d}".format(width)
    # Rescale and format the data buffer
    return scale * np.frombuffer(x, fmt).astype(dtype)

In [4]:
def extract_features(files):
    feats = {}
    for filename in glob.glob(files):
        y, sr = librosa.load(filename, sr=None)
        mfcc = python_speech_features.mfcc(y[0:sample_rate], sample_rate)
        feats[filename] = mfcc
    return feats

In [7]:
%%time
ship = extract_features("shipping/*.wav")
not_ship = extract_features("not_shipping/*.wav")

CPU times: user 6.95 s, sys: 1.69 s, total: 8.64 s
Wall time: 1.52 s


In [12]:
X, y = [], []
for lol in ship.values():
    v = np.array(lol)
    X.append(v.flatten())
    y.append(1)

for lol in not_ship.values():
    v = np.array(lol)
    X.append(v.flatten())
    y.append(0)

X, X_hold, y, y_hold = train_test_split(X, y, test_size=0.1)
len(X), len(X_hold), len(y), len(y_hold)
#X[0].shape

(575, 64, 575, 64)

In [9]:
cv = KFold(n_splits=5, random_state=1, shuffle=True)
def test_classifier(c):
    ev = [True for x, y_ in zip(X_hold, y_hold) if c.predict([x])[0] == y_]
    return len(ev) / len(y_hold)

In [10]:
clf = svm.SVC()
scores = cross_val_score(clf, X, y, cv=cv)
scores.mean()

0.9686956521739131

In [11]:
clf = svm.SVC()
clf.fit(X, y)
%time test_classifier(clf)

CPU times: user 25.4 ms, sys: 68 µs, total: 25.4 ms
Wall time: 25.5 ms


0.984375

In [171]:
import pickle
len(pickle.dumps(clf))

3962529

In [19]:
rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, X, y, cv=cv)
scores.mean()

0.9530434782608695

In [20]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X, y)
%time test_classifier(rf)

CPU times: user 452 ms, sys: 755 µs, total: 453 ms
Wall time: 453 ms


0.921875

In [174]:
import pickle
len(pickle.dumps(rf))

559360

In [175]:
lr = linear_model.LogisticRegression(max_iter=1000)
scores = cross_val_score(rf, X, y, cv=cv)
scores.mean()

0.9634782608695651

In [176]:
lr = linear_model.LogisticRegression(max_iter=1000)
lr.fit(X, y)
%time test_classifier(lr)

CPU times: user 17.8 ms, sys: 2.19 ms, total: 20 ms
Wall time: 3.35 ms


0.953125

In [177]:
len(pickle.dumps(lr))

11010

In [13]:
%%time
window_size = byte_rate // 2
mp3_file = "train/20220301Z0520.mp3"
mp3, _ = librosa.load(mp3_file, sr=16000)


  return f(*args, **kwargs)


KeyboardInterrupt: 

In [16]:
%%time
mp3_data = open_mp3(mp3_file).read()
mp3 = bytes_to_array(mp3_data)

CPU times: user 160 ms, sys: 83.5 ms, total: 244 ms
Wall time: 1.21 s


In [187]:
mp3[0:sample_rate].shape

(16000,)

In [153]:
%time mfcc_feat = librosa.feature.mfcc(y=mp3, sr=sample_rate, n_mfcc=24).T
mfcc_feat.shape

CPU times: user 925 ms, sys: 119 ms, total: 1.04 s
Wall time: 625 ms


(22657, 24)

In [17]:
s = mp3[0:sample_rate]
%time mfcc_feat = python_speech_features.mfcc(s, sample_rate)
(s.shape, mfcc_feat.shape)

CPU times: user 6.23 ms, sys: 3.27 ms, total: 9.49 ms
Wall time: 4.08 ms


((16000,), (99, 13))

In [23]:
%%time
p=[]
for i in range(0, len(mp3), sample_rate // 2):
    s = mp3[i:i+sample_rate]
    if len(s) < sample_rate:break
    mfcc = python_speech_features.mfcc(s, sample_rate)
    #print(mfcc.flatten().shape)
    #break
    x = mfcc.flatten()
    if rf.predict([x]):
        p.append((i / sample_rate, clf.predict([x])))
p

CPU times: user 1min 7s, sys: 8.64 s, total: 1min 15s
Wall time: 12.7 s


[(9.0, array([1])),
 (15.5, array([1])),
 (22.0, array([1])),
 (38.0, array([1])),
 (43.5, array([1])),
 (44.5, array([0])),
 (47.5, array([1])),
 (50.0, array([0])),
 (54.0, array([0])),
 (66.5, array([1])),
 (68.0, array([1])),
 (71.0, array([1])),
 (76.0, array([1])),
 (77.0, array([1])),
 (84.5, array([0])),
 (89.5, array([1])),
 (95.0, array([1])),
 (97.5, array([0])),
 (105.0, array([1])),
 (112.0, array([1])),
 (118.0, array([1])),
 (133.0, array([0])),
 (141.5, array([1])),
 (152.5, array([1])),
 (174.5, array([0])),
 (185.0, array([0])),
 (207.5, array([0])),
 (214.0, array([1])),
 (216.0, array([0])),
 (225.5, array([0])),
 (230.5, array([0])),
 (249.0, array([0])),
 (283.5, array([0])),
 (299.5, array([0])),
 (308.5, array([1])),
 (311.5, array([0])),
 (317.0, array([1])),
 (332.5, array([1])),
 (334.0, array([0])),
 (336.5, array([0])),
 (346.0, array([1])),
 (354.0, array([1])),
 (381.5, array([1])),
 (391.0, array([0])),
 (437.0, array([0])),
 (439.0, array([1])),
 (487.5

In [198]:
len(mp3)

11599872

In [142]:
mp3[i:i+5]

array([0., 0., 0., 0., 0.], dtype=float32)

In [None]:
help