In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import pickle

In [3]:
data_p = pd.read_pickle(r'drive/MyDrive/pos')
data_n = pd.read_pickle(r'drive/MyDrive/neg')

In [4]:
label0 = np.zeros(len(data_n))
label1 = np.ones(len(data_p))

In [5]:
data_features = np.concatenate((data_p, data_n),axis = 0)
data_labels = np.concatenate((label1, label0),axis = 0)

In [6]:
data_features = np.array(data_features, dtype=np.float)
data_labels = np.array(data_labels, dtype=np.long)
data_features.shape

(533692, 562)

In [7]:
np.random.seed(10605)
np.random.shuffle(data_features) 
np.random.seed(10605)
np.random.shuffle(data_labels)

In [8]:
np.sum(data_labels)/len(data_labels)

0.5621069830538963

In [9]:
data_labels

array([0, 1, 0, ..., 1, 0, 0])

In [10]:
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

train_x = data_features[30000:]
train_y = data_labels[30000:]
test_x = data_features[:30000]
test_y = data_labels[:30000]

In [11]:
test_x.shape

(30000, 562)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [13]:
class SongDataset(Dataset):
    def __init__(self, x, y=None):
        self.x = x
        self.y = y
        

    def __len__(self):
        return len(self.y)


    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [14]:
def get_dataloader(trainset, valset = None, batch_size = 256, num_workers = 4):
    train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers,drop_last = True)
    
    if(valset is None):
        val_loader = None
    else:
        val_loader = DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=num_workers,drop_last = True)
        
    return (train_loader, val_loader)

In [15]:
trainset = SongDataset(train_x,train_y)
valset = SongDataset(test_x,test_y)
train_loader, val_loader = get_dataloader(trainset,valset)

In [16]:
def train_one_epoch(model, train_loader, optimizer):
    
    model.train()
    total_loss = 0
    count = 0
    acc = 0
    
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        
        y_hat = model(x.float())
        loss = criterion(np.squeeze(y_hat), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #compute loss and accuracy 
        count += 1
        total_loss += loss.item()
        values, indices = y_hat.max(1)
        acc += (y-indices == 0).sum(dim=0).item()
    
    return(total_loss/count, acc/len(train_loader.dataset))

In [17]:
model = nn.Sequential(
            nn.Linear(562, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Linear(1024, 2048),
            nn.ReLU(),
            nn.BatchNorm1d(2048),
            nn.Linear(2048, 4096),
            nn.ReLU(),
            nn.BatchNorm1d(4096),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.BatchNorm1d(4096),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.BatchNorm1d(2048),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, 2)
        )

model.to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [18]:
for i in range(10):
  avg_train_loss, train_accuracy = train_one_epoch(model, train_loader, optimizer)
  print(train_accuracy)

0.849261056359839
0.9620700745693797
0.9800929933371981
0.9865850559468882
0.9888940066548605
0.991258546889766
0.9918779730470207
0.9919554013166777
0.9933649928924819
0.9937044860748235


In [19]:
def validate_model(model, val_loader):  
    """
    Validate a given model with a validation dataloader.
    
    args:
        model (nn.Module): the trained model 
        val_loader (DataLoader): iterable for valset minibatches

    return: Tuple(avg_val_loss, val_accuracy, val_time)
        avg_val_loss (float): average validation loss across batches
        val_accuracy (float): portion of correctly classified images in the validation dataset
        val_time (float): the time taken to run this function
    """
    
    model.eval()
    torch.no_grad()
    total_loss = 0
    count = 0
    acc = 0
    
    for x, y in val_loader:
        x = x.to(device)
        y = y.to(device)
        
        y_hat = model(x.float())
        loss = criterion(y_hat, y)
        
        #compute loss and accuracy 
        count += 1
        total_loss += loss.item()
        values, indices = y_hat.max(1)
        acc += (y-indices == 0).sum(dim=0).item()

    
    
    return(total_loss/count, acc/len(val_loader.dataset))

In [22]:
avg_val_loss, val_accuracy = validate_model(model, val_loader)
print(val_accuracy)

0.9922666666666666


In [24]:
torch.save(model.state_dict(), 'drive/MyDrive/605model_state_dict')

In [None]:
!wget http://millionsongdataset.com/sites/default/files/tasteprofile/sid_mismatches.txt
!unzip drive/MyDrive/train_triplets.txt.zip
!unzip drive/MyDrive/taste_profile_song_to_tracks.txt.zip

--2020-12-01 05:49:30--  http://millionsongdataset.com/sites/default/files/tasteprofile/sid_mismatches.txt
Resolving millionsongdataset.com (millionsongdataset.com)... 173.231.209.32
Connecting to millionsongdataset.com (millionsongdataset.com)|173.231.209.32|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2026182 (1.9M) [text/plain]
Saving to: ‘sid_mismatches.txt’


2020-12-01 05:49:31 (3.25 MB/s) - ‘sid_mismatches.txt’ saved [2026182/2026182]

Archive:  drive/MyDrive/train_triplets.txt.zip
  inflating: train_triplets.txt      
Archive:  drive/MyDrive/taste_profile_song_to_tracks.txt.zip
  inflating: taste_profile_song_to_tracks.txt  


In [None]:
songToTrackDf = pd.read_csv(
    "taste_profile_song_to_tracks.txt", names=["song", "track"], sep="\t")
songToTrackDf.dropna(inplace=True)
userSongCountDf = pd.read_csv(
    "train_triplets.txt", names=["user", "song", "count"], sep="\t")
mismatches = []
with open("sid_mismatches.txt", "r", encoding='UTF-8') as f:
    for line in f:
        mismatches.append((line[8:8+len(songToTrackDf["song"][0])], line[9+len(songToTrackDf["song"][0]):9+len(songToTrackDf["song"][0])+len(songToTrackDf["track"][0])]))
mismatchesDf = pd.DataFrame(mismatches, columns=["song", "track"])
songToTrackDf = songToTrackDf[(~songToTrackDf["track"].isin(mismatchesDf["track"])) & (~songToTrackDf["song"].isin(mismatchesDf["song"]))]
userSongCountDf = userSongCountDf[userSongCountDf["song"].isin(songToTrackDf["song"])]
userSongTrackCountDf = userSongCountDf.merge(songToTrackDf, how="left", on="song")
userSongTrackCountDf = userSongTrackCountDf[userSongTrackCountDf["count"] > 1]
userSongTrackCountDf.reset_index(drop=True, inplace=True)
uniqueSongs = userSongTrackCountDf["track"].unique()
tmp = userSongTrackCountDf[["user", "track"]]

userSongListDf = tmp.groupby("user")["track"].apply(list).reset_index(name='tracks')

In [None]:
test = list(userSongListDf.iloc[-1]['tracks'])


In [None]:
feature_subset = pickle.load(open('drive/MyDrive/parsed_subset.pkl', 'rb'))

In [None]:
feature_subset['track_id'] = feature_subset['track_id'].apply(lambda x: str(x[2:-1]))

In [None]:
track_ids = feature_subset['track_id'].tolist()
raw_track_ids = userSongTrackCountDf['track'].tolist()

intersect_track_ids = set(raw_track_ids) & set(track_ids)
print(len(intersect_track_ids))

3156


In [None]:
print(len(userSongTrackCountDf))
print(len(feature_subset['track_id']))

18550228
10000


In [None]:
temp = userSongTrackCountDf[userSongTrackCountDf['track'].isin(feature_subset['track_id'])]
print(len(temp))
assert len(temp) == len(temp[temp['count'] > 1])
temp = temp[["user", "track"]]
temp = temp.groupby("user").filter(lambda x: len(x['track']) > 1)
temp = temp.groupby("user")["track"].apply(list).reset_index(name='track_id')[["track_id"]].to_numpy()
temp = np.squeeze(temp)
print(temp.shape)


# sanity check that all remaining tracks are within intersected track_ids
users_track_ids = set()
for user_lst in temp:
  users_track_ids.update(set(user_lst))
assert len(users_track_ids - intersect_track_ids) == 0

237500
(41346,)


In [None]:
# compute pairs
import itertools
track_lsts = temp.tolist()

pairs = set()
for track_lst in track_lsts:
  pairs.update(set(itertools.combinations(track_lst, 2)))

In [None]:
# select features
features = feature_subset[[
    'track_id',
    'artist_familiarity',
    'danceability',
    'duration',
    'end_of_fade_in',
    'key',
    'key_confidence', 
    'loudness', 
    'mode',
    'segments_confidence',
    'segments_loudness_max',
    'segments_loudness_max_time',
    'segments_pitches',
    'segments_timbre',
    'tempo', 
    'time_signature', 
    'time_signature_confidence'
]]
features = features.set_index('track_id')

# flatten 2d features segments_pitches and segments_timbre
features['segments_pitches'] = features['segments_pitches'].apply(np.concatenate).apply(list)
features['segments_timbre'] = features['segments_timbre'].apply(np.concatenate).apply(list)

In [None]:
# flatten 2d features at column idx 8, 9, 10, 11, 12 corresponding to segments_...
# new len should be 8 + 10 + 10 + 10 + 120 + 120 + 3 = 281
def nested_flatten(lst):
  nested_parts = lst[8] + lst[9] + lst[10] + lst[11] + lst[12]
  return np.append(np.append(lst[:8], nested_parts), lst[13:])

In [None]:
# map features to pairs to create datapoints, flatten in the meanwhile
# datapoints stores (track1_id, track2_id): [(track1_id, track2_id), list_of_track1_features, list_of_track2_features]
datapoints_d = {}
for id1, id2 in pairs:
  track1 = nested_flatten(features.loc[id1].to_numpy())
  track2 = nested_flatten(features.loc[id2].to_numpy())
  if id1 < id2: # bigger track_id always precede smaller track_id in pair positioning
    track1, track2 = track2, track1
    id1, id2 = id2, id1
  datapoints_d[(id1, id2)] = [(id1, id2), track1, track2]

# sanity check
sample_point = list(datapoints_d.values())[1]
sample_key = list(datapoints_d.keys())[1]
assert len(sample_point) == 3
assert sample_point[0] == sample_key
assert len(sample_point[1]) == 281
assert len(sample_point[2]) == 281

In [None]:
# construct final data, where we have
# features: track1_features & track2_features in a list
# label: 1 if in datapoints_d, 0 if in negative_sample_datapoints
data_features = []
data_labels = []
for p_data in datapoints_d.values():
  track_id_pair, p_feature1, p_feature2 = p_data[0], p_data[1], p_data[2]
  data_features.append(np.concatenate((p_feature1, p_feature2)))
  data_labels.append(1)

assert len(data_features) == len(data_labels)


data_features = np.array(data_features)
data_labels = np.array(data_labels)
assert data_features.shape[0] == data_labels.shape[0]
assert data_features.shape[1] == 281 * 2

In [None]:
datapoints_d.shape

AttributeError: ignored