In [None]:
import pandas as pd
from sklearn.cluster import Birch
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv
import pickle
import numpy as np

In [None]:
def dataclean(filePath):
    '''
    Loads the song csv and splits it into clustering features and metadata.

    Parameters
    ----------
    filePath : str
        Path to a csv file that contains every column named in ``dropThis``.

    Returns
    -------
    tuple
        ``(data, names, artist, ids)``. ``data`` is the loaded frame with the
        ``dropThis`` columns removed; the other three are single-column
        DataFrames (``name``, ``artists``, ``id``) kept for the output csv.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the file.
    '''
    data = pd.read_csv(filePath)
    # Double brackets keep these as one-column DataFrames rather than Series.
    names = data[['name']]
    artist = data[['artists']]
    ids = data[['id']]

    dropThis = ['year', 'artists', 'explicit', 'duration_ms', 'popularity', 'id', 'release_date', 'name', 'liveness', 'mode']
    # Use the ``columns=`` keyword: the positional axis form ``drop(name, 1)``
    # was deprecated in pandas 1.3 and removed in pandas 2.0.
    data = data.drop(columns=dropThis)
    return (data, names, artist, ids)

In [None]:
# Load the raw dataset and discard every row that has a missing value.
datapath = "/tmp/data.csv"
song_data = pd.read_csv(datapath)
song_data = song_data.dropna()
# index=False: otherwise the row index is written as an unnamed first column,
# which read_csv loads back as "Unnamed: 0". That column is not in dataclean's
# drop list, so the row number would silently become a clustering feature.
song_data.to_csv("/tmp/cleandata.csv", index=False)

In [None]:
# Split the cleaned csv into the feature frame and the single-column
# name / artists / id frames that are later written next to the cluster labels.
clean_path = "/tmp/cleandata.csv"
data, names, artists, IDs = dataclean(clean_path)

In [None]:
def normalize(data):
    '''
    Standardizes the cleansed feature data with a StandardScaler.

    ``data`` is converted with ``to_numpy()``, a new StandardScaler is fitted
    on that array, and the same array is transformed with it.

    Returns the scaled array and the fitted scaler, in that order.
    '''
    fitted_scaler = StandardScaler()
    # fit_transform is fit() followed by transform() on the same array.
    scaled = fitted_scaler.fit_transform(data.to_numpy())
    return scaled, fitted_scaler

In [None]:
# Scale the features and keep the fitted scaler (it is pickled by migrate()
# further down).
X, scaler = normalize(data)
print(len(X[0])) #Number of initial components

10


In [None]:
def pca(data, components):
    '''
    Fits a PCA with ``n_components=components`` on ``data`` and applies it.

    Returns the transformed data and the fitted PCA object, in that order.
    '''
    reducer = PCA(n_components=components).fit(data)
    projected = reducer.transform(data)
    return projected, reducer

In [None]:
# Reduce the scaled features to 5 components. X is overwritten here, so
# re-running this cell on its own feeds already-reduced data back into pca().
X, pca_model = pca(X, 5)

In [None]:
def birch(data, n_clusters=None):
    '''
    Fits a birch model with the data and returns the model.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed unchanged to ``Birch.fit``.
    n_clusters : int or None, optional
        Forwarded to ``Birch(n_clusters=...)``. Defaults to ``None``, the
        value this helper used to hard-code, so existing calls are unaffected.

    Returns
    -------
    Birch
        The fitted model.
    '''
    model = Birch(n_clusters=n_clusters)
    model.fit(data)
    return model

In [None]:
# Build the Birch model directly (the birch() helper defined earlier is not
# called here) with n_clusters=10000, then fit it on the reduced features.
model = Birch(n_clusters = 10000)
model.fit(X)

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=10000,
      threshold=0.5)

In [None]:
# Largest cluster label the fitted model assigned.
print(np.amax(model.labels_))

9999


In [None]:
def csvWrite(data, names, artists, ids, model, out_path="/home/birch_final.csv"):
    '''
    Writes the data with the assigned clusters to a csv file.

    One row is written per song as ``name, artists, id, label``, ordered by
    ascending cluster label. No header row is written.

    Parameters
    ----------
    data : object
        Unused; kept so existing calls keep working.
    names, artists, ids : pandas.DataFrame
        Frames whose values are flattened into one entry per song.
        Assumed to be row-aligned with ``model.labels_`` - TODO confirm.
    model : object
        Fitted clustering model exposing ``labels_``.
    out_path : str, optional
        Destination file. Defaults to the path that used to be hard-coded.
    '''
    rows = zip(
        names.values.flatten(),
        # Flattened like the other columns: iterating the unflattened 2-D
        # values yields one-element arrays, which csv writes as their repr.
        artists.values.flatten(),
        ids.values.flatten(),
        model.labels_,
    )
    # sorted() is stable, so songs sharing a label keep their input order.
    ordered = sorted(rows, key=lambda row: row[3])
    # newline="" is what the csv module requires to avoid blank lines between
    # rows on platforms that translate line endings; utf-8 keeps non-ASCII
    # text writable regardless of the locale default.
    with open(out_path, "w", newline="", encoding="utf-8") as fout:
        csv.writer(fout).writerows(ordered)

In [None]:
# Export name / artists / id / cluster label rows to /home/birch_final.csv.
csvWrite(data, names, artists, IDs, model)

In [None]:
# google.colab is only importable inside a Colab runtime. This sends the
# exported cluster csv to the browser as a download.
from google.colab import files
files.download("/home/birch_final.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# partial_fit() without data: per the scikit-learn Birch docs this only
# re-runs the global clustering step on the existing subclusters.
# NOTE(review): this runs after the csv export above, so it cannot change the
# exported labels - confirm whether the call is still needed.
model.partial_fit()

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=10000,
      threshold=0.5)

In [None]:
def migrate(model, scaler, pca):
    '''
    Serializes the model, scaler and PCA to pickle files for deployment in Flask.

    Writes ``model_save.pkl``, ``scaler_save.pkl`` and ``pca_save.pkl`` to the
    current working directory, overwriting any existing files.

    Parameters
    ----------
    model : object
        Fitted clustering model, written to ``model_save.pkl``.
    scaler : object
        Fitted scaler, written to ``scaler_save.pkl``.
    pca : object
        Fitted PCA object, written to ``pca_save.pkl``.
    '''
    artifacts = (
        (model, "model_save.pkl"),
        (scaler, "scaler_save.pkl"),
        (pca, "pca_save.pkl"),
    )
    for obj, filename in artifacts:
        # The context manager flushes and closes each file even if dump()
        # raises; the bare open() calls used before never closed the handles.
        with open(filename, "wb") as fout:
            pickle.dump(obj, fout)


In [None]:
# Pickle the fitted Birch model, the scaler and the PCA object.
migrate(model, scaler, pca_model)

In [None]:
# Download the three pickle files (Colab only). migrate() writes them with
# relative names, so the /content/ prefix assumes the notebook's working
# directory is /content - TODO confirm.
from google.colab import files
files.download("/content/scaler_save.pkl")
files.download("/content/model_save.pkl")
files.download("/content/pca_save.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>