In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

In [None]:
pip install kneed
pip install yellowbrick
pip install scikit-learn

In [2]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# data preparation

In [3]:
cnx = sqlite3.connect('last_fm.db')

df = pd.read_sql_query("SELECT * FROM TRAINING_DB_2", cnx)
feat = pd.read_sql_query("SELECT * FROM FEATURES_2", cnx)

In [4]:
full = pd.merge(df, feat, how = "left", on = "track_id")

In [5]:
full_numeric = full.select_dtypes(include=['float64'])

In [6]:
full_numeric = full_numeric.dropna()
full = full.dropna()

In [7]:
scaler = StandardScaler()
full_scaled = scaler.fit_transform(full_numeric)

In [8]:
cluster_number = 10

In [None]:
km = KMeans(
    n_clusters=cluster_number, init='random',
    n_init=10, max_iter=500, 
    tol=1e-04, random_state=0
)
y_km = km.fit_predict(full_scaled)

In [11]:
song_index = full_numeric.reset_index()
indd = song_index["index"]

1

In [13]:
genre_column = full["genre_name"]

In [None]:
genre_column = []
for i in range(len(full_numeric)):
    num = indd[i]
    genr = full.iloc[num]["genre_name"]
    genre_column.append(genr)

In [14]:
genres_info = pd.DataFrame({"index" : indd, "genre" : genre_column})

Unnamed: 0,index,genre
0,0.0,Everything
1,1.0,Everything
2,2.0,Everything
3,3.0,Everything
4,4.0,Everything
...,...,...
1114143,,Vietnam - Underground
1114144,,Vietnam - Underground
1114145,,Vietnam - Underground
1114146,,Vietnam - Underground


In [15]:
songs_with_genres = pd.merge(song_index, genres_info, how = 'left', on = "index")
cluster_genres_info = pd.DataFrame({"index" : indd, "cluster" : y_km})
clustered_genres_df = pd.merge(songs_with_genres, cluster_genres_info, how = 'left', on = "index")

In [18]:
mean_null_list = []
def mean_null_calc(some_table):
    ggg = some_table.groupby(["genre", "cluster"]).count()[["index"]]
    ggg = ggg.reset_index()
    ggg = ggg.set_index("genre")
    ggg = ggg.pivot(columns='cluster', values='index')
    ggg2 = ggg.isnull().sum(axis=1)
    mean_null_list.append(ggg2.mean())

In [19]:
def create_pred_table(num_clusters):
    km = KMeans(
    n_clusters=num_clusters, init='random',
    n_init=10, max_iter=500, 
    tol=1e-04, random_state=0)
    y_km = km.fit_predict(full_scaled)
    new_genres_info = pd.DataFrame({"index" : indd, "cluster" : y_km})
    clustered_genres_df = pd.merge(songs_with_genres, new_genres_info, how = 'left', on = "index")
    return clustered_genres_df

In [None]:
# classification: logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(full_scaled, y_km, test_size=0.25, random_state=0)

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(full_scaled, y_classes, test_size=0.25, random_state=0)

In [None]:
train_set_scaled = scaler.fit_transform(train_set)

In [None]:
train_set = train_set.dropna()

In [None]:
genre_column = []
for i in range(len(train_set)):
    num = indd[i]
    genr = full.iloc[num]["genre_name"]
    genre_column.append(genr)

genre_column

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from numpy import mean
from numpy import std

In [None]:
# accuracy
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model_reg, train_sample, y_sample, scoring='accuracy', cv=5, n_jobs=-1)
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
# recall
# evaluate the model and collect the scores
n_scores = cross_val_score(decision_tree_2, x_train, y_train, scoring = "precision_micro", n_jobs =-1)
# report the model performance
print('Mean accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
cross_validate(model_reg, x_train, y_train, scoring = "accuracy", n_jobs = -1)

In [None]:
model_reg.fit(x_train, y_train)

In [None]:
predictions = model_reg.predict(x_test)

In [None]:
import seaborn as sns
from sklearn import metrics

In [None]:
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

In [None]:
# Use score method to get accuracy of model
score = model_reg.score(x_test, y_test)
print(score)

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'PiYG');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

In [None]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, predictions, average=None)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(x_train, y_train)
#Pipeline(steps=[('standardscaler', StandardScaler()),
               # ('sgdclassifier', SGDClassifier())])

In [None]:
from matplotlib import pyplot
train_set.hist(alpha=0.5, figsize=(20, 10))
pyplot.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler
columns = train_set.columns
norm = MinMaxScaler()
X_norm = norm.fit_transform(train_set)
X_norm = pd.DataFrame(X_norm, columns = columns)
X_norm.head()

In [None]:
# classification: decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)

In [None]:
cross_validate(clf, x_train, y_train, scoring = "accuracy", n_jobs = -1)

In [None]:
scores = []
accuracies = []
for i in range(1,30):
    decision_tree_2 = DecisionTreeClassifier(max_depth = i)
    n_scores = cross_val_score(decision_tree_2, x_train, y_train, scoring = "accuracy", n_jobs =-1)
    decision_tree_2.fit(x_train, y_train)
    score = decision_tree_2.score(x_test, y_test)
    accuracies.append((mean(n_scores)))
    scores.append(score)

In [None]:
plt.figure(figsize = (10,7))
plt.xticks(xlist_better)
plt.plot(range(1,30), accuracies, "bo-", linewidth = 2)
#plt.plot(scores, "ro-", alpha = 0.6, linewidth = 2)

plt.xlabel("Depth of the tree")
plt.ylabel("Accuracy of the model")
plt.legend(["Cross-validation accuracy"])

In [None]:
from sklearn import tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_2 = DecisionTreeClassifier(max_depth = 20)
decision_tree_2.fit(x_train, y_train)

In [None]:
score = decision_tree_2.score(x_test, y_test)
print(score)