In [None]:
import os

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.cluster import KMeans
import seaborn as sns
import sys
import copy
import random

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [None]:
# Read the GTZAN 3-second feature table and discard its first column
# (the serial number, which we don't want as a feature).
data = pd.read_csv(
    '../input/gtzan-dataset-music-genre-classification/Data/features_3_sec.csv'
).iloc[:, 1:]
data.head()

In [None]:
# Integer id for every genre string found in the 'label' column.
genre_to_id = {
    'blues': 0,
    'classical': 1,
    'country': 2,
    'disco': 3,
    'hiphop': 4,
    'jazz': 5,
    'metal': 6,
    'pop': 7,
    'reggae': 8,
    'rock': 9,
}

# d holds one integer id per row of `data`, in row order.
# A dict lookup raises KeyError on an unexpected genre instead of silently
# skipping the row, which would leave d shorter than (and misaligned with) data.
d = [genre_to_id[genre] for genre in data['label']]

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of every column except 'label', coloured by genre id (d).
# random_state is fixed so the scatter plot is the same on every run.
tsne = TSNE(n_components=2, random_state=42).fit_transform(data.drop(columns='label'))
plt.scatter(tsne[:, 0], tsne[:, 1], c=d);

In [None]:
# Summary statistics of the feature table, to eyeball value ranges before scaling.
data.describe()

In [None]:
# Pairwise correlation between the feature columns.
# The string-valued 'label' column is dropped first: DataFrame.corr() no longer
# ignores non-numeric columns by default in pandas >= 2.0 and would raise on it.
corr = data.drop(columns='label').corr()
plt.figure()
ax = sns.heatmap(corr, square=True)

In [None]:
# List the column names; 'label' is the column dropped from the features below.
data.columns

### FEATURES

In [None]:
# Model inputs: every column of the table except 'label'.
features = data.drop(columns='label')
# Targets: the integer genre ids built earlier (d), wrapped in a one-column frame.
labels = pd.DataFrame(data=d)

In [None]:
# Flatten the labels to a 1-D array.
# reshape(-1) infers the length, so this no longer breaks if the row count is not 9990.
labels = np.array(labels).reshape(-1)
print(labels)
print(labels.shape)

### NORMALISE

In [None]:
# Min-max scale every feature column, then wrap the resulting array back into
# a DataFrame that carries the original column names.
min_max_scaler = preprocessing.MinMaxScaler()
features_scaled = pd.DataFrame(
    min_max_scaler.fit_transform(features),
    columns=features.columns,
)

### SPLIT AND TRAIN

In [None]:
def linear_classification_train(model, features, labels, test_split_size, title = "Default"):
    """
    Fit a classifier on a train split and report how it does on the held-out split.

    input :
    model           : estimator exposing fit(X, y) and predict(X)
    features        : feature table (the callers in this notebook pass scaled features)
    labels          : class labels aligned row-for-row with features
    test_split_size : forwarded to train_test_split as test_size
    title           : text used in the printed accuracy line and as the plot title

    output :
    None. Prints the accuracy on the test split as a percentage and draws the
    confusion matrix of the test predictions as a heatmap.
    """
    # Fixed random_state: every model is evaluated on the same split.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_split_size, random_state=42
    )
    model.fit(x_train, y_train)
    preds = model.predict(x_test)

    # Convert to a percentage before rounding; rounding first and multiplying
    # afterwards leaves float noise such as 72.07000000000001 in the output.
    accuracy_pct = round(accuracy_score(y_test, preds) * 100, 2)
    print('Accuracy', title, ':', accuracy_pct, '\n')

    plt.figure()
    ax = sns.heatmap(
        confusion_matrix(y_test, preds),
        square=True,
        annot=True,
        fmt='d',  # cells are integer counts; the default format shows >= 100 as e.g. 1.9e+02
        cmap=plt.cm.Blues,
    )
    # confusion_matrix puts true classes on rows and predicted classes on columns.
    ax.set_title(title)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

Logistic regression  

Other solvers experimented with:  
1. `saga`  
2. `sag`  
3. `newton-cg`  

`lbfgs` gave the best accuracy of the solvers tried, so it is the one used below.

In [None]:
# Multinomial logistic regression baseline.
# multi_class is not passed: it is deprecated in recent scikit-learn, and with the
# lbfgs solver a multiclass target is already fitted with the multinomial loss.
model = LogisticRegression(random_state=0, solver='lbfgs')
linear_classification_train(model, features_scaled, labels, 0.2, "Logistic Regression")

In [None]:
from sklearn.svm import SVC

# SVM with a linear kernel at C=1.0.
model = SVC(kernel='linear', C=1.0, random_state=0)
linear_classification_train(
    model, features_scaled, labels, test_split_size=0.2, title="SVM linear kernel"
)

In [None]:
# Sweep the polynomial kernel degree from 2 to 7 at C=1.0.
# One loop replaces six copy-pasted cells; the models and printed titles are the
# same, and `model` ends up as the degree-7 classifier exactly as before.
for degree in range(2, 8):
    model = SVC(C=1.0, kernel='poly', degree=degree, random_state=0)
    linear_classification_train(
        model, features_scaled, labels, 0.2, "SVM poly kernel of degree " + str(degree)
    )

In [None]:
# RBF kernel with C = 1 and C = 2.
for c in range(1, 3):
    model = SVC(kernel='rbf', C=c, random_state=0)
    linear_classification_train(
        model, features_scaled, labels, 0.2, "SVM rbf kernel c={}".format(c)
    )


In [None]:
# RBF kernel, C swept from 10 to 100 in steps of 10.
for c_value in range(10, 101, 10):
    model = SVC(kernel='rbf', C=c_value, random_state=0)
    linear_classification_train(
        model, features_scaled, labels, 0.2, "SVM rbf kernel c={}".format(c_value)
    )

In [None]:
# RBF kernel, C swept from 100 to 1000 in steps of 100.
for c_value in range(100, 1001, 100):
    model = SVC(kernel='rbf', C=c_value, random_state=0)
    linear_classification_train(
        model, features_scaled, labels, 0.2, "SVM rbf kernel c={}".format(c_value)
    )

In [None]:
# Sigmoid kernel at C=1.0, with gamma left at the SVC default.
model = SVC(kernel='sigmoid', C=1.0, random_state=0)
linear_classification_train(
    model, features_scaled, labels, test_split_size=0.2, title="SVM sigmoid kernel"
)

In [None]:
# Degree-6 polynomial kernel, C swept over 0.1, 0.2, ..., 1.0.
for tenths in range(1, 11):
    c = tenths / 10.0
    model = SVC(kernel='poly', degree=6, C=c, random_state=0)
    linear_classification_train(
        model, features_scaled, labels, 0.2, "SVM poly d=6 kernel {}".format(c)
    )

In [None]:
# Degree-6 polynomial kernel, C = 1, 2, 3.
for c in range(1, 4):
    model = SVC(kernel='poly', degree=6, C=c, random_state=0)
    linear_classification_train(
        model, features_scaled, labels, 0.2, "SVM poly deg=6 kernel {}".format(c)
    )

In [None]:
# sgd = SGDClassifier(max_iter=5000, random_state=0, loss = 'log')
# linear_classification_train(sgd, features_scaled, labels, 0.2, "SGD Classifier logistic regression")

# sgd = SGDClassifier(max_iter=5000, random_state=0, loss = 'modified_huber')
# linear_classification_train(sgd, features_scaled, labels, 0.2, "SGD Classifier some model")

# sgd = SGDClassifier(max_iter=50000, random_state=0, loss = 'epsilon_insensitive')
# linear_classification_train(sgd, features_scaled, labels, 0.2, "SGD Classifier some other model")

# # sgd = SGDClassifier(max_iter=50000, random_state=0, loss = 'squared_epsilon_insensitive')
# # linear_classification_train(sgd, features_scaled, labels, 0.2, "SGD Classifier ...")

# sgd = SGDClassifier(max_iter=5000, random_state=0)
# linear_classification_train(sgd, features_scaled, labels, 0.2, "SGD Classifier SVM")

In [None]:
# def cluster_train(model, features, labels, test_split_size):
#     x_train, x_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=test_split_size, random_state=42)
#     pred = model.fit(x_train)
    
#     print(len(pred.labels_))
#     print(pred.labels_)
    
#     pred_labels = [0,0,0,0,0,0,0,0,0,0]
    
#     for i in pred.labels_:
#         pred_labels[i-1] += 1
        
#     print(pred_labels)
#     print(y_train)
    
#     print(len(labels))
#     print('Accuracy of kmeans :', round(accuracy_score(y_train, pred.labels_), 5), '\n')

In [None]:
# model = KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=3000)
# cluster_train(sgd, features_scaled, labels, 0.2)

In [None]:
# Sigmoid kernel with hand-picked C and gamma.
# The title now carries the hyper-parameters so this run's output can be told
# apart from the default sigmoid run, which printed the identical title.
model = SVC(C=160, kernel='sigmoid', random_state=0, gamma=0.03)
linear_classification_train(
    model, features_scaled, labels, 0.2, "SVM sigmoid kernel c=160 gamma=0.03"
)

In [None]:
# for c in range(10,201,10):
#     for g in range(1,10):
#         model = SVC(C=c,kernel='sigmoid',random_state=0,gamma = g/100)
#         linear_classification_train(model, features_scaled, labels, 0.2, "SVM sigmoid kernel c=" +str(c)+" gamma="+str(g/100))
#     for g in range(1,11):
#         model = SVC(C=c,kernel='sigmoid',random_state=0,gamma = g/10)
#         linear_classification_train(model, features_scaled, labels, 0.2, "SVM sigmoid kernel c=" +str(c)+" gamma="+str(g/10))

In [None]:
# One-vs-rest experiment: for each genre id, train a binary sigmoid-kernel SVM
# on all rows of that genre (label 1) plus a random subset of the other rows (label 0).
# The generator is seeded so the sampled negatives, and the accuracies, are reproducible.
rng = np.random.default_rng(0)
labels_flat = np.asarray(labels).reshape(-1)

for genre_id in range(10):
    is_target = labels_flat == genre_id
    # Keep each non-target row with probability 1/11, the same rate as the
    # earlier `random.randint(0, 10) == 1` draw (integers() excludes the upper bound).
    keep_negative = ~is_target & (rng.integers(0, 11, size=labels_flat.shape[0]) == 1)
    selected = is_target | keep_negative

    # Boolean-mask selection keeps the original row order without a per-row Python loop.
    fs1 = features_scaled.loc[selected]
    # 1-D 0/1 target; a (n, 1) column vector makes scikit-learn warn on fit.
    labels1 = is_target[selected].astype(int)
    print(fs1.shape, labels1.shape)

    model = SVC(C=130, kernel='sigmoid', random_state=0, gamma=0.04)
    linear_classification_train(
        model, fs1, labels1, 0.2, "SVM sigmoid kernel " + str(genre_id) + " vs rest"
    )

In [None]:
# RBF kernel at C=200, with gamma left at the SVC default.
model = SVC(kernel='rbf', C=200, random_state=0)
linear_classification_train(
    model, features_scaled, labels, test_split_size=0.2, title="SVM rbf kernel c=200 "
)

In [None]:
# RBF kernel at C=200: gamma swept over 0.01..0.09 and then 0.1..1.0.
gammas = [step / 100 for step in range(1, 10)] + [step / 10 for step in range(1, 11)]
for gamma in gammas:
    model = SVC(kernel='rbf', C=200, gamma=gamma, random_state=0)
    linear_classification_train(
        model, features_scaled, labels, 0.2, "SVM rbf kernel c=200 gamma={}".format(gamma)
    )

In [None]:
# RBF kernel at C=200: integer gamma values 1 through 10.
for gamma in range(1, 11):
    model = SVC(kernel='rbf', C=200, gamma=gamma, random_state=0)
    linear_classification_train(
        model, features_scaled, labels, 0.2, "SVM rbf kernel c=200 gamma={}".format(gamma)
    )

In [None]:
# Re-fit the single RBF configuration C=200, gamma=3 on its own.
model = SVC(kernel='rbf', C=200, gamma=3, random_state=0)
linear_classification_train(
    model, features_scaled, labels, test_split_size=0.2, title="SVM rbf kernel c=200 gamma=3"
)

In [None]:
# Show the hyper-parameters of the most recently fitted model (deep=True is the default).
model.get_params()