In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Shell out to `ls` and echo the names of the entries under ../input.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import datasets, svm, metrics
from sklearn.preprocessing import LabelEncoder
import itertools

In [None]:
# Read the year-prediction dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/year_prediction.csv')
# Preview the first five rows (the default for head()).
df.head(n=5)

In [None]:
# Number of rows per label value, ordered by label (the release year).
data = df["label"].value_counts().sort_index()

plt.plot(data.index, data.values)
plt.ylabel("Audio samples")
plt.xlabel("Release Year")
plt.show()

In [None]:
# Round every year down to the start of its decade (e.g. 1987 -> 1980).
# Vectorized column arithmetic gives the same values as a row-wise apply()
# without calling a Python lambda once per row.
df['label'] = df['label'] - df['label'] % 10
df['label'].value_counts()

In [None]:
# Keep only rows whose label is strictly greater than 1940.
# NOTE(review): if labels are decade starts at this point, `> 1940` also drops
# the 1940s themselves — confirm that is intended.
# .copy() detaches the result from the original frame so later column
# assignments do not raise SettingWithCopyWarning.
df = df[df.label > 1940].copy()
df.label.value_counts()

In [None]:
# Normalize: mean-centre every feature column and scale it by its value range.
# Column 0 is skipped (presumably the label column — verify against the CSV layout).
feature_cols = df.columns[1:]
features = df[feature_cols]
value_range = features.max() - features.min()
# A constant column has zero range; dividing by 1 instead leaves it at 0
# rather than turning it into NaN (0/0).
value_range[value_range == 0] = 1
# Work on an explicit copy so that assigning into a previously filtered frame
# cannot trigger SettingWithCopyWarning, then write every column back in one
# vectorized step instead of a Python loop over columns.
df = df.copy()
df[feature_cols] = (features - features.mean()) / value_range

In [None]:
# Balance the classes: draw the same number of rows from every label value,
# limited by the size of the rarest one.
min_samples = df.label.value_counts().min()
output_classes = df.label.unique()
# Collect the per-class samples and concatenate once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies it on
# every iteration. The fixed random_state makes the draw repeatable across
# "Restart & Run All".
sample_df = pd.concat(
    [df[df.label == c].sample(min_samples, random_state=42) for c in output_classes]
)
sample_df['label'] = sample_df['label'].astype(int)

In [None]:
# Pairwise correlations across the first 20 columns of the balanced sample.
corr = sample_df.iloc[:, :20].corr()
fig, ax = plt.subplots(figsize=(20, 20))
ax.set_title("Correlation")
sns.heatmap(corr, ax=ax)

In [None]:
# Mean of every column per label value, computed once and reused below
# (the original ran the same groupby twice).
decade_means = sample_df.groupby(['label']).mean()
columns = decade_means.columns
# Two-digit tick names built from the label values, e.g. 1950 -> "50's".
labels = ["{:02d}'s".format(l%100) for l in sorted(sample_df.label.unique())]
fig, ax = plt.subplots(figsize=(20, 5))
# Only the first 50 mean columns are drawn; one heatmap row per label value.
sns.heatmap(decade_means.iloc[:, 0:50], yticklabels=labels, ax=ax)

In [None]:
# Shuffle the rows of the balanced sample. A fixed random_state keeps the
# resulting order repeatable when the notebook is re-run from a fresh kernel.
sample_df = shuffle(sample_df, random_state=42)
sample_df.head()

In [None]:
# Hold out half of the balanced sample for evaluation.
# random_state makes the split repeatable; stratify keeps the per-class
# proportions the same in both halves, so every class appears in the test set.
df_train, df_test = train_test_split(
    sample_df,
    test_size=0.5,
    random_state=42,
    stratify=sample_df['label'],
)

In [None]:
# The first column is used as the target; all remaining columns are features.
y_train = df_train.iloc[:, 0].values
X_train = df_train.iloc[:, 1:].values

# RBF-kernel support vector classifier with hand-picked C and gamma.
C = 10
clf = svm.SVC(C=C, kernel='rbf', gamma=5.0)
(X_train.shape, y_train.shape)

In [None]:
# Train the classifier on the training half.
clf.fit(X=X_train, y=y_train)

In [None]:
# Split the held-out half into features (columns 1..) and target (column 0),
# mirroring how the training arrays were built.
tst = df_test
X_test = tst.iloc[:,1:].values
y_test = tst.iloc[:,0].values
expected = y_test
predicted = clf.predict(X_test)
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(expected, predicted)))
# Pin the matrix rows/columns to the full sorted class list of the sample.
# Without `labels`, confusion_matrix only includes classes that occur in
# expected/predicted, so a class missing from the test half would shrink the
# matrix and misalign it with tick labels derived from sample_df.
class_labels = sorted(sample_df.label.unique())
cnf_matrix = metrics.confusion_matrix(expected, predicted, labels=class_labels)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated image on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes on the plot.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting and the cell
        annotations use two decimals; otherwise cells are shown as integers.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the image.

    Returns
    -------
    None
        The plot is drawn with pyplot on the current axes; nothing is printed
        or returned.
    """
    cm = np.asarray(cm)
    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would give 0/0 = NaN; dividing by 1 instead
        # keeps that row at zero.
        row_sums[row_sums == 0] = 1
        cm = cm.astype('float') / row_sums

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the annotation stays
    # readable on the darker end of the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it accounts for the axis labels as well.
    plt.tight_layout()

# Show two decimals when numpy arrays are printed.
np.set_printoptions(precision=2)

# Two-digit tick names built from the sorted label values (e.g. 1950 -> "50's"),
# shared by both plots.
labels = sorted(sample_df.label.unique())
class_names = ["{:02d}'s".format(label%100) for label in labels]

# First figure: raw counts. Second figure: the same matrix with normalize=True.
for normalize, title in ((False, 'Confusion matrix'), (True, 'Normalized')):
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=normalize,
                          title=title)

plt.show()