In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from mlxtend.classifier import StackingClassifier
from IPython.display import display
import warnings

# Notebook-wide configuration.
# NOTE(review): this silences every warning category for the whole session,
# including convergence and deprecation warnings raised by the estimators below.
warnings.simplefilter("ignore")
# Single seed passed as random_state to the splits and estimators in this notebook.
SEED = 42

In [None]:
# Load the raw data set and discard the 'Unnamed: 0' column.
data = pd.read_csv("../input/spotifyclassification/data.csv")
data = data.drop(columns='Unnamed: 0')
# display() renders the frame itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" line to the output).
display(data.head())

In [None]:
# Quick structural overview of the raw frame.
# DataFrame.info() writes its report directly and returns None; display() also
# returns None - wrapping either in print() only emits an extra "None".
data.info()
print('Дубликатов:', data.duplicated().sum())
display(data.describe())

# For every text (object-dtype) column report how many distinct values it holds.
for column in data:
    if data[column].dtype == 'object':
        print('Признак:', column)
        print('Уникальные:', data[column].nunique())
        print()


In [None]:
# Remove exact duplicate rows, then rebuild a gap-free 0..n-1 index.
data = (
    data
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
def ranking(y, title='', xlabel='', ylabel=''):
    """Draw a horizontal bar chart of a Series, first entry at the top.

    Parameters
    ----------
    y : pandas.Series
        Values to plot; the index supplies the tick labels. The caller is
        expected to have sorted it in the desired display order.
    title, xlabel, ylabel : str
        Texts for the figure title and the two axis labels.

    Each bar is annotated with its value rounded to four decimals.
    """
    # Series assembled cell-by-cell can arrive with object dtype; coerce so the
    # colormap lookup and the rounding below operate on real floats.
    y = y.astype(float)

    # Positions run len(y)..1 so that the first entry ends up on top.
    y_pos = np.arange(len(y), 0, -1)
    plt.figure(figsize=(5, 5))
    cmap = ListedColormap(sns.color_palette("GnBu_d"))
    # Bar colours are looked up by passing the raw values to the colormap.
    bar_colors = cmap(y.values)
    plt.barh(y_pos, width=y, height=0.6, left=None, align='center', color=bar_colors, alpha=0.8)
    plt.yticks(y_pos, y.index)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    for p, ch in zip(y_pos, round(y, 4)):
        # Negative bars extend to the left; anchor their label at x = 0 instead.
        plt.annotate(str(ch), xy=(ch if ch > 0 else 0, p), va='center')

In [None]:
# Correlate numeric columns only: the frame also carries text columns, and
# DataFrame.corr() on recent pandas versions raises instead of silently
# skipping non-numeric data.
numeric_corr = data.select_dtypes(include='number').corr()
ranking(numeric_corr['target'].sort_values(ascending = False), 'Correlation of target with other features', 'Correlation', 'Features')

In [None]:
def range_col(data, parameter, begin, end, step):
    """Bin one column of ``data`` into labelled, half-open intervals.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    parameter : int
        Positional index of the column to bin.
    begin, end : number
        Lower (inclusive) and upper (exclusive) limit of the binned range.
    step : number
        Width of each bin.

    Returns
    -------
    list
        One entry per row, in row order:
        ``"<begin"`` for values below ``begin``, ``">=end"`` for values at or
        above ``end``, ``"lo-hi"`` for values inside a bin ``[lo, hi)``, and
        ``None`` for missing (NaN) values so the result always has exactly
        ``len(data)`` entries.
    """
    # Lower bin edges, computed once (rounded exactly as the labels are).
    edges = np.arange(begin, end, step).round(2)
    values = data.iloc[:, parameter].astype(float)

    below_label = "<" + str(round(begin, 2))
    above_label = ">=" + str(round(end, 2))

    rangecol = []
    for value in values:
        if value < begin:
            rangecol.append(below_label)
        elif value >= end:
            rangecol.append(above_label)
        elif value != value:
            # NaN compares false with everything; keep a placeholder so the
            # output stays aligned with the rows of ``data``.
            rangecol.append(None)
        else:
            # Locate the last edge <= value. Looking the bin up by its lower
            # edge (instead of testing value < r + step) avoids float drift in
            # r + step, which put edge values such as 0.3 into the bin below
            # and could leave tiny gaps that matched no bin at all.
            lower = edges[np.searchsorted(edges, value, side="right") - 1]
            upper = min((lower + step, end))
            rangecol.append(str(round(lower, 2)) + "-" + str(round(upper, 2)))

    return rangecol

In [None]:
# Binning specification for every continuous feature: (begin, end, step),
# passed unchanged to range_col.
BIN_SPECS = {
    'acousticness': (0, 1, 0.1),
    'danceability': (0, 1, 0.1),
    'duration_ms': (50000, 1000000, 50000),
    'energy': (0, 1, 0.1),
    'instrumentalness': (0, 1, 0.1),
    'liveness': (0, 1, 0.1),
    'loudness': (-34, 0, 2),
    'speechiness': (0, 1, 0.1),
    'tempo': (40, 220, 10),
    'valence': (0, 1, 0.1),
}

data_modified = data.copy()
for column, (begin, end, step) in BIN_SPECS.items():
    # range_col addresses columns by position; resolve the position from the
    # name so the right column is binned even if the column order changes.
    position = data_modified.columns.get_loc(column)
    data_modified[column] = range_col(data_modified, position, begin, end, step)

In [None]:
# display() already renders the frame and returns None; printing its result
# would only add a stray "None" line.
display(data_modified.head())

In [None]:
# One-hot encode every remaining categorical column (song_title is dropped
# first); drop_first removes one level per feature.
data_encoded = pd.get_dummies(data_modified.drop('song_title', axis = 1), dummy_na=False, drop_first=True)
# info() prints its own report and returns None, so it is called bare.
data_encoded.info()

In [None]:
# Separate the label column from the model inputs.
target = data_encoded['target']
features = data_encoded.drop(columns='target')

In [None]:
# Step 1: hold out 20% of all rows as the test set.
(features_train_val, features_test,
 target_train_val, target_test) = train_test_split(
    features, target, test_size=0.20, random_state=SEED)

# Step 2: carve a validation set (25% of the remainder) out of the rest.
(features_train, features_val,
 target_train, target_val) = train_test_split(
    features_train_val, target_train_val, test_size=0.25, random_state=SEED)

print(features_train.shape, features_val.shape, features_test.shape,
      target_train.shape, target_val.shape, target_test.shape)

In [None]:
# Share of each class in the full target column (fractions, not counts).
target.value_counts(normalize = True)

In [None]:
# Two base learners, both seeded.
clf1 = LogisticRegression(random_state=SEED)
clf2 = GradientBoostingClassifier(random_state=SEED)

# Meta-learner, and the stack that combines the two base learners through it
# using their predicted probabilities.
lr = LogisticRegression(random_state=SEED)
sclf = StackingClassifier(
    classifiers=[clf1, clf2],
    use_probas=True,
    meta_classifier=lr,
)

In [None]:
# Regularisation strengths shared by both logistic-regression sub-grids.
c_grid = list(np.arange(1, 10))

# Logistic regression: one sub-grid per penalty, each listing its own solvers.
param1 = (
    {
        'C': c_grid,
        'penalty': ['l1'],
        'solver': ['liblinear'],
        'multi_class': ['ovr'],
    },
    {
        'C': c_grid,
        'penalty': ['l2'],
        'solver': ['liblinear', 'newton-cg', 'lbfgs'],
        'multi_class': ['ovr'],
    },
)

# Gradient boosting search space.
param2 = {
    'learning_rate': (0.01, 0.05, 0.1, 0.5),
    'n_estimators': (50, 100, 200),
    'min_samples_split': (2, 3),
    'max_depth': (2, 3, 5),
}

# The stacking classifier is searched over an empty grid.
param_sclf = {}

In [None]:
# Use scikit-learn's built-in ROC AUC scorer. A plain make_scorer(roc_auc_score)
# feeds the metric hard class predictions from predict(), which collapses the
# ROC curve to a single operating point; the named scorer ranks by the
# estimator's continuous scores (decision_function / predict_proba) instead.
scorer = 'roc_auc'

In [None]:
# Tune each candidate with a grid search, record its scores, and plot its
# learning curve.
score_table = pd.DataFrame(columns = ['model', 'params', 'accuracy_score', 'roc_auc'])
candidates = zip([clf1, clf2, sclf],
                 ['Logistic Regression',
                  'Gradient Boosting',
                  'Stacking Classifier'],
                 [param1, param2, param_sclf])

for i, (clf, label, param) in enumerate(candidates):

    grid = model_selection.GridSearchCV(clf, param_grid=param, scoring=scorer)
    grid.fit(features_train, target_train)

    # The value reported and stored as "accuracy" must really be accuracy:
    # grid.score() returns the search's scoring metric, not accuracy.
    scores = accuracy_score(target_val, grid.predict(features_val))
    print(label)
    print(grid.best_params_)
    print("accuracy score = %0.4f"
          % (scores))

    # NOTE(review): ROC AUC here is measured on the test split while models are
    # still being compared - confirm this is intended rather than the
    # validation split.
    probabilities = grid.predict_proba(features_test)
    probabilities_ones = probabilities[:,1]
    auc_roc = roc_auc_score(target_test, probabilities_ones)
    print('roc_auc = {:0.4f}'.format(auc_roc))
    print()

    # Push the tuned hyper-parameters back into the base learners. sclf was
    # constructed from these same two objects, so presumably it picks the
    # tuned settings up when it is fitted afterwards - confirm.
    if clf is clf1 or clf is clf2:
        clf.set_params(**grid.best_params_)

    score_table.loc[i, 'model'] = label
    # Stored wrapped in a one-element list; a later cell unwraps it again.
    score_table.loc[i, 'params'] = [grid.best_params_]
    score_table.loc[i, 'accuracy_score'] = scores
    score_table.loc[i, 'roc_auc'] = auc_roc

    # Learning curve of the (now tuned) estimator on the training split.
    train_sizes, train_scores, test_scores, fit_times, _ = model_selection.learning_curve(clf, features_train, target_train, return_times=True, scoring=scorer)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve: mean score with a +/- one standard deviation band.
    _, axes = plt.subplots(1, 1, figsize=(6, 6))

    axes.set_title(label)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes.legend(loc="best")

In [None]:
# Show the per-model results collected during the model-selection loop.
score_table

In [None]:
# score_table was created empty and filled cell by cell, so its columns are
# presumably object dtype; cast the scores to float before they reach the
# colormap lookup and rounding inside ranking().
best_roc_auc = score_table.groupby(['model'])['roc_auc'].aggregate('max').astype(float)
ranking(best_roc_auc.sort_values(ascending = False), 'Рейтинг моделей', 'Метрика roc_auc', 'Модель')

In [None]:
# Rebuild both base learners from scratch with the hyper-parameters recorded in
# score_table, stack them, and fit the stack on train + validation data.
clf1 = LogisticRegression(random_state = SEED)
params = score_table[score_table['model']=='Logistic Regression']['params']
# NOTE(review): params[0] looks the row up by index label 0, and the trailing
# [0] unwraps the one-element list the best_params_ dict was stored in -
# confirm pandas kept the list as the cell value.
clf1.set_params(**params[0][0])

clf2 = GradientBoostingClassifier(random_state = SEED)
params = score_table[score_table['model']=='Gradient Boosting']['params']

# NOTE(review): index label 1 assumes Gradient Boosting was the second row
# written to score_table.
clf2.set_params(**params[1][0])

lr = LogisticRegression(random_state = SEED)
model = StackingClassifier(classifiers=[clf1, clf2], 
                          use_probas=True,
                          meta_classifier=lr)
model.fit(features_train_val, target_train_val)

In [None]:
# Accuracy from hard class predictions on the test split.
target_predict = model.predict(features_test)
accuracy = accuracy_score(target_test, target_predict)

# ROC AUC from the second probability column (index 1).
probabilities = model.predict_proba(features_test)
probabilities_ones = probabilities[:, 1]
auc_roc = roc_auc_score(target_test, probabilities_ones)

print('Best model:')
print("accuracy score = %0.4f" % (accuracy))
print('roc_auc = {:0.4f}'.format(auc_roc))
    

In [None]:
# Baseline for comparison with the tuned ensemble.
# Seeded like every other estimator in this notebook, so the baseline is
# reproducible whenever the dummy strategy in use draws random predictions.
model_dummy = DummyClassifier(random_state=SEED)
model_dummy.fit(features_train_val, target_train_val)
target_predict_d = model_dummy.predict(features_test)
accuracy_d = accuracy_score(target_test, target_predict_d)
print('Dummy classifier:')
print("accuracy score = %0.4f" % (accuracy_d))
probabilities_d = model_dummy.predict_proba(features_test)
probabilities_ones_d = probabilities_d[:,1]
auc_roc_d = roc_auc_score(target_test, probabilities_ones_d)
print('roc_auc = {:0.4f}'.format(auc_roc_d))

In [None]:
# ROC curves of the ensemble and of the dummy baseline on the test split.
fpr, tpr, thresholds = roc_curve(target_test, probabilities_ones)
fpr_d, tpr_d, thresholds_d = roc_curve(target_test, probabilities_ones_d)
plt.figure(figsize = (8,8))
plt.plot(fpr, tpr, color='darkorange', label='Ensemble ROC curve (area = %0.3f)' % auc_roc)

plt.plot(fpr_d, tpr_d, color='navy', linestyle='--', label = 'Dummy classifier ROC curve (area = %0.3f)' % auc_roc_d)

# Mark the first curve point whose false-positive rate rounds to each grid
# value. The grid values are FPR positions, not thresholds, so the annotation
# shows the decision threshold that actually produces the marked point.
for point in [0.10, 0.20, 0.30, 0.50, 0.70, 0.90]:
    for i, fp in enumerate(fpr):
        if round(fp, 1) == point:
            plt.plot(fpr[i], tpr[i], color='red', marker="o")
            plt.annotate("threshold = {:.2f}".format(thresholds[i]), xy=(fpr[i], tpr[i]), xytext=(fpr[i]+0.04, tpr[i]-0.05),  arrowprops={'width':0.3,'headwidth':7,'color':'#333333'})
            break  # only the first match per grid value is marked
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Pick the ROC point that maximises TPR - FPR and classify with its threshold.
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold = %0.2f" %optimal_threshold)
# roc_curve computes each (fpr, tpr) pair for predictions score >= threshold,
# so the comparison must be inclusive to reproduce the selected point; a
# strict ">" would misclassify the sample sitting exactly on the threshold.
preds = [1 if pr >= optimal_threshold else 0 for pr in probabilities_ones]
print("accuracy score = %0.4f" %(accuracy_score(target_test, preds)))

In [None]:
# Accuracy when a hand-picked cut-off is applied to the positive-class
# probabilities.
manual_threshold = 0.25
print('threshold =', manual_threshold)
preds = [
    1 if pr > manual_threshold else 0
    for pr in probabilities_ones
]
print("accuracy score = %0.4f" %(accuracy_score(target_test, preds)))