In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import math

# datafile has no duplicates (not a multilabelling problem)
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable data directory.
datafile='/home/seherkhan/myfiles/coursework/usc/fall2018/MLforDI/project/FinalDraft/data/playlist_data_21oct_withoutdups.csv'
df=pd.read_csv(datafile,sep='|')

# Columns 4..17 (by position) form the feature matrix. Column 18 is the
# target; it is accessed by name below, so it has to be called 'playlist'.
tmp_X = df.iloc[:,4:18]
# .copy() gives an independent frame, so the two column assignments below do
# not write into a slice of df (which raises pandas' SettingWithCopyWarning).
tmp_y = df.iloc[:,18:19].copy()
tmp_y['playlist'] = tmp_y['playlist'].astype('category')
# integer code of each playlist category; this is what the models predict
tmp_y['playlist_codes']=tmp_y['playlist'].cat.codes

# Standardise the features with the default StandardScaler settings.
# NOTE(review): the scaler is fitted on every row, including the rows that
# are held out as the test split below, so test-set statistics leak into the
# scaling. Left unchanged because later cells use the full X directly.
scaler = StandardScaler()
X=scaler.fit_transform(tmp_X)
y=np.ravel(tmp_y['playlist_codes'])

# Hold out 10% of the rows as the final test set. The split is seeded so the
# same rows are held out on every run (10 is the seed the tuning cells use).
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.1, random_state=10)

In [None]:
# initial model: GradientBoostingClassifier with default hyper-parameters,
# scored by one shuffled 2-fold cross-validation over X, y.
kf = KFold(n_splits=2,shuffle=True)

# one entry per fold; averaged after the loop
fold_accuracy = []
fold_scores = []

for train_index,test_index in kf.split(X,y):
    X_train,X_test=X[train_index],X[test_index]
    y_train,y_test=y[train_index],y[test_index]

    # Fit and score inside the loop so that BOTH folds are evaluated. Doing
    # this after the loop would only ever see the last split.
    gb=GradientBoostingClassifier()
    gb.fit(X_train,y_train)
    pred = gb.predict(X_test)
    # labels=range(14) assumes the target codes are 0..13 -- TODO confirm
    score = precision_recall_fscore_support(y_test, pred, labels=range(14), average='micro')
    fold_accuracy.append(accuracy_score(y_test, pred, normalize=True))
    # keep (precision, recall, f-score); the fourth element (support) is unused
    fold_scores.append(score[:3])

accuracy = np.mean(fold_accuracy)
# `prediction` keeps its historical name but holds the mean precision
prediction, recall, f_score = np.mean(fold_scores, axis=0)

# Single-argument print(...) gives the same output under Python 2 and also
# parses under Python 3.
print('accuracy = %s' % accuracy)
print('precision = %s' % prediction)
print('recall = %s' % recall)
print('f_score = %s' % f_score)

In [None]:
# tune n_estimators
# Grid search over the number of boosting stages (20, 40, ..., 140) with the
# remaining hyper-parameters held fixed.
# NOTE(review): the search is fitted on the full X, y, which also contains
# the rows held out as X_testset -- confirm this is intended.
param_grid_nest = dict(n_estimators=range(20, 150, 20))
estimator = GradientBoostingClassifier(
    random_state=10,
    subsample=0.8,
    max_features='sqrt',
    learning_rate=0.1,
)
gs1 = GridSearchCV(estimator, param_grid_nest, n_jobs=4)
gs1.fit(X, y)
print(gs1.best_params_)
print(gs1.best_score_)

In [None]:
# tune max_depth and min_samples_split
# Joint grid search over tree depth (5, 7, 9) and the minimum number of
# samples needed to split a node (200, 250, ..., 450), with n_estimators
# fixed at 40.
# NOTE(review): the search is fitted on the full X, y, which also contains
# the rows held out as X_testset -- confirm this is intended.
param_grid_dp_sampsplit = dict(
    max_depth=range(5, 10, 2),
    min_samples_split=range(200, 500, 50),
)
estimator = GradientBoostingClassifier(
    random_state=10,
    subsample=0.8,
    max_features='sqrt',
    n_estimators=40,
    learning_rate=0.1,
)
gs2 = GridSearchCV(estimator, param_grid_dp_sampsplit, n_jobs=4)
gs2.fit(X, y)
print(gs2.best_params_)
print(gs2.best_score_)

In [None]:
# tune min_samples_leaf (searched jointly with max_depth)
# Grid: min_samples_leaf in (80, 120, 160) x max_depth in (5, 7, 9, 11), with
# n_estimators=40 and min_samples_split=350 held fixed.
# This rebinds gs2, so the result object of the previous search is replaced.
# NOTE(review): the search is fitted on the full X, y, which also contains
# the rows held out as X_testset -- confirm this is intended.
param_grid_sampleaf = dict(
    min_samples_leaf=range(80, 200, 40),
    max_depth=range(5, 13, 2),
)
estimator = GradientBoostingClassifier(
    random_state=10,
    subsample=0.8,
    max_features='sqrt',
    min_samples_split=350,
    n_estimators=40,
    learning_rate=0.1,
)
gs2 = GridSearchCV(estimator, param_grid_sampleaf, n_jobs=4)
gs2.fit(X, y)
print(gs2.best_params_)
print(gs2.best_score_)

In [None]:
# Final model
# Continue with 5x2CV: five repetitions of shuffled 2-fold cross-validation,
# run on the training split only (X_trainset / y_trainset).

min_samples_split = 350
min_samples_leaf = 100
max_depth = 7
max_features = 'sqrt'
subsample = 0.8

print('Gradient Boosting: performance on training data 5*2 CV')
# NOTE(review): n_estimators is left at its default here although the tuning
# cells fix it at 40 -- confirm this is intended.
# random_state makes the subsampled / feature-sampled fits repeatable.
gb=GradientBoostingClassifier(min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf,
                            max_depth = max_depth, max_features =max_features,subsample=subsample,
                            random_state=10)

# one entry per evaluated fold (5 repetitions x 2 folds)
acc=[]
pre=[]
re=[]
f_sc=[]

for n in range(5):
    # a different but fixed seed per repetition: the five shuffles differ
    # from each other yet are identical from run to run
    kf = KFold(n_splits=2,shuffle=True,random_state=n)

    for train_index,val_index in kf.split(X_trainset,y_trainset):
        # The indices produced by kf.split() are positions in X_trainset /
        # y_trainset, so they must be applied to those arrays. Applying them
        # to the full X / y selects unrelated rows, hold-out rows included.
        X_train,X_val=X_trainset[train_index],X_trainset[val_index]
        y_train,y_val=y_trainset[train_index],y_trainset[val_index]

        gb.fit(X_train, y_train)
        pred = gb.predict(X_val)
        # labels=range(14) assumes the target codes are 0..13 -- TODO confirm
        score = precision_recall_fscore_support(y_val, pred, labels=range(14), average='micro')
        acc.append(accuracy_score(y_val, pred, normalize=True))
        pre.append(score[0])
        re.append(score[1])
        f_sc.append(score[2])

# calculate average accuracy (and the other metrics) over all evaluated folds
accuracy = np.mean(acc)
# `prediction` keeps its historical name but holds the mean precision
prediction = np.mean(pre)
recall = np.mean(re)
f_score = np.mean(f_sc)

# Report mean +- 1.96 * std / sqrt(number of folds), i.e. a normal-approximation
# 95% interval. np.std is the population standard deviation (ddof=0).
n_evals = len(acc)
print('accuracy = %s +- %s' % (accuracy, 1.96*np.std(acc)/math.sqrt(n_evals)))
print('precision = %s +- %s' % (prediction, 1.96*np.std(pre)/math.sqrt(n_evals)))
print('recall = %s +- %s' % (recall, 1.96*np.std(re)/math.sqrt(n_evals)))
print('f_score = %s +- %s' % (f_score, 1.96*np.std(f_sc)/math.sqrt(n_evals)))

In [None]:
# Fit the final configuration once on the whole training split and score it
# on the held-out test split.
print('Gradient Boosting: performance on test data of model fitted to training data in one go')

min_samples_split = 350
min_samples_leaf = 100
max_depth = 7
max_features = 'sqrt'
subsample = 0.8

# random_state fixes the row subsampling (subsample=0.8) and the feature
# sampling (max_features='sqrt'), so the reported test scores and
# gb1.feature_importances_ are the same on every run.
# NOTE(review): n_estimators is left at its default here although the tuning
# cells fix it at 40 -- confirm this is intended.
gb1=GradientBoostingClassifier(min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf,
                            max_depth = max_depth, max_features =max_features,subsample=subsample,
                            random_state=10)
gb1.fit(X_trainset,y_trainset)
pred = gb1.predict(X_testset)
# labels=range(14) assumes the target codes are 0..13 -- TODO confirm
score_1 = precision_recall_fscore_support(y_testset, pred, labels=range(14), average='micro')
print('accuracy = %s' % accuracy_score(y_testset, pred, normalize=True))
# score_1 is (precision, recall, f-score, support); element 0 is the precision
print('precision = %s' % score_1[0])
print('recall = %s' % score_1[1])
print('f_score = %s' % score_1[2])

In [None]:
from matplotlib import pyplot
# Print the raw importances of gb1, then plot them as bars scaled so that
# the largest importance has height 1.
importances = gb1.feature_importances_
print(importances)

# Relative feature importance according to the model gb1 (fitted on training data)
rel_importance = importances / importances.max()

fig, ax = pyplot.subplots(figsize=(8, 4))
# one bar per feature column of tmp_X, labelled with the column name
ax.bar(tmp_X.keys(), rel_importance)
ax.set_ylabel('Relative Feature Importance')
pyplot.xticks(rotation='vertical')
pyplot.show()