In [1]:
import pandas as pd

import os
import random
import datetime
from dateutil.parser import parse
import numpy as np
import itertools
import string
import time
import csv
import math
from operator import itemgetter

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score

from sklearn.metrics import mean_absolute_error, median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

from sklearn.svm import SVR
from sklearn.svm import SVC

from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import KNeighborsRegressor

# Control Variable

In [2]:
# Tab-separated Billboard dataset that is loaded into df_all.
CSV_File = 'billboard_w_song_facts_v2.csv'
# Reference year and quarter used as "now" by the time-difference features.
Year, Quarter = 2018, 3
# Fraction of rows held out whenever train_test_split is called.
test_size = 0.2

# New Features

In [3]:
def advanced_insert_feature(df, func = None, feature_name = None):
    """Append a column computed row by row with ``func(row, feature_name)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place; the new column is added last.
    func : callable
        Called once per row as ``func(row, feature_name)``.
    feature_name : str
        Name of the new column; it is also forwarded to ``func``.

    Raises
    ------
    ValueError
        If ``feature_name`` is empty or ``None``.
    """
    if not feature_name:
        # Raising a plain string is itself a TypeError in Python 3, so use a
        # real exception type that carries the message.
        raise ValueError('Empty feature')

    new_data = df.apply(lambda x: func(x, feature_name), axis = 1)
    df.insert(df.shape[1], feature_name, new_data)
    
def insert_feature(df, func = None, feature_name = None):
    """Append a column computed row by row with ``func(row)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place; the new column is added last.
    func : callable
        Called once per row (``DataFrame.apply`` with ``axis=1``).
    feature_name : str
        Name of the new column.

    Raises
    ------
    ValueError
        If ``feature_name`` is empty or ``None``.
    """
    if not feature_name:
        # Raising a plain string is itself a TypeError in Python 3, so use a
        # real exception type that carries the message.
        raise ValueError('Empty feature')

    new_data = df.apply(func, axis = 1)
    df.insert(df.shape[1], feature_name, new_data)

# Features for base model

In [4]:
# helper function
# Fraction of a year already elapsed at the start of each zero-based quarter.
adjusted_diff_time = {
    0: 0,
    1: 0.25,
    2: 0.5,
    3: 0.75,
}

def adjust_time_duration(month):
    """Return the year fraction assigned to the quarter containing ``month``.

    ``month`` is a 1-based month number; months 1-3 map to quarter index 0,
    4-6 to index 1, and so on. The fraction is looked up in
    ``adjusted_diff_time``.
    """
    quarter_index = (month - 1) // 3
    return adjusted_diff_time[quarter_index]

In [5]:
def difference_year(row_data, year = Year):
    """Return ``year`` minus the row's ``'year'`` value.

    ``year`` defaults to the notebook-level ``Year`` constant as it was when
    this function was defined.
    """
    chart_year = row_data['year']
    return year - chart_year

def adjusted_difference_year(row_data, year = Year):
    """Return the time since the row's chart week, in years with quarter resolution.

    Both points in time are expressed as ``year + quarter fraction``:
    the reference point uses ``year`` and the notebook-level ``Quarter``,
    the row uses its ``'year'`` value and the month of its ``'weekid'``.

    ``Quarter`` is a 1-based quarter number, not a month, so it is looked up
    directly in ``adjusted_diff_time`` rather than sent through
    ``adjust_time_duration`` (which would read quarter 3 as the month March
    and give an offset of 0 instead of 0.5).
    """
    reference_time = year + adjusted_diff_time[Quarter - 1]
    chart_time = row_data['year'] + adjust_time_duration(row_data['weekid'].month)
    return reference_time - chart_time

def log_rank(row_data):
    """Return the natural logarithm of the row's ``'rank'`` value."""
    rank_value = row_data['rank']
    return np.log(rank_value)

def log_reverse_rank(row_data):
    """Return the natural logarithm of ``101 - rank`` for the row."""
    reversed_rank = 101 - row_data['rank']
    return np.log(reversed_rank)

def log_weeks_on_chart(row_data):
    """Return the natural logarithm of the row's ``'weeks_on_chart'`` value."""
    weeks = row_data['weeks_on_chart']
    return np.log(weeks)

def log_jump_time(row_data):
    """Return log of the row's ``'jump_time'``, using 0 when the value is 0."""
    jumps = row_data['jump_time']
    # log(0) is undefined, so a zero count is mapped to 0 instead.
    return 0 if jumps == 0 else np.log(jumps)

def log_max_duration(row_data):
    """Return log of the row's ``'max_jump_duration'``, using 0 when the value is 0."""
    duration = row_data['max_jump_duration']
    # log(0) is undefined, so a zero duration is mapped to 0 instead.
    return 0 if duration == 0 else np.log(duration)
    
def log_popularity(row_data, rule):
    """Return log of the row's value in column ``rule``, using 0 when the value is 0."""
    score = row_data[rule]
    # log(0) is undefined, so a zero score is mapped to 0 instead.
    return 0 if score == 0 else np.log(score)

def insert_base_feature(df):
    """Add every base-model feature column to ``df`` in place.

    Columns are appended in the order listed below, each computed row by row
    through ``insert_feature``.
    """
    base_features = (
        ('diff_year', difference_year),
        ('adjusted_diff_year', adjusted_difference_year),
        ('log_rank', log_rank),
        ('log_reverse_rank', log_reverse_rank),
        ('log_weeks_on_chart', log_weeks_on_chart),
        ('log_jump_time', log_jump_time),
        ('log_max_duration', log_max_duration),
    )
    for column_name, builder in base_features:
        insert_feature(df, builder, column_name)

# Read Data

In [6]:
# Full dataset; weekid is converted to datetimes.
df_all = pd.read_csv(CSV_File, sep='\t')
df_all['weekid'] = pd.to_datetime(df_all['weekid'], infer_datetime_format = True)
# Train / test tables read from their own tab-separated files.
df_train, df_test = (pd.read_csv(path, sep='\t') for path in ('fixed_train.csv', 'fixed_test.csv'))

# Features for advanced model

In [7]:
def new_feature(row_data, feature_name = None):
    """Return 1 if the row's song appears in the table for ``feature_name``, else 0.

    Parameters
    ----------
    row_data : mapping or pandas.Series
        Row with a ``'song'`` entry.
    feature_name : str
        Key into the notebook-level ``df_features`` mapping; the selected
        table must have a ``'song'`` column.
        NOTE(review): ``df_features`` is not defined in the visible part of
        the notebook - confirm it exists before this function is called.

    Raises
    ------
    ValueError
        If ``feature_name`` is empty or ``None``.
    """
    if not feature_name:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ValueError('Empty feature')

    songid = row_data['song']

    # The lookup key is the ``feature_name`` argument; the previously used
    # name ``feature`` is not defined in this function.
    feature_table = df_features[feature_name]
    return 1 if (feature_table['song'] == songid).any() else 0

# Auxiliary Functions

In [8]:
def print_statistics_error_info(df, model_name, popularity, predicted_popularity):
    """Print mean absolute, median absolute and mean squared error for a model.

    ``popularity`` and ``predicted_popularity`` are the names of the columns in
    ``df`` holding the true and the predicted values; ``model_name`` only
    appears in the printed text.
    """
    actual = df[popularity]
    predicted = df[predicted_popularity]
    reports = (
        ('Mean Absolute Error of', mean_absolute_error),
        ('Median Absolute Error of', median_absolute_error),
        ('Mean Squared Error of', mean_squared_error),
    )
    for headline, metric in reports:
        print(headline, model_name, 'is', metric(actual, predicted))

In [9]:
def plot_error_by_rank(train = df_train, test = None, err_type = 'mean_absolute_error', compared_feature = ['strict_rule', 'predict'], model_name = ''):
    """Bar-plot a prediction error per rank for the train (and optionally test) frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``'rank'`` column and the two columns named in
        ``compared_feature``. ``test`` may be ``None`` to plot train only.
        Note that the ``train`` default is bound to ``df_train`` as it was
        when this function was defined.
    err_type : str
        One of ``'mean_squared_error'``, ``'mean_absolute_error'``,
        ``'median_absolute_error'``. Any other value prints a message and
        returns without plotting.
    compared_feature : sequence of two str
        ``[true_column, predicted_column]``. Only read, never mutated.
    model_name : str
        Shown in parentheses in the title.
    """
    err = {'mean_squared_error': 'Mean Squared Error',
           'mean_absolute_error': 'Mean Absolute Error',
           'median_absolute_error': 'Median Absolute Error'}
    func = {'mean_squared_error': mean_squared_error,
           'mean_absolute_error': mean_absolute_error,
           'median_absolute_error': median_absolute_error}

    if err_type not in err:
        print('Please enter valid error type')
        return

    actual_col, predicted_col = compared_feature[0], compared_feature[1]

    def _error_per_rank(frame):
        # One error value per rank that actually occurs in ``frame``; the
        # ranks are returned too so bars are placed at the right x position
        # even when some ranks are missing.
        ranks, errors = [], []
        for rank_value, group in frame.groupby('rank'):
            ranks.append(rank_value)
            errors.append(func[err_type](group[actual_col], group[predicted_col]))
        return np.asarray(ranks), errors

    train_ranks, train_rank_error = _error_per_rank(train)

    plt.figure(figsize=(20, 6), dpi=80)

    if test is None:
        plt.bar(train_ranks + 0.1, train_rank_error, color = 'green', width = 0.8, label='train')
    else:
        test_ranks, test_rank_error = _error_per_rank(test)
        plt.bar(train_ranks + 0.1, train_rank_error, color = 'green', width = 0.4, label='train')
        plt.bar(test_ranks + 0.5, test_rank_error, color = 'orange', width = 0.4, label='test')
    plt.xlabel("Peak Rank",fontsize=18)
    # Label the axis and title with the metric that was actually plotted.
    plt.ylabel(err[err_type],fontsize=18)
    plt.title(err[err_type] + " by Rank (" + model_name + ")",fontsize=18)
    plt.rc('xtick',labelsize=14)
    plt.rc('ytick',labelsize=14)
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.legend()
    plt.show()

def plot_mae_by_year(train = df_train, test = None, err_type = 'mean_absolute_error', compared_feature = ['strict_rule', 'predict'], model_name = ''):
    """Bar-plot a prediction error per year for the train (and optionally test) frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``'year'`` column and the two columns named in
        ``compared_feature``. ``test`` may be ``None`` to plot train only.
        Note that the ``train`` default is bound to ``df_train`` as it was
        when this function was defined.
    err_type : str
        One of ``'mean_squared_error'``, ``'mean_absolute_error'`` (default),
        ``'median_absolute_error'``. Any other value prints a message and
        returns without plotting, as ``plot_error_by_rank`` does.
    compared_feature : sequence of two str
        ``[true_column, predicted_column]``. Only read, never mutated.
    model_name : str
        Shown in parentheses in the title; ``'Base Model'`` is used when empty.
    """
    err = {'mean_squared_error': 'Mean Squared Error',
           'mean_absolute_error': 'Mean Absolute Error',
           'median_absolute_error': 'Median Absolute Error'}
    func = {'mean_squared_error': mean_squared_error,
           'mean_absolute_error': mean_absolute_error,
           'median_absolute_error': median_absolute_error}

    if err_type not in err:
        print('Please enter valid error type')
        return

    actual_col, predicted_col = compared_feature[0], compared_feature[1]

    def _error_per_year(frame):
        # Use the years present in the frame that was passed in (not the
        # global df_train), so x positions and bar heights always line up.
        years, errors = [], []
        for year_value, group in frame.groupby('year'):
            years.append(year_value)
            errors.append(func[err_type](group[actual_col], group[predicted_col]))
        return np.asarray(years), errors

    train_years, train_rank_error = _error_per_year(train)

    plt.figure(figsize=(20, 6), dpi=80)

    if test is None:
        plt.bar(train_years + 0.1, train_rank_error, color = 'green', width = 0.8, label='train')
    else:
        test_years, test_rank_error = _error_per_year(test)
        plt.bar(train_years + 0.1, train_rank_error, color = 'green', width = 0.4, label='train')
        plt.bar(test_years + 0.5, test_rank_error, color = 'orange', width = 0.4, label='test')
    plt.xlabel("Year",fontsize=18)
    plt.ylabel(err[err_type],fontsize=18)
    plt.title(err[err_type] + " by Year (" + (model_name or 'Base Model') + ")",fontsize=18)
    plt.rc('xtick',labelsize=14)
    plt.rc('ytick',labelsize=14)
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.legend()
    plt.show()

# Base Model Used in Progress Report

$popularity = rank \times \Delta t^{C_t}$

$\log(popularity) = \log(rank) +  C_{t}\log(\Delta t)$

In [10]:
# Natural log of diff_year, stored as a new column on both frames.
df_train['log_diff_year'] = np.log(df_train['diff_year'])
df_test['log_diff_year'] = np.log(df_test['diff_year'])

In [11]:
def model_one(train = df_train, test = df_test, x_features = None, y_feature = None, rank_type = 'log_reverse_rank'):
    """Fit ``log(y) - rank_term = C * x`` with linear regression and score it.

    The regression target is ``log(y_feature) - rank_type`` (with ``log(y)``
    taken as 0 where ``y == 0``). Predictions are mapped back to the original
    scale as ``exp(prediction + rank_type)`` and capped at 100.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``x_features``, ``y_feature`` and ``rank_type``.
        The defaults are bound to ``df_train`` / ``df_test`` as they were when
        this function was defined.
    x_features : str
        Name of the single regressor column (values are reshaped to one column).
    y_feature : str
        Name of the target column.
    rank_type : str
        Name of the column holding the log rank term.

    Returns
    -------
    dict
        ``'info'``: coefficient plus train/test metrics (the ``'Variance'``
        entries are ``r2_score`` values); ``'train'`` / ``'test'``: the input
        frames with their index reset and a ``predicted`` column added.
    """
    def _log_residual_target(frame):
        # log(y) minus the rank term; y == 0 contributes log(y) := 0.
        return frame.apply(lambda row: -row[rank_type] if row[y_feature] == 0 else np.log(row[y_feature]) - row[rank_type], axis = 1).values

    def _to_original_scale(log_residual_pred, frame):
        # Add the rank term back to obtain log(y), then leave log space before
        # applying the cap of 100. Without the exp the "prediction" would stay
        # on the log scale while being compared with raw y values below.
        log_pred = log_residual_pred + frame[rank_type].values
        return pd.Series(np.minimum(np.exp(log_pred), 100))

    def _scores(log_true, log_pred, true, pred):
        return {'Log-MSE': mean_squared_error(log_true, log_pred),
                'Log-MAE': mean_absolute_error(log_true, log_pred),
                'Log-Variance': r2_score(log_true, log_pred),
                'MSE': mean_squared_error(true, pred),
                'MAE': mean_absolute_error(true, pred),
                'Variance': r2_score(true, pred)}

    x_train = train[x_features].values
    y_train = train[y_feature].values
    log_y_train = _log_residual_target(train)
    x_test = test[x_features].values
    y_test = test[y_feature].values
    log_y_test = _log_residual_target(test)

    regr = LinearRegression()
    regr.fit(x_train.reshape(-1, 1), log_y_train)

    log_y_test_pred = regr.predict(x_test.reshape(-1, 1))
    log_y_train_pred = regr.predict(x_train.reshape(-1, 1))

    y_test_pred = _to_original_scale(log_y_test_pred, test)
    y_train_pred = _to_original_scale(log_y_train_pred, train)

    info = {'coef': regr.coef_,
            'test': _scores(log_y_test, log_y_test_pred, y_test, y_test_pred),
            'train': _scores(log_y_train, log_y_train_pred, y_train, y_train_pred)}

    return {'info':info,
            'train':train.reset_index().assign(predicted=y_train_pred),
            'test':test.reset_index().assign(predicted=y_test_pred)}

# Per-rank evaluation of model_one: for every rank 1..100 the model is fitted
# on the training rows of that rank (target > 0 only) and scored on the test
# rows of the same rank. Train/test MAE and MSE are collected in rank order in
# mae_error / mse_error, which the plotting code further down reads.
# NOTE(review): the loop variables y_feature, x_features and rank stay defined
# at notebook level after this cell; later cells read y_feature and rank
# without assigning them first - confirm that dependency is intended.
stats = {'strict_rule': {}, 'popularity': {}}
mae_error = {'train': [], 'test': []}
mse_error = {'train': [], 'test': []}
# Single pass; the outer loop runs exactly once.
for _ in range(1):
    # MSE is created but nothing is appended to it in this cell.
    MAE = {'strict_rule': [], 'popularity': []}
    MSE = {'strict_rule': [], 'popularity': []}
    for y_feature in ['strict_rule']:
        for x_features in ['log_diff_year']:
            for rank in range(1, 101):
                model_info = model_one(train = df_train.loc[(df_train[y_feature] > 0)& (df_train['rank'] == rank)] , \
                                test = df_test.loc[(df_test[y_feature] > 0) & (df_test['rank'] == rank)], \
                                x_features = x_features, \
                                y_feature = y_feature, \
                                rank_type = 'log_reverse_rank')
                info = model_info['info']
                # One summary row per rank: feature name, coefficient, then
                # train MAE / r2 and test MAE / r2.
                MAE[y_feature].append([x_features, \
                                   info['coef'], \
                                   info['train']['MAE'], \
                                   info['train']['Variance'], \
                                   info['test']['MAE'], \
                                   info['test']['Variance']])
                mae_error['train'].append(info['train']['MAE'])
                mse_error['train'].append(info['train']['MSE'])
                mae_error['test'].append(info['test']['MAE'])
                mse_error['test'].append(info['test']['MSE'])

# Fit model_one once on a fresh random split of the full dataset (all ranks together).
df_train1, df_test1 = train_test_split(df_all, test_size = test_size)
# Work on independent copies so adding a column does not trigger
# SettingWithCopyWarning on slices of df_all.
df_train1 = df_train1.copy()
df_test1 = df_test1.copy()
df_train1['log_diff_year'] = np.log(df_train1['diff_year'])
df_test1['log_diff_year'] = np.log(df_test1['diff_year'])
# Each frame is filtered with its OWN mask; a mask built from df_train1 cannot
# be aligned with df_test1 because the two frames hold disjoint rows.
model_one_info = model_one(train = df_train1.loc[df_train1['strict_rule'] > 0],
                           test = df_test1.loc[df_test1['strict_rule'] > 0],
                           x_features = 'log_diff_year',
                           y_feature = 'strict_rule',
                           rank_type = 'log_reverse_rank')
# NOTE(review): print_linear_regr_info is not defined in the visible part of
# this notebook - confirm it is available before running this cell.
print_linear_regr_info(model_one_info['info'])

# Train vs. test mean absolute error of the per-rank model_one fits.
plt.figure(figsize=(20, 6), dpi=80)
# mae_error holds one entry per rank 1..100 (in that order), so the bars are
# placed at the rank itself rather than at 0..99.
x = np.arange(1, 101)
plt.bar(x,mae_error['train'], color = 'orange', width = 0.25, label='train')
plt.bar(x + 0.4, mae_error['test'], color = 'blue', width = 0.25, label='test')
plt.xlabel("Peak Rank",fontsize=18)
plt.ylabel("Mean Absolute Error of Popularity",fontsize=18)
# The x axis is rank, not year.
plt.title("Mean Absolute Error by Rank",fontsize=18)
plt.rc('xtick',labelsize=14)
plt.rc('ytick',labelsize=14)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()
plt.show()

# Train vs. test mean squared error of the per-rank model_one fits.
plt.figure(figsize=(20, 6), dpi=80)
# mse_error holds one entry per rank 1..100 (in that order), so the bars are
# placed at the rank itself rather than at 0..99.
x = np.arange(1, 101)
plt.bar(x,mse_error['train'], color = 'y', width = 0.25, label='train')
plt.bar(x + 0.4, mse_error['test'], color = 'r', width = 0.25, label='test')
plt.xlabel("Peak Rank",fontsize=18)
# This figure shows squared error, and its x axis is rank, not year.
plt.ylabel("Mean Squared Error of Popularity",fontsize=18)
plt.title("Mean Squared Error by Rank",fontsize=18)
plt.rc('xtick',labelsize=14)
plt.rc('ytick',labelsize=14)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()
plt.show()


# Linear Regression Base Model
$
\text{popularity} = C_i\text{(reverse_rank)}^{C_r}(\text{weeks_on_chart})^{C_w}e^{C_t (\text{diff_year})}
$


$$
\begin{align*}
\log(\text{popularity}) =& \log C_i + C_r\log(\text{reverse_rank}) + C_w\log(\text{weeks_on_chart}) + {C_t (\text{diff_year})}
\end{align*}
$$

In [13]:
def model_linear_regression(train = df_train, test = df_test, x_features = None, y_feature = None, intercept = True):
    """Fit a linear regression on ``log(y_feature)`` and score train and test.

    The target is ``log(y)`` with 0 substituted where ``y == 0``; predictions
    are mapped back with ``exp`` before the raw-scale metrics are computed.

    Returns a dict with ``'info'`` (coefficients, intercept and train/test
    metrics, where the ``'Variance'`` entries are ``r2_score`` values),
    ``'regr'`` (the fitted estimator) and ``'train'`` / ``'test'`` (the input
    frames with their index reset and a ``predicted`` column added).
    """
    def _split(frame):
        # Feature matrix, raw target and log target for one frame.
        raw_target = frame[y_feature]
        log_target = raw_target.apply(lambda value: 0 if value == 0 else np.log(value))
        return frame[x_features].values, raw_target.values, log_target.values

    def _scores(log_true, log_pred, true, pred):
        return {'Log-MSE': mean_squared_error(log_true, log_pred),
                'Log-MAE': mean_absolute_error(log_true, log_pred),
                'Log-Variance': r2_score(log_true, log_pred),
                'MSE': mean_squared_error(true, pred),
                'MAE': mean_absolute_error(true, pred),
                'Variance': r2_score(true, pred)}

    x_train, y_train, log_y_train = _split(train)
    x_test, y_test, log_y_test = _split(test)

    regr = LinearRegression(fit_intercept=intercept)
    regr.fit(x_train, log_y_train)

    log_y_test_pred = regr.predict(x_test)
    log_y_train_pred = regr.predict(x_train)

    # Back from log space to the original target scale.
    y_test_pred = pd.Series(np.exp(log_y_test_pred))
    y_train_pred = pd.Series(np.exp(log_y_train_pred))

    info = {'coef': regr.coef_,
            'intercept': regr.intercept_,
            'test': _scores(log_y_test, log_y_test_pred, y_test, y_test_pred),
            'train': _scores(log_y_train, log_y_train_pred, y_train, y_train_pred)}

    return {'info': info,
            'regr': regr,
            'train': train.reset_index().assign(predicted=y_train_pred),
            'test': test.reset_index().assign(predicted=y_test_pred)}

In [None]:
# df_train1, df_test1 = train_test_split(df_all, test_size = 0.2)
# df_train_wo_jump = df_train.loc[df_train.jump_time == 0]
# df_test_wo_jump = df_test.loc[df_test.jump_time == 0]
# df_train_w_jump = df_train.loc[df_train.jump_time > 0]
# df_test_w_jump = df_test.loc[df_test.jump_time > 0]

# print(df_train_wo_jump.shape[0], df_test_wo_jump.shape[0], df_train_w_jump.shape[0], df_test_w_jump.shape[0])
# wo_jump_model_info = model_linear_regression(train=df_train_wo_jump.loc[df_train_wo_jump['popularity'] > 0], 
#                                              test=df_test_wo_jump.loc[df_test_wo_jump['popularity'] > 0], 
#                                              x_features=['log_reverse_rank', 'log_weeks_on_chart', 'diff_year'], 
#                                              y_feature='popularity',
#                                              intercept=True)
# print_linear_regr_info(wo_jump_model_info['info'])
# wo_jump_model_info = model_linear_regression(train=df_train_wo_jump.loc[df_train_wo_jump['popularity'] > 0], 
#                                              test=df_test_wo_jump.loc[df_test_wo_jump['popularity'] > 0], 
#                                              x_features=['log_reverse_rank', 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration','diff_year'], 
#                                              y_feature='popularity',
#                                              intercept=True)
# print_linear_regr_info(wo_jump_model_info['info'])
# print('\n\n')
# w_jump_model_info = model_linear_regression(train=df_train_w_jump.loc[df_train_w_jump['popularity'] > 0],
#                                             test=df_test_w_jump.loc[df_test_w_jump['popularity'] > 0],
#                                             x_features=['log_reverse_rank', 'log_weeks_on_chart', 'diff_year'],
#                                             y_feature='popularity',
#                                             intercept=True)
# print_linear_regr_info(w_jump_model_info['info'])
# w_jump_model_info = model_linear_regression(train=df_train_w_jump.loc[df_train_w_jump['popularity'] > 0],
#                                             test=df_test_w_jump.loc[df_test_w_jump['popularity'] > 0],
#                                             x_features=['log_reverse_rank', 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration','diff_year'], 
#                                             y_feature='popularity',
#                                             intercept=True)
# print_linear_regr_info(w_jump_model_info['info'])
# model_info = model_linear_regression(train=df_train.loc[df_train['popularity'] > 0],
#                                      test=df_test.loc[df_test['popularity'] > 0],
#                                      x_features=['log_reverse_rank', 'log_weeks_on_chart', 'diff_year'],
#                                      y_feature='popularity',
#                                      intercept=True)
# print_linear_regr_info(model_info['info'])

# Possible Features

In [14]:
# Candidate regressor sets (one list of column names per candidate).
possible_x_features = [
    ['log_reverse_rank', 'log_weeks_on_chart', 'diff_year'],
]
# Candidate target columns.
possible_y_features = [
    'strict_rule',
]

# $\text{Base Model - Linear Regression}$

In [18]:
def predict_value_by_linear_regression(row_data):
    """Predict one row's popularity with the fitted base model.

    Reads the columns listed in the notebook-level ``model_1_x_features`` from
    ``row_data``, predicts the log target with ``model_1_regr`` and returns
    ``exp(prediction)`` limited to the range 0..100 and rounded to a whole
    number, as a plain float.
    """
    x_val = row_data[model_1_x_features].values
    # predict() returns a length-1 array for a single sample; take the scalar
    # so the resulting column holds numbers rather than 1-element arrays.
    val = np.exp(model_1_regr.predict([x_val])[0])
    # exp() is never negative, so clipping covers both original bounds.
    return float(np.round(np.clip(val, 0, 100)))

# Fit the base linear-regression model on rows with a positive target.
model_1_x_features = possible_x_features[0]
model_1_y_feature = possible_y_features[0]
base_model_info = model_linear_regression(
    train = df_train.loc[df_train[model_1_y_feature] > 0],
    test = df_test.loc[df_test[model_1_y_feature] > 0],
    x_features = model_1_x_features,
    y_feature = model_1_y_feature,
    intercept = True,
)
model_1_regr = base_model_info['regr']

# Row-wise predictions for every row of both frames.
df_train['model_1_predicted'] = df_train.apply(predict_value_by_linear_regression, axis=1)
df_test['model_1_predicted'] = df_test.apply(predict_value_by_linear_regression, axis=1)

# Calculate the Mean Absolute Error, Median Absolute Error, Mean Squared Error
for split_label, frame in (('train', df_train), ('\n test', df_test)):
    print(split_label)
    print_statistics_error_info(frame.loc[frame[model_1_y_feature] > 0], 'linear regression base model', model_1_y_feature, 'model_1_predicted')

train
Mean Absolute Error of linear regression base model is 12.5059248215
Median Absolute Error of linear regression base model is 10.0
Mean Squared Error of linear regression base model is 255.211253237

 test
Mean Absolute Error of linear regression base model is 12.6325320012
Median Absolute Error of linear regression base model is 10.0
Mean Squared Error of linear regression base model is 261.324383391


# $\text{Base Model - Linear Regression}_r$
- Fit a separate linear regression model for each peak rank (1–100)
- Predict each song's popularity with the model fitted for its rank
 

In [19]:
def predict_value_by_rank(row_data):
    """Predict one row's popularity with the model fitted for the row's rank.

    Looks up the estimator for ``row_data['rank']`` in the notebook-level
    ``regr_coef_per_rank`` mapping, predicts the log target from the columns
    in ``model_1_x_features`` and returns ``exp(prediction)`` limited to the
    range 0..100 and rounded to a whole number, as a plain float.
    """
    rank = row_data['rank']
    x_val = row_data[model_1_x_features].values
    # predict() returns a length-1 array for a single sample; take the scalar
    # so the resulting column holds numbers rather than 1-element arrays.
    val = np.exp(regr_coef_per_rank[rank].predict([x_val])[0])
    # exp() is never negative, so clipping covers both original bounds.
    return float(np.round(np.clip(val, 0, 100)))

# One fitted estimator per rank, plus the per-rank train/test errors.
regr_coef_per_rank = {}
train_stats = {'MAE':[], 'MSE':[]}
test_stats = {'MAE':[], 'MSE':[]}

for rank in range(1, 101):
    model_info = model_linear_regression(train = df_train.loc[(df_train[model_1_y_feature] > 0) & (df_train['rank'] == rank)] , \
                           test = df_test.loc[(df_test[model_1_y_feature] > 0) & (df_test['rank'] == rank)], \
                           x_features = model_1_x_features, \
                           y_feature = model_1_y_feature, \
                           intercept = True)
    regr_coef_per_rank[rank] = model_info['regr']
    # Record the per-rank errors; the plotting cells further down draw
    # train_stats / test_stats against the 100 ranks and would otherwise
    # receive empty lists.
    for split_name, split_stats in (('train', train_stats), ('test', test_stats)):
        split_stats['MAE'].append(model_info['info'][split_name]['MAE'])
        split_stats['MSE'].append(model_info['info'][split_name]['MSE'])

df_train['model_1_r_predicted'] = df_train.apply(predict_value_by_rank, axis=1)
df_test['model_1_r_predicted'] = df_test.apply(predict_value_by_rank, axis=1)

# Calculate the Mean Absolute Error, Median Absolute Error, Mean Squared Error
print('train')
print_statistics_error_info(df_train.loc[df_train[model_1_y_feature] > 0], 'linear regression_r base model', model_1_y_feature, 'model_1_r_predicted')
print('\n test')
print_statistics_error_info(df_test.loc[df_test[model_1_y_feature] > 0], 'linear regression_r base model', model_1_y_feature, 'model_1_r_predicted')

train
Mean Absolute Error of linear regression_r base model is 10.9584870125
Median Absolute Error of linear regression_r base model is 9.0
Mean Squared Error of linear regression_r base model is 199.025190301

 test
Mean Absolute Error of linear regression_r base model is 11.274742429
Median Absolute Error of linear regression_r base model is 9.0
Mean Squared Error of linear regression_r base model is 210.023415548


# $\text{ML - Random Forest}$

In [None]:
def random_forest_machine_learning(df_train, df_test, x_features = None, y_feature = None, max_depth=2, random_state=0, n_estimators=100):
    """Fit a random forest regressor and store its predictions on both frames.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Both frames are modified in place: a ``pred_randomForest`` column is
        inserted as the last column (``DataFrame.insert`` raises if the
        column already exists).
    x_features : list of str
        Regressor columns, used for both fitting and predicting.
    y_feature : str
        Target column in ``df_train``.
    max_depth, random_state, n_estimators
        Forwarded to ``RandomForestRegressor``.
    """
    X = np.array(df_train[x_features].values)
    y = np.array(df_train[y_feature].values)
    # Honour the caller's random_state / n_estimators instead of fixed values.
    regr = RandomForestRegressor(max_depth=max_depth, random_state=random_state, n_estimators=n_estimators)
    regr.fit(X, y)
    # DebugMode is an optional notebook-level switch; treat it as off when it
    # has not been defined.
    if globals().get('DebugMode', False):
        print(regr.feature_importances_)

    # Predict from the same columns the model was fitted on.
    X_test = np.array(df_test[x_features].values)
    pred_test = regr.predict(X_test)
    # NOTE(review): exp() assumes y_feature holds a log-scale target - confirm
    # against the callers.
    df_test.insert(df_test.shape[1], column='pred_randomForest', value=np.exp(pred_test))

    pred_train = regr.predict(X)
    df_train.insert(df_train.shape[1], column='pred_randomForest', value=np.exp(pred_train))

## random forest with max_depth 2

## random forest with max_depth 3

In [None]:

# Per-rank error plots (median absolute, mean absolute, mean squared) for the
# rows with a positive target, comparing y_feature with the 'predict' column.
for err_type in ('median_absolute_error', 'mean_absolute_error', 'mean_squared_error'):
    plot_error_by_rank(df_train.loc[df_train[y_feature] > 0],
                       df_test.loc[df_test[y_feature] > 0],
                       err_type = err_type,
                       compared_feature = [y_feature, 'predict'],
                       model_name = '')
# plot_mae_by_year(df_test.loc[df_test['strict_rule'] > 0], df_train.loc[df_train['strict_rule'] > 0])

In [None]:
# Train vs. test MAE per rank, drawn through the Axes object of a new figure.
plt.figure(figsize=(20, 6), dpi=80)
ax = plt.gca()
x = np.arange(1, 101)
ax.bar(x, train_stats['MAE'], color = 'green', width = 0.4, label='train')
ax.bar(x + 0.4, test_stats['MAE'], color = 'orange', width = 0.4, label='test')
ax.set_xlabel("Peak Rank", fontsize=18)
ax.set_ylabel("Popularity Difference", fontsize=18)
ax.set_title("Mean Absolute Error by Rank (Base Model)", fontsize=18)
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.legend()
plt.show()

In [None]:
# MAE then MSE of the 'predict' column against 'strict_rule', train first, then test.
for frame in (df_train, df_test):
    for metric in (mean_absolute_error, mean_squared_error):
        print(metric(frame['strict_rule'], frame['predict']))

In [None]:
# Both figures share the same x positions: the peak ranks 1..100.
# Assumes train_stats / test_stats hold one entry per rank, in rank order.
x = np.arange(1, 101)

plt.figure(figsize=(20, 6), dpi=80)
plt.bar(x,train_stats['MAE'], color = 'green', width = 0.4, label='train')
plt.bar(x + 0.4, test_stats['MAE'], color = 'orange', width = 0.4, label='test')
plt.xlabel("Peak Rank",fontsize=18)
plt.ylabel("Popularity Difference",fontsize=18)
plt.title("Mean Absolute Error by Rank (Base Model)",fontsize=18)
plt.rc('xtick',labelsize=14)
plt.rc('ytick',labelsize=14)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()
plt.show()

plt.figure(figsize=(20, 6), dpi=80)
# Same rank positions as the MAE figure above (previously 0..99, which put
# every bar one rank too far left).
plt.bar(x,train_stats['MSE'], color = 'y', width = 0.4, label='train')
plt.bar(x + 0.4, test_stats['MSE'], color = 'r', width = 0.4, label='test')
plt.xlabel("Peak Rank",fontsize=18)
plt.ylabel("Popularity Difference",fontsize=18)
plt.title("Mean Square Error of Popularity by Rank (Base Model)",fontsize=18)
plt.rc('xtick',labelsize=14)
plt.rc('ytick',labelsize=14)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()
plt.show()

In [None]:
# Four candidate regressor sets: log_rank vs. log_reverse_rank, each combined
# with diff_year or adjusted_diff_year.
x_features = [['log_rank', 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration', 'diff_year'],
              ['log_reverse_rank', 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration', 'diff_year'],
              ['log_rank', 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration', 'adjusted_diff_year'],
              ['log_reverse_rank', 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration', 'adjusted_diff_year']]
y_features = ['strict_rule']

# df_train, df_test = train_test_split(df_all, test_size = test_size)
# model_info_stats[idx] collects the full model results per regressor set;
# stats[idx] collects the matching train/test MAE values.
model_info_stats = {}
stats = {}
for idx, x_feature in enumerate(x_features):
    for y_feature in y_features:
#         for rank in range(1, 100):
        # NOTE(review): with the rank loop above commented out, `rank` is
        # whatever value an earlier cell left behind, so only that one rank is
        # fitted - confirm this is intended.
        # NOTE(review): model_two is not defined in the visible part of this
        # notebook; it is called with the same keyword arguments as
        # model_linear_regression - confirm which function is meant.
        model_info = model_two(train = df_train.loc[(df_train[y_feature] > 0) & (df_train['rank'] == rank)] , \
                               test = df_test.loc[(df_test[y_feature] > 0) & (df_test['rank'] == rank)], \
                               x_features = x_feature, \
                               y_feature = y_feature, \
                               intercept = True)
        if idx not in stats:
            stats[idx] = {'train':[], 'test':[]}
        stats[idx]['train'].append(model_info['info']['train']['MAE'])
        stats[idx]['test'].append(model_info['info']['test']['MAE'])  

        if idx not in model_info_stats:
            model_info_stats[idx] = []
        model_info_stats[idx].append(model_info)  

In [None]:
# Per-rank fit (ranks 1..100) on the "without jump" subsets; train/test MAE
# and MSE are collected in rank order in mae_error / mse_error.
# NOTE(review): df_train_wo_jump / df_test_wo_jump are only assigned in a
# commented-out cell in the visible part of this notebook, and model_two is
# not defined there either - confirm both exist before running this cell.
stats = {'strict_rule': {}, 'popularity': {}}
mae_error = {'train': [], 'test': []}
mse_error = {'train': [], 'test': []}

    
# MAE / MSE are created here but not filled in this cell.
MAE = {'strict_rule': [], 'popularity': []}
MSE = {'strict_rule': [], 'popularity': []}
for y_feature in ['strict_rule']:
    for x_features in [['log_reverse_rank', 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration', 'diff_year']]:
        for rank in range(1, 101):
            model_info = model_two(train = df_train_wo_jump.loc[(df_train_wo_jump[y_feature] > 0)& (df_train_wo_jump['rank'] == rank)] , \
                            test = df_test_wo_jump.loc[(df_test_wo_jump[y_feature] > 0) & (df_test_wo_jump['rank'] == rank)], \
                            x_features = x_features, \
                            y_feature = y_feature, \
                            intercept = True)

            mae_error['train'].append(model_info['info']['train']['MAE'])
            mse_error['train'].append(model_info['info']['train']['MSE'])
            mae_error['test'].append(model_info['info']['test']['MAE'])
            mse_error['test'].append(model_info['info']['test']['MSE'])

# Advanced Model I
$
\text{popularity} = \text{rank}^{C_r}(\text{weeks_on_chart})^{C_w}(\text{jump_time})^{C_j} (\text{max_duration})^{C_m}e^{C_t \Delta t + C_i}
$


$$
\begin{align*}
\log(\text{popularity}) =& C_r\log(\text{rank}) + C_w\log(\text{weeks_on_chart}) \\&+ C_j\log(\text{jump_time}) + C_m\log(\text{max_duration}) + {C_t \Delta t} + C_i
\end{align*}
$$

In [None]:
# Advanced Model I fit; the results are APPENDED to the existing mae_error /
# mse_error lists rather than starting fresh ones.
# NOTE(review): the rank loop below is commented out, so `rank` is whatever
# value an earlier cell left behind and only that single rank is fitted -
# confirm this is intended.
# NOTE(review): model_two is not defined in the visible part of this notebook.
for y_feature in ['strict_rule']:
    for x_features in [['log_reverse_rank', 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration', 'diff_year']]:
#         for rank in range(1, 101):
            model_info = model_two(train = df_train.loc[(df_train[y_feature] > 0)& (df_train['rank'] == rank)] , \
                            test = df_test.loc[(df_test[y_feature] > 0) & (df_test['rank'] == rank)], \
                            x_features = x_features, \
                            y_feature = y_feature, \
                            intercept = True)

            mae_error['train'].append(model_info['info']['train']['MAE'])
            mse_error['train'].append(model_info['info']['train']['MSE'])
            mae_error['test'].append(model_info['info']['test']['MAE'])
            mse_error['test'].append(model_info['info']['test']['MSE'])

In [None]:
# Train vs. test mean absolute error collected in mae_error.
plt.figure(figsize=(20, 6), dpi=80)
# Entry i of mae_error is drawn at position i + 1 (rank numbering starts at 1);
# the length follows the data so the bar call cannot fail on a size mismatch.
# Assumes mae_error['train'] and mae_error['test'] have the same length.
x = np.arange(1, len(mae_error['train']) + 1)
plt.bar(x,mae_error['train'], color = 'green', width = 0.25, label='train')
plt.bar(x + 0.4, mae_error['test'], color = 'orange', width = 0.25, label='test')
plt.xlabel("Peak Rank",fontsize=18)
plt.ylabel("Popularity Difference",fontsize=18)
plt.title("Mean Absolute Error by Rank (Base Model)",fontsize=18)
plt.rc('xtick',labelsize=14)
plt.rc('ytick',labelsize=14)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()
plt.show()

In [None]:
# Train vs. test mean squared error collected in mse_error.
plt.figure(figsize=(20, 6), dpi=80)
# Entry i of mse_error is drawn at position i + 1 (rank numbering starts at 1);
# the length follows the data so the bar call cannot fail on a size mismatch.
# Assumes mse_error['train'] and mse_error['test'] have the same length.
x = np.arange(1, len(mse_error['train']) + 1)
plt.bar(x,mse_error['train'], color = 'y', width = 0.25, label='train')
plt.bar(x + 0.4, mse_error['test'], color = 'r', width = 0.25, label='test')
plt.xlabel("Peak Rank",fontsize=18)
plt.ylabel("Popularity Difference",fontsize=18)
plt.title("Mean Square Error of Popularity by Rank (Base Model)",fontsize=18)
plt.rc('xtick',labelsize=14)
plt.rc('ytick',labelsize=14)
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()
plt.show()

In [None]:
# Signed and absolute gap between the true target and the prediction.
model_info['train']['pop_difference'] = model_info['train']['strict_rule'] - model_info['train']['predicted']
model_info['train']['abs_pop_difference'] = model_info['train']['pop_difference'].abs()

In [None]:
# Sort in place by the signed difference (ascending), then take the ten
# smallest and the ten largest differences.
model_info['train'].sort_values(by='pop_difference', inplace=True)
underperform = model_info['train'].iloc[:10]
overperform = model_info['train'].iloc[-10:]

In [None]:
# mae_error is a dict of two lists; draw each list as its own line instead of
# passing the dict itself to plt.plot.
plt.plot(mae_error['train'], label='train')
plt.plot(mae_error['test'], label='test')
plt.legend()

In [None]:
model_info_stats[2][0]['train']['year']

In [None]:
rank = 13
# model_info_stats[idx] is a list of results, so a result must be selected
# with [0] before reading its 'train' frame. x and y now come from the same
# run, so every point pairs a year with that row's own value.
# NOTE(review): run index 2 is taken from the neighbouring cells, which read
# model_info_stats[2][0] - confirm it is the intended feature set.
plt.scatter(model_info_stats[2][0]['train']['year'], model_info_stats[2][0]['train']['strict_rule'],  color='blue')
plt.scatter(model_info_stats[2][0]['train']['year'], model_info_stats[2][0]['train']['predicted'],  color='red')
# plt.plot(model_info_stats[2][0]['train']['year'], model_info_stats[2][0]['train']['popularity'])

# Test Feature Combination

In [None]:
# Optional features; every subset of them (including the empty one) is
# combined with log_rank plus one of the two time-difference columns.
features = ['log_weeks_on_chart', 'log_jump_time', 'log_max_duration']

features_set_w_dy = [
    list(subset) + ['log_rank', 'diff_year']
    for size in range(len(features) + 1)
    for subset in itertools.combinations(features, size)
]
# Same subsets, with the trailing diff_year swapped for adjusted_diff_year.
features_set_w_ady = [
    combo[:-1] + ['adjusted_diff_year']
    for combo in features_set_w_dy
]

Experiment_Mode = False
if not Experiment_Mode:
    print(len(features_set_w_dy))
    for pair in zip(features_set_w_dy, features_set_w_ady):
        print(pair[0])
        print(pair[1], '\n')

In [None]:
# All candidate feature lists: the adjusted_diff_year variants first, then the diff_year variants.
possible_x_features = list(itertools.chain(features_set_w_ady, features_set_w_dy))

In [None]:
# Display the combined list of candidate feature lists.
possible_x_features

In [None]:
# log_rank, log_weeks_on_chart, log_jump_time, log_max_duration, diff_year, adjusted_diff_year
stats = {'strict_rule':{}, 'lenient_rule':{}, 'popularity':{}}
mae_error = {'train':[], 'test':[]}
for _ in range(20):
    df_train, df_test = train_test_split(df_all, test_size = test_size)
    
    MAE = {'strict_rule':[], 'lenient_rule':[], 'popularity':[]}
    y_features = ['strict_rule', 'lenient_rule', 'popularity']
    y_features = ['strict_rule']
    for y_feature in y_features:
#         print('y features:', y_feature)
#         print()
#         test_x_features = [['log_weeks_on_chart','log_jump_time','log_max_duration','log_rank','adjusted_diff_year']]
        for x_features in possible_x_features:
    #         print('    x features:', x_features)
            model_info = model_two(train = df_train.loc[(df_train[y_feature] > 0)] , \
                                           test = df_test.loc[(df_test[y_feature] > 0)], \
                                           x_features = x_features, \
                                           y_feature = y_feature, \
                                           intercept = True)
#             for rank in range(1, 101):
#                 model_info = model_two(train = df_train.loc[(df_train[y_feature] > 0) & (df_train['rank'] == rank)] , \
#                                            test = df_test.loc[(df_test[y_feature] > 0) & (df_test['rank'] == rank)], \
#                                            x_features = x_features, \
#                                            y_feature = y_feature, \
#                                            intercept = True)
            info = model_info['info']
            mae_error['train'].append(info['train']['MAE'])
            mae_error['test'].append(info['test']['MAE'])
            MAE[y_feature].append([x_features, \
                                   info['coef'], \
                                   info['intercept'], \
                                   info['train']['MAE'], \
                                   info['train']['MSE'], \
                                   info['train']['Variance'], \
                                   info['test']['MAE'], \
                                   info['test']['MSE'], \
                                   info['test']['Variance']])
            if x_features == ['log_rank', 'diff_year'] or \
               x_features == ['log_rank', 'adjusted_diff_year']:
                print('base model:', \
                      x_features, '\n'\
                      'coef:', info['coef'], info['intercept'], '\n', \
                      info['train']['MAE'], \
                      info['train']['MSE'], \
                      info['train']['Variance'], \
                      info['test']['MAE'], \
                      info['test']['MSE'], \
                      info['test']['Variance'], '\n')

        local_info = min(MAE[y_feature],key=itemgetter(3))
        x_features_str = '-'.join(local_info[0])
        if x_features_str in stats[y_feature]:
            stats[y_feature][x_features_str] += 1
        else:
            stats[y_feature][x_features_str] = 1
#         print('minimal error model:', \
#               local_info[0], '\n', \
#               'coef:', local_info[1:3], '\n', \
#               local_info[3:], '\n')


    #         print('\n')
    #     print('\n')
    # print(df_train['popularity'].values.shape)
    # print(df_train['strict_rule'].apply(lambda x: 0 if x == 0 else np.log(x)).values.shape)
    # list(df_train)


In [None]:
# MAE of every model fitted in the sweep, in fit order, for both splits.
# The lines are labelled so that train and test can be told apart in the figure.
plt.plot(mae_error['train'], label='train')
plt.plot(mae_error['test'], label='test')
plt.xlabel("Fitted model (sweep order)")
plt.ylabel("Mean absolute error")
plt.legend()
plt.show()

In [None]:
# Print, per target, how many times each feature combination was counted in `stats`.
for major_key, counts in stats.items():
    print(major_key)
    for minor_key, count in counts.items():
        print(minor_key, count)
    print('\n')

# Visualization

In [None]:
def plot_absolute_error(predict_train, predict_test):
    """Plot the mean absolute popularity error of the test predictions per year.

    For every distinct value of ``predict_test['year']`` the mean of
    ``|popularity - predictedPopularity|`` is computed. The average of those
    yearly means is printed, then the yearly means are drawn as a line chart
    and as a bar chart.

    The x axis is built from the years that actually occur in ``predict_test``,
    so a year without any test rows no longer causes a length mismatch between
    the x and y values.

    Args:
        predict_train: Not used; kept so existing calls keep working.
        predict_test: DataFrame with 'year', 'popularity' and
            'predictedPopularity' columns.

    Raises:
        ValueError: If ``predict_test`` yields no year groups (e.g. it is empty).
    """
    years = []
    error_year = []
    # groupby iterates over the year keys in sorted order.
    for year_value, group in predict_test.groupby('year'):
        abs_error = np.abs(np.array(group['popularity'] - group['predictedPopularity']))
        years.append(year_value)
        error_year.append(abs_error.sum() / abs_error.shape[0])

    if not error_year:
        raise ValueError('predict_test contains no rows to evaluate')

    print("the mean absolute error of different years is: ")
    print(sum(error_year) / len(error_year))

    # Line chart
    plt.figure(figsize=(20, 6), dpi=80)
    plt.plot(years, error_year)
    plt.xlabel("Year")
    plt.ylabel("Absolute Error")
    plt.title("for test case")
    plt.show()

    # Bar chart
    plt.figure(figsize=(20, 6), dpi=80)
    plt.bar(years, error_year)
    plt.xlabel("Year", fontsize=18)
    plt.ylabel("Absolute Error", fontsize=18)
    plt.title("Absolute Error of Every Year", fontsize=18)
    plt.rc('xtick', labelsize=14)
    plt.rc('ytick', labelsize=14)
    plt.show()

# Add New Features

In [None]:
def advanced_insert_feature(df, func = None, feature_name = None):
    """Append a column computed row by row, passing the column name to ``func``.

    Args:
        df: DataFrame that is modified in place.
        func: Callable invoked as ``func(row, feature_name)`` for every row.
        feature_name: Name of the column to append as the last column.

    Raises:
        ValueError: If ``feature_name`` is empty. ``DataFrame.insert`` also
            raises ValueError when the column already exists, so the call is
            not repeatable on the same frame.
        TypeError: If ``func`` is not callable.
    """
    if not feature_name:
        # A plain string cannot be raised in Python 3; use a real exception type.
        raise ValueError('Empty feature')
    if not callable(func):
        raise TypeError('func must be callable')

    new_data = df.apply(lambda x: func(x, feature_name), axis = 1)
    df.insert(df.shape[1], feature_name, new_data)
    
def insert_feature(df, func = None, feature_name = None):
    """Append a column computed row by row with ``func(row)``.

    Args:
        df: DataFrame that is modified in place.
        func: Callable invoked with each row (``axis=1``).
        feature_name: Name of the column to append as the last column.

    Raises:
        ValueError: If ``feature_name`` is empty. ``DataFrame.insert`` also
            raises ValueError when the column already exists.
        TypeError: If ``func`` is not callable.
    """
    if not feature_name:
        # A plain string cannot be raised in Python 3; use a real exception type.
        raise ValueError('Empty feature')
    if not callable(func):
        raise TypeError('func must be callable')

    new_data = df.apply(func, axis = 1)
    df.insert(df.shape[1], feature_name, new_data)

def new_feature(row_data, feature_name = None):
    """Return 1 if the row's song appears in the list named ``feature_name``, else 0.

    The lookup uses ``df_features[feature_name]``, i.e. the list selected by the
    argument, instead of whatever the global loop variable ``feature`` currently
    holds.

    Args:
        row_data: Row (or mapping) with a 'song' entry.
        feature_name: Key into the module-level ``df_features`` dict.

    Raises:
        ValueError: If ``feature_name`` is empty.
    """
    if not feature_name:
        # A plain string cannot be raised in Python 3; use a real exception type.
        raise ValueError('Empty feature')

    songid = row_data['song']
    # NOTE(review): the match is on the 'song' column only; the loader also builds
    # a 'songid' (song + artist) column — confirm whether that key was intended.
    listed_songs = df_features[feature_name]['song']
    return 1 if (listed_songs == songid).any() else 0

In [None]:
# Load the three song-usage lists into df_features, keyed by file name without extension.
# The earlier directory scan (every '*songs*csv*' file in the working directory) was
# removed: its result was discarded immediately by resetting df_features, so it only
# cost extra file reads.
df_features = {}
csv_files = ['songs-used-in-movies.csv',
             'songs-used-in-tv-shows.csv',
             'songs-used-in-commercials.csv']
for csv_file in csv_files:
    feature = csv_file.split('.')[0]
    df_features[feature] = pd.read_csv(csv_file)
    # 'songid' is the song title directly followed by the artist name.
    df_features[feature]['songid'] = df_features[feature][['song', 'artist']].apply(lambda x: ''.join(x), axis=1)

In [None]:
# Add one 0/1 column per loaded list to the train frame, then to the test frame.
for feature in df_features:
    for frame in (df_train, df_test):
        advanced_insert_feature(frame, new_feature, feature)

In [None]:
# create new fit feature
for feature in df_features.keys():
    new_feature = 'fit-' + feature
    df_train[new_feature] = df_train.apply(lambda row: np.log(row[feature] + 1), axis=1)
    df_test[new_feature] = df_test.apply(lambda row: np.log(row[feature] + 1), axis=1)

In [None]:
# 'new_feature' is 1 when at least one usage-flag column of the row is truthy, else 0.
usage_columns = list(df_features)
for frame in (df_train, df_test):
    frame['new_feature'] = frame[usage_columns].apply(lambda flags: int(any(flags)), axis=1)

In [None]:
# 'sum_feature' is log(sum of the usage flags) when any flag is set, otherwise 0.
# NOTE(review): a row with exactly one flag gives log(1) == 0, the same value as a
# row with no flag at all — confirm this is intended.
def _log_flag_total(flags):
    if any(flags):
        return np.log(sum(flags))
    return 0

usage_columns = list(df_features)
for frame in (df_train, df_test):
    frame['sum_feature'] = frame[usage_columns].apply(_log_flag_total, axis=1)

In [None]:
# Natural log of weeks_on_chart, stored as 'log_weeks_on_charts' in both frames.
for frame in (df_train, df_test):
    frame['log_weeks_on_charts'] = frame['weeks_on_chart'].apply(lambda weeks: np.log(weeks))

In [None]:
# log(x**2) for positive values and 0 otherwise, applied to both jump columns
# of the train frame and the test frame.
def _log_square_or_zero(value):
    if value > 0:
        return np.log(value**2)
    return 0

for source_column, target_column in (('jump_time', 'log_jump_time'),
                                     ('max_jump_duration', 'log_max_jump_duration')):
    for frame in (df_train, df_test):
        frame[target_column] = frame[source_column].apply(_log_square_or_zero)

In [None]:
# Display the column labels of df_test.
list(df_test)

In [None]:
# Display the keys of df_features (one per loaded song-usage list).
list(df_features.keys())

# Model with the fit-movies, fit-tv-shows and fit-commercials features

In [None]:
def evaluation_featured_model(train = df_train, test = df_test, x_features = None, y_feature = None, intercept = True, full_info = False):
    """Fit a linear regression on ``train`` and print train/test scores.

    Args:
        train: Training DataFrame. The default is the ``df_train`` object that
            existed when this function was defined, not the current global.
        test: Test DataFrame; the default is bound the same way as ``train``.
        x_features: List of predictor column names.
        y_feature: Name of the target column.
        intercept: Passed to ``LinearRegression(fit_intercept=...)``.
        full_info: When True, also print the coefficients, the intercept and
            the R² score of both splits.

    Returns:
        None. All results are printed.
    """
    x_train = train[x_features].values
    y_train = train[y_feature].values
    x_test = test[x_features].values
    y_test = test[y_feature].values

    # Only LinearRegression itself is imported at the top of the notebook, so use it directly.
    regr = LinearRegression(fit_intercept=intercept)
    regr.fit(x_train, y_train)
    ytest_pred = regr.predict(x_test)
    ytrain_pred = regr.predict(x_train)

    if full_info:
        print('Coefficients: \n', regr.coef_)
        print('Intercept when fit_intercept=True : {:.5f}'.format(regr.intercept_))

    print("Mean squared error for test case is: %.3f"% mean_squared_error(y_test, ytest_pred))
    if full_info:
        print('Variance score for test case is: %.3f' % r2_score(y_test, ytest_pred))
    print("Mean squared error for train case is: %.3f"% mean_squared_error(y_train, ytrain_pred))
    if full_info:
        # This score is computed on the training split, so label it as such.
        print('Variance score for train case is: %.3f' % r2_score(y_train, ytrain_pred))

# df_train_mid=df_train.assign(predictedlogP=pd.Series(ytrain_pred))
# df_test_mid=df_test.assign(predictedlogP=pd.Series(ytest_pred))
    
# predict_train=df_train_mid.assign(predictedPopularity=pd.Series( np.exp(df_train_mid['predictedlogP'])))
# predict_test=df_test_mid.assign(predictedPopularity=pd.Series( np.exp(df_test_mid['predictedlogP'])))

In [None]:
# Column names of the log-transformed usage flags, one per key of df_features.
fit_features = ['fit-' + list_name for list_name in df_features]
print(fit_features)

In [None]:
# Compare linear models of growing feature sets for each (rank, popularity) target pair.
start_year = 1958
end_year = 2005
for rank in ['logRank']:
    for pop in ['logPopularity', 'logNewPopularity']:
        print(rank, pop)

        # Train on every year from start_year to end_year (inclusive).
        # `isin([start_year, end_year])` kept only the two boundary years.
        train_window = df_train[df_train.year.between(start_year, end_year)]

        chart_features = [rank, 'Y_year', 'log_weeks_on_charts']
        jump_features = chart_features + ['log_jump_time']
        full_base = jump_features + ['log_max_jump_duration']
        # Same list the original cell left behind in `features`.
        features = full_base + fit_features

        # (printed label, predictor columns), evaluated in this order.
        model_specs = [
            ('linear regression - base model', [rank, 'Y_year']),
            ('\nlinear regression - base model 1', chart_features),
            ('\nlinear regression - base model 2', chart_features),
            ('\nlinear regression - base model 3', jump_features),
            ('\nlinear regression - base model 4', full_base),
            # Previously printed as a second "base model 4".
            ('\nlinear regression - base model 5', full_base + ['all_time_greatest_artist']),
            ('\nlinear regression - advanced model', features),
            ('\nlinear regression - all as one model', full_base + ['new_feature']),
            ('\nlinear regression - sum all feature model', full_base + ['sum_feature']),
        ]
        for label, model_features in model_specs:
            print(label)
            evaluation_featured_model(train=train_window,
                                      test=df_test,
                                      x_features=list(model_features),
                                      y_feature=pop)
        print('\n')

In [None]:
# Number of training rows flagged for each usage list, then for at least one of them.
flag_columns = ['songs-used-in-commercials', 'songs-used-in-movies', 'songs-used-in-tv-shows']
for flag_column in flag_columns:
    print(len(df_train[df_train[flag_column] == 1]))
flagged_anywhere = (df_train[flag_columns] == 1).any(axis=1)
print(len(df_train[flagged_anywhere]))

In [None]:
# Display the keys of df_features.
df_features.keys()

In [None]:
# Pairwise correlation of the usage-flag columns, shown as a colour-graded table.
usage_flags = df_train[list(df_features)]
corr = usage_flags.corr()
corr.style.background_gradient()

In [None]:
# Correlation of the advanced-model predictors, shown as a colour-graded table.
# `rank` is whatever the model-comparison loop left behind (its last value).
excluded_feature = 'fit-songs-for-wedding-anniversaries'
features = [rank, 'Y_year', 'log_weeks_on_charts', 'log_jump_time', 'log_max_jump_duration']
# Skip the excluded column by filtering: list.remove() raises ValueError when the
# name is not in fit_features, which is the case when only the three usage lists are loaded.
features.extend(name for name in fit_features if name != excluded_feature)
corr = df_train[features].corr()
corr.style.background_gradient()

# KNeighbors Regressor

In [None]:
# 3-nearest-neighbour regression of 'popularity' on the raw (untransformed) columns.
neigh = KNeighborsRegressor(n_neighbors=3)
# New random split: df_train / df_test are replaced by fresh slices of df_all.
df_train, df_test = train_test_split(df_all, test_size=test_size)

# Candidate log-feature lists; defined here but not used by the fit below.
x_features = [[rank_column, 'log_weeks_on_chart', 'log_jump_time', 'log_max_duration', year_column]
              for year_column in ('diff_year', 'adjusted_diff_year')
              for rank_column in ('log_rank', 'log_reverse_rank')]
y_features = ['strict_rule']

knn_inputs = ['rank', 'weeks_on_chart', 'jump_time', 'max_jump_duration', 'year']
knn_target = ['popularity']

neigh.fit(df_train[knn_inputs], df_train[knn_target])

train_predict_res = neigh.predict(df_train[knn_inputs])
test_predict_res = neigh.predict(df_test[knn_inputs])

# MSE and MAE: first line for the training split, second line for the test split.
for frame, predictions in ((df_train, train_predict_res), (df_test, test_predict_res)):
    print(mean_squared_error(frame[knn_target], predictions),
          mean_absolute_error(frame[knn_target], predictions))

# Random Forest Regressor

In [23]:
# Random forest of 10 trees, depth limited to 5, seeded for repeatable fits.
regr = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=10)
# Raw (untransformed) predictors; the target is the strict-rule value.
x_features = ['rank', 'weeks_on_chart', 'jump_time', 'max_jump_duration', 'year', 'all_time_greatest_artist']
y_feature = 'strict_rule'
# Fit on the current training split.
regr.fit(df_train[x_features], df_train[y_feature])

# The evaluation below is disabled: feature importances plus MSE/MAE on both splits.
# print(regr.feature_importances_)

# train_predict_res = regr.predict(df_train[x_features])
# test_predict_res = regr.predict(df_test[x_features])

# print(mean_squared_error(df_train[y_feature], train_predict_res),
#       mean_absolute_error(df_train[y_feature], train_predict_res))
# print(mean_squared_error(df_test[y_feature], test_predict_res),
#       mean_absolute_error(df_test[y_feature], test_predict_res))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

# Support Vector Regressor

In [None]:
# Support vector regression of the strict-rule target on the raw (unscaled) predictors.
clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
x_features = ['rank', 'weeks_on_chart', 'jump_time', 'max_jump_duration', 'year', 'all_time_greatest_artist']
y_feature = 'strict_rule'
clf.fit(df_train[x_features], df_train[y_feature])

train_predict_res = clf.predict(df_train[x_features])
test_predict_res = clf.predict(df_test[x_features])

# MSE and MAE: first line for the training split, second line for the test split.
for frame, predictions in ((df_train, train_predict_res), (df_test, test_predict_res)):
    print(mean_squared_error(frame[y_feature], predictions),
          mean_absolute_error(frame[y_feature], predictions))