In [1]:
import pandas as pd

import os
import random
import datetime
from dateutil.parser import parse
import numpy as np
import itertools
import string
import time
import csv
import math
from operator import itemgetter

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score

from sklearn import datasets
from sklearn.datasets import make_classification

from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import FeatureHasher
from sklearn.svm import SVC

from sklearn.neighbors import LocalOutlierFactor

from sklearn.neighbors.kde import KernelDensity
from sklearn.cluster import KMeans

# Control Variable

In [10]:
Experiment_Mode = True
Permutation_Test = Experiment_Mode
CSV_File = 'D:/CSE519/Project/billboard_w_new_features_v7.csv"'
Year = 2018
Quarter = 4
df_all = None
df_train = None
df_test = None
test_size = 0.3

# New Features

In [11]:
def advanced_insert_feature(df, func = None, feature_name = None):
    if not feature_name:
        raise ('Empty feature')

    new_data = df.apply(lambda x: func(x, feature_name), axis = 1)
    df.insert(df.shape[1], feature_name, new_data)
    
def insert_feature(df, func = None, feature_name = None):
    if not feature_name:
        raise ('Empty feature')

    new_data = df.apply(func, axis = 1)
    df.insert(df.shape[1], feature_name, new_data)

# Features for base model

In [12]:
# helper function
adjusted_diff_time = {0: 0, 1: 0.25, 2: 0.5, 3: 0.75}
def adjust_time_duration(month):
    quarter = (month - 1)//3
    return adjusted_diff_time[quarter]

In [13]:
def difference_year(row_data, year = Year):
    return year - row_data['year']

def adjusted_difference_year(row_data, year = Year):
    return year + adjust_time_duration(Quarter) - row_data['year'] - adjust_time_duration(row_data['weekid'].month)

def log_rank(row_data):
    return np.log(row_data['rank'])

def log_weeks_on_chart(row_data):
    return np.log(row_data['weeks_on_chart'])

def log_jump_time(row_data):
    if row_data['jump_time'] == 0:
        return 0
    else:
        return np.log(row_data['jump_time'])

def log_max_duration(row_data):
    if row_data['max_jump_duration'] == 0:
        return 0
    else:
        return np.log(row_data['max_jump_duration'])
    
def log_popularity(row_data, rule):
    if row_data[rule] == 0:
        return 0
    else:
        return np.log(row_data[rule])

def insert_base_feature(df):
    insert_feature(df, difference_year, 'diff_year')
    insert_feature(df, adjusted_difference_year, 'adjusted_diff_year')
    insert_feature(df, log_rank, 'log_rank')
    insert_feature(df, log_weeks_on_chart, 'log_weeks_on_chart')
    insert_feature(df, log_jump_time, 'log_jump_time')
    insert_feature(df, log_max_duration, 'log_max_duration')

# Read Data

In [14]:
if Experiment_Mode:
    df_billboard_100 = pd.read_csv(CSV_File)
    df_billboard_100['weekid'] = pd.to_datetime(df_billboard_100.weekid, infer_datetime_format = True)
    df_billboard_100['lenient_rule'] = df_billboard_100.apply(lambda row: row.strict_rule if row.lenient_rule == 0 else row.lenient_rule, axis=1)
    df_billboard_100['popularity'] = df_billboard_100.apply(lambda row: max(row.strict_rule, row.lenient_rule), axis=1)
    peak_zero= df_billboard_100.loc[df_billboard_100['peak_position']==0]
    peak=peak_zero['week_position'].min()
    rows_to_change = df_billboard_100['peak_position'] == 0
    df_billboard_100.loc[rows_to_change, 'peak_position'] = peak
    df_billboard_100.rename(columns={"peak_position":"rank"},inplace=True) # rename the column name 
    df_billboard_100 = df_billboard_100.sort_values("weekid", ascending=True)
    df_billboard_100 = df_billboard_100.drop_duplicates(["songid"], keep='first')
    df_all = df_billboard_100.loc[df_billboard_100['popularity']>0]
    insert_base_feature(df_all)

    df_train, df_test = train_test_split(df_all, test_size = test_size)
else:
    df_train = pd.read_csv("week_update_TrainV7.csv")
    df_test = pd.read_csv("week_update_TestV7.csv")


FileNotFoundError: File b'D:/CSE519/Project/billboard_w_new_features_v7.csv"' does not exist

In [7]:
df_all.head()

Unnamed: 0,url,weekid,week_position,song,performer,songid,instance,previous_week_position,rank,weeks_on_chart,...,strict_rule,lenient_rule,all_time_greatest_artist,popularity,diff_year,adjusted_diff_year,log_rank,log_weeks_on_chart,log_jump_time,log_max_duration
123256,http://www.billboard.com/charts/hot-100/1958-0...,1958-08-02,14,If Dreams Came True,Pat Boone,If Dreams Came TruePat Boone,1,,12,10,...,0,22,0,22,60,59.75,2.484907,2.302585,0.0,0.0
155393,http://www.billboard.com/charts/hot-100/1958-0...,1958-08-02,18,Little Star,The Elegants,Little StarThe Elegants,1,,1,17,...,0,35,0,35,60,59.75,0.0,2.833213,0.0,0.0
109532,http://www.billboard.com/charts/hot-100/1958-0...,1958-08-02,97,I Believe In You,Robert & Johnny,I Believe In YouRobert & Johnny,1,,93,2,...,0,4,0,4,60,59.75,4.532599,0.693147,0.0,0.0
79442,http://www.billboard.com/charts/hot-100/1958-0...,1958-08-02,15,For Your Precious Love,Jerry Butler and The Impressions,For Your Precious LoveJerry Butler and The Imp...,1,,15,5,...,0,42,0,42,60,59.75,2.70805,1.609438,0.0,0.0
79380,http://www.billboard.com/charts/hot-100/1958-0...,1958-08-02,62,For Your Love,Ed Townsend,For Your LoveEd Townsend,1,,62,1,...,0,20,0,20,60,59.75,4.127134,0.0,0.0,0.0


# Features for advanced model

In [8]:
def new_feature(row_data, feature_name = None):
    if not feature_name:
        raise ('Empty feature')

    songid = row_data['song']
    
    if df_features[feature].loc[df_features[feature]['song'] == songid].shape[0] > 0:
        return 1
    else:
        return 0

# Base Model I
$popularity = rank C_t^{\Delta t}$

$\log(popularity) = \log(rank) + \Delta t \log(C_{t})$

In [30]:
def evaluate_bm1(train = df_train, test = df_test, x_features = None, y_feature = None, intercept = True, full_info = False):
    precision = ':.4f'
    x_train = train[x_features].values
    y_train = train[y_feature].values
    log_y_train = train.apply(lambda row: -row['log_rank'] if row[y_feature] == 0 else np.log(row[y_feature]) - row['log_rank'], axis = 1).values
    x_test = test[x_features].values
    y_test = test[y_feature].values
    log_y_test = test.apply(lambda row: -row['log_rank'] if row[y_feature] == 0 else np.log(row[y_feature]) - row['log_rank'], axis = 1).values
    
    regr = linear_model.LinearRegression()  
    print(type(x_train))
    regr.fit(x_train.reshape(-1,1), log_y_train)
    
    log_y_test_pred = regr.predict(x_test.reshape(-1,1))
    log_y_train_pred= regr.predict(x_train.reshape(-1,1))

#     y_test_pred = pd.Series(log_y_test_pred).add(test['log_rank']).apply(lambda x: np.exp(x))
#     y_train_pred = pd.Series(log_y_train_pred).add(train['log_rank']).apply(lambda x: np.exp(x))
    
    info = {'coef': regr.coef_, \
            'intercept': regr.intercept_, \
            'test': {'Log-MSE': mean_squared_error(log_y_test, log_y_test_pred), \
                     'Log-MAE': mean_absolute_error(log_y_test, log_y_test_pred), \
                     'Log-Variance': r2_score(log_y_test, log_y_test_pred)}, \
#                      'MSE': mean_squared_error(y_test, y_test_pred), \
#                      'MAE': mean_absolute_error(y_test, y_test_pred), \
#                      'Variance': r2_score(y_test, y_test_pred)},\
            'train': {'Log-MSE': mean_squared_error(log_y_train, log_y_train_pred), \
                     'Log-MAE': mean_absolute_error(log_y_train, log_y_train_pred), \
                     'Log-Variance': r2_score(log_y_train, log_y_train_pred)}}
#                      'MSE': mean_squared_error(y_train, y_train_pred), \
#                      'MAE': mean_absolute_error(y_train, y_train_pred), \
#                      'Variance': r2_score(y_train, y_train_pred)}}

    if full_info:
        print('Coefficients: \n', regr.coef_, end=' ')

        if intercept:
            print('Intercept: {:.5f}'.format(regr.intercept_))
        else:
            print('\n')
        print_str = '(Log MSE:{' + precision + '}, MSE:{' + precision + '},' + \
                    'Log MAE:{' + precision + '}, MAE:{' + precision + '},' + \
                    'Log Variance:{' + precision + '},' + 'Variance:{' + precision + '})' 
        print("        train case:", end = ' ')
        print(print_str.format(info['train']['Log-MSE'],\
                               info['train']['MSE'],\
                               info['train']['Log-MAE'],\
                               info['train']['MAE'],\
                               info['train']['Log-Variance'],\
                               info['train']['Variance']))
        print("        test case:", end = ' ')
        print(print_str.format(info['test']['Log-MSE'],\
                               info['test']['MSE'],\
                               info['test']['Log-MAE'],\
                               info['test']['MAE'],\
                               info['test']['Log-Variance'],\
                               info['test']['Variance']))
    
    return info

In [33]:
# log_rank, log_weeks_on_chart, log_jump_time, log_max_duration, diff_year, adjusted_diff_year
stats = {'strict_rule':{}, 'lenient_rule':{}, 'popularity':{}}
for _ in range(3):
    df_train, df_test = train_test_split(df_all, test_size = test_size)
    
    MAE = {'strict_rule':[], 'lenient_rule':[], 'popularity':[]}
    for y_feature in ['strict_rule', 'lenient_rule', 'popularity']:
        for x_features in ['diff_year', 'adjusted_diff_year']:
            info = evaluate_bm1(train = df_train.loc[df_train[y_feature] > 0] , \
                                test = df_test.loc[df_test[y_feature] > 0], \
                                x_features = x_features, \
                                y_feature = y_feature, \
                                intercept = True, \
                                full_info = False)
            print(info)
#             MAE[y_feature].append([x_features, \
#                                    info['coef'], \
#                                    info['intercept'], \
#                                    info['train']['MAE'], \
#                                    info['train']['Variance'], \
#                                    info['test']['MAE'], \
#                                    info['test']['Variance']])

#         local_info = min(MAE[y_feature],key=itemgetter(3))
#         print(local_info[3:])
#         x_features_str = ''.join(local_info[0])
#         if x_features_str in stats[y_feature]:
#             stats[y_feature][x_features_str] += 1
#         else:
#             stats[y_feature][x_features_str] = 1

<class 'numpy.ndarray'>
{'coef': array([-0.01193829]), 'intercept': 0.58729359848843621, 'test': {'Log-MSE': 2.5475677115870932, 'Log-MAE': 1.2466241439843273, 'Log-Variance': 0.019274776425998397}, 'train': {'Log-MSE': 2.5985401026471138, 'Log-MAE': 1.2596212406818135, 'Log-Variance': 0.014902368895032936}}
<class 'numpy.ndarray'>
{'coef': array([-0.0119407]), 'intercept': 0.58584758613833277, 'test': {'Log-MSE': 2.5478237583819947, 'Log-MAE': 1.2466955716987134, 'Log-Variance': 0.019176207289235858}, 'train': {'Log-MSE': 2.5985080860753413, 'Log-MAE': 1.2596236254744053, 'Log-Variance': 0.014914506267466376}}
<class 'numpy.ndarray'>
{'coef': array([-0.02012452]), 'intercept': 0.52640003956237136, 'test': {'Log-MSE': 2.8107869501720719, 'Log-MAE': 1.3014013671886207, 'Log-Variance': 0.048109100871394972}, 'train': {'Log-MSE': 2.814322245970923, 'Log-MAE': 1.3055387993130478, 'Log-Variance': 0.04371307643149891}}
<class 'numpy.ndarray'>
{'coef': array([-0.02014659]), 'intercept': 0.524

# Base Model II
$
\text{popularity} = \text{rank}^{C_r}(\text{weeks_on_chart})^{C_w}(\text{jump_time})^{C_j} (\text{max_duration})^{C_m}e^{C_t \Delta t}
$


$$
\begin{align*}
\log(\text{popularity}) =& C_r\log(\text{rank}) + C_w\log(\text{weeks_on_chart}) \\&+ C_j\log(\text{jump_time}) + C_m\log(\text{max_duration}) + {C_t \Delta t}
\end{align*}
$$

In [None]:
def evaluate_base_model(train = df_train, test = df_test, x_features = None, y_feature = None, intercept = True, full_info = False):
    precision = ':.4f'
    x_train = train[x_features].values
    y_train = train[y_feature].values
    log_y_train = train[y_feature].apply(lambda x: 0 if x == 0 else np.log(x)).values
    x_test = test[x_features].values
    y_test = test[y_feature].values
    log_y_test = test[y_feature].apply(lambda x: 0 if x == 0 else np.log(x)).values
    
    regr = linear_model.LinearRegression(fit_intercept=intercept)  
    
    regr.fit(x_train, log_y_train)
    
    log_y_test_pred = regr.predict(x_test)
    log_y_train_pred= regr.predict(x_train)

    y_test_pred = pd.Series(log_y_test_pred).apply(lambda x: np.exp(x))
    y_train_pred = pd.Series(log_y_train_pred).apply(lambda x: np.exp(x))
    
    info = {'coef': regr.coef_, \
            'intercept': regr.intercept_, \
            'test': {'Log-MSE': mean_squared_error(log_y_test, log_y_test_pred), \
                     'Log-MAE': mean_absolute_error(log_y_test, log_y_test_pred), \
                     'Log-Variance': r2_score(log_y_test, log_y_test_pred), \
                     'MSE': mean_squared_error(y_test, y_test_pred), \
                     'MAE': mean_absolute_error(y_test, y_test_pred), \
                     'Variance': r2_score(y_test, y_test_pred)},\
            'train': {'Log-MSE': mean_squared_error(log_y_train, log_y_train_pred), \
                     'Log-MAE': mean_absolute_error(log_y_train, log_y_train_pred), \
                     'Log-Variance': r2_score(log_y_train, log_y_train_pred), \
                     'MSE': mean_squared_error(y_train, y_train_pred), \
                     'MAE': mean_absolute_error(y_train, y_train_pred), \
                     'Variance': r2_score(y_train, y_train_pred)}}

    if full_info:
        print('Coefficients: \n', regr.coef_, end=' ')

        if intercept:
            print('Intercept: {:.5f}'.format(regr.intercept_))
        else:
            print('\n')
        print_str = '(Log MSE:{' + precision + '}, MSE:{' + precision + '},' + \
                    'Log MAE:{' + precision + '}, MAE:{' + precision + '},' + \
                    'Log Variance:{' + precision + '},' + 'Variance:{' + precision + '})' 
        print("        train case:", end = ' ')
        print(print_str.format(info['train']['Log-MSE'],\
                               info['train']['MSE'],\
                               info['train']['Log-MAE'],\
                               info['train']['MAE'],\
                               info['train']['Log-Variance'],\
                               info['train']['Variance']))
        print("        test case:", end = ' ')
        print(print_str.format(info['test']['Log-MSE'],\
                               info['test']['MSE'],\
                               info['test']['Log-MAE'],\
                               info['test']['MAE'],\
                               info['test']['Log-Variance'],\
                               info['test']['Variance']))
    
    return info

In [None]:
mean_absolute_error(df_all['rank'].apply(lambda x: np.exp(x)), df_all['rank'])
mean_absolute_error(df_all['rank'].apply(lambda x: np.exp(x)).apply(lambda x:np.log(x)),df_all['rank'])

# Test Feature Combination

In [None]:
features = ['log_weeks_on_chart', 'log_jump_time', 'log_max_duration']
features_set_w_dy = []
features_set_w_ady = []

for idx in range(len(features) + 1):
    set_combinations = itertools.combinations(features, idx)
    for subset in set_combinations:
        features_set_w_dy.append(list(subset))
        features_set_w_ady.append(list(subset))
        features_set_w_dy[-1].extend(['log_rank', 'diff_year'])
        features_set_w_ady[-1].extend(['log_rank', 'adjusted_diff_year'])

if not Experiment_Mode:
    print(len(features_set_w_dy))
    for features_opt0, features_opt1 in zip (features_set_w_dy, features_set_w_ady):
        print(features_opt0)
        print(features_opt1, '\n')

In [None]:
possible_x_features = features_set_w_ady + features_set_w_dy

In [None]:
possible_x_features

In [None]:
# log_rank, log_weeks_on_chart, log_jump_time, log_max_duration, diff_year, adjusted_diff_year
stats = {'strict_rule':{}, 'lenient_rule':{}, 'popularity':{}}
for _ in range(3):
    df_train, df_test = train_test_split(df_all, test_size = test_size)
    
    MAE = {'strict_rule':[], 'lenient_rule':[], 'popularity':[]}
    for y_feature in ['strict_rule', 'lenient_rule', 'popularity']:
#         print('y features:', y_feature)
#         print()
        for x_features in possible_x_features:
    #         print('    x features:', x_features)
            info = evaluate_base_model(train = df_train.loc[df_train[y_feature] > 0] , \
                                       test = df_test.loc[df_test[y_feature] > 0], \
                                       x_features = x_features, \
                                       y_feature = y_feature, \
                                       intercept = True, \
                                       full_info = False)
            MAE[y_feature].append([x_features, \
                                   info['coef'], \
                                   info['intercept'], \
                                   info['train']['MAE'], \
                                   info['train']['Variance'], \
                                   info['test']['MAE'], \
                                   info['test']['Variance']])
#             if x_features == ['log_rank', 'diff_year'] or \
#                x_features == ['log_rank', 'adjusted_diff_year']:
#                 print('base model:', \
#                       x_features, '\n'\
#                       'coef:', info['coef'], info['intercept'], '\n', \
#                       info['train']['MAE'], \
#                       info['train']['Variance'], \
#                       info['test']['MAE'], \
#                       info['test']['Variance'], '\n')

        local_info = min(MAE[y_feature],key=itemgetter(3))
        x_features_str = ''.join(local_info[0])
        if x_features_str in stats[y_feature]:
            stats[y_feature][x_features_str] += 1
        else:
            stats[y_feature][x_features_str] = 1
#         print('minimal error model:', \
#               local_info[0], '\n', \
#               'coef:', local_info[1:3], '\n', \
#               local_info[3:], '\n')

    #         print('\n')
    #     print('\n')
    # print(df_train['popularity'].values.shape)
    # print(df_train['strict_rule'].apply(lambda x: 0 if x == 0 else np.log(x)).values.shape)
    # list(df_train)

In [None]:
for major_key in stats.keys():
    print(major_key)
    for minor_key in stats[major_key].keys():
        print(minor_key, stats[major_key][minor_key])
    print('\n')

# Add New Features

In [None]:
def advanced_insert_feature(df, func = None, feature_name = None):
    if not feature_name:
        raise ('Empty feature')

    new_data = df.apply(lambda x: func(x, feature_name), axis = 1)
    df.insert(df.shape[1], feature_name, new_data)
    
def insert_feature(df, func = None, feature_name = None):
    if not feature_name:
        raise ('Empty feature')

    new_data = df.apply(func, axis = 1)
    df.insert(df.shape[1], feature_name, new_data)

def new_feature(row_data, feature_name = None):
    if not feature_name:
        raise ('Empty feature')

    songid = row_data['song']
    
    if df_features[feature].loc[df_features[feature]['song'] == songid].shape[0] > 0:
        return 1
    else:
        return 0

In [None]:
df_features = {}
csv_files = [f for f in os.listdir('.') if os.path.isfile(f)]
for csv_file in csv_files:
    if 'csv' in csv_file and 'songs' in csv_file:
        feature = csv_file.split('.')[0]
        df_features[feature] = pd.read_csv(csv_file)
        df_features[feature]['songid'] = df_features[feature][['song', 'artist']].apply(lambda x: ''.join(x), axis=1)

df_features = {}
csv_files = ['songs-used-in-movies.csv',\
            'songs-used-in-tv-shows.csv',\
            'songs-used-in-commercials.csv']
for csv_file in csv_files:
    feature = csv_file.split('.')[0]
    df_features[feature] = pd.read_csv(csv_file)
    df_features[feature]['songid'] = df_features[feature][['song', 'artist']].apply(lambda x: ''.join(x), axis=1)

In [None]:
for feature in df_features.keys():
    advanced_insert_feature(df_train, new_feature, feature)
    advanced_insert_feature(df_test, new_feature, feature)

In [None]:
# create new fit feature
for feature in df_features.keys():
    new_feature = 'fit-' + feature
    df_train[new_feature] = df_train.apply(lambda row: np.log(row[feature] + 1), axis=1)
    df_test[new_feature] = df_test.apply(lambda row: np.log(row[feature] + 1), axis=1)

In [None]:
df_train['new_feature'] = df_train[list(df_features)].apply(lambda x: 1 if any(x) else 0, axis=1)
df_test['new_feature'] = df_test[list(df_features)].apply(lambda x: 1 if any(x) else 0, axis=1)

In [None]:
df_train['sum_feature'] = df_train[list(df_features)].apply(lambda x: np.log(sum(x)) if any(x) else 0, axis=1)
df_test['sum_feature'] = df_test[list(df_features)].apply(lambda x: np.log(sum(x)) if any(x) else 0, axis=1)

In [None]:
df_train['log_weeks_on_charts'] = df_train['weeks_on_chart'].apply(lambda x: np.log(x))
df_test['log_weeks_on_charts'] = df_test['weeks_on_chart'].apply(lambda x: np.log(x))

In [None]:
df_train['log_jump_time'] = df_train['jump_time'].apply(lambda x: np.log(x**2) if x > 0 else 0)
df_test['log_jump_time'] = df_test['jump_time'].apply(lambda x: np.log(x**2) if x > 0 else 0)
df_train['log_max_jump_duration'] = df_train['max_jump_duration'].apply(lambda x: np.log(x**2) if x > 0 else 0)
df_test['log_max_jump_duration'] = df_test['max_jump_duration'].apply(lambda x: np.log(x**2) if x > 0 else 0)

In [None]:
list(df_test)

In [None]:
list(df_features.keys())

# model -fit-movie, fit-tv-show, fit-commercials

In [None]:
def evaluation_featured_model(train = df_train, test = df_test, x_features = None, y_feature = None, intercept = True, full_info = False):
    x_train = train[x_features].values  
    y_train = train[y_feature].values
    x_test = test[x_features].values
    y_test = test[y_feature].values
    
    regr = linear_model.LinearRegression(fit_intercept=intercept) # with intercept 
    regr.fit(x_train, y_train)
    ytest_pred = regr.predict(x_test)
    ytrain_pred=regr.predict(x_train)
    
    if full_info:
        print('Coefficients: \n', regr.coef_)
        print('Intercept when fit_intercept=True : {:.5f}'.format(regr.intercept_))
    
    print("Mean squared error for test case is: %.3f"% mean_squared_error(y_test, ytest_pred))
    if full_info:
        print('Variance score for test case is: %.3f' % r2_score(y_test, ytest_pred))
    print("Mean squared error for train case is: %.3f"% mean_squared_error(y_train, ytrain_pred))
    if full_info:
        print('Variance score for test case is: %.3f' % r2_score(y_train, ytrain_pred))

# df_train_mid=df_train.assign(predictedlogP=pd.Series(ytrain_pred))
# df_test_mid=df_test.assign(predictedlogP=pd.Series(ytest_pred))
    
# predict_train=df_train_mid.assign(predictedPopularity=pd.Series( np.exp(df_train_mid['predictedlogP'])))
# predict_test=df_test_mid.assign(predictedPopularity=pd.Series( np.exp(df_test_mid['predictedlogP'])))

In [None]:
fit_features = ['fit-'+ feature for feature in df_features.keys()]
print(fit_features)

In [None]:
start_year = 1958
end_year = 2005
for rank in ['logRank']:
    for pop in ['logPopularity','logNewPopularity']:
        print(rank, pop)
        

        print('linear regression - base model')
        evaluation_featured_model(train=df_train.loc[df_train.year.isin([start_year, end_year])], test=df_test, x_features=[rank, 'Y_year'], y_feature=pop)

        print('\nlinear regression - base model 1')
        evaluation_featured_model(train=df_train[df_train.year.isin([start_year, end_year])], test=df_test,x_features=[rank, 'Y_year', 'log_weeks_on_charts'], y_feature=pop)
        
        print('\nlinear regression - base model 2')
        evaluation_featured_model(train=df_train[df_train.year.isin([start_year, end_year])],\
                                  test=df_test\
                                  ,x_features=[rank, 'Y_year', 'log_weeks_on_charts'],\
                                  y_feature=pop)
        
        print('\nlinear regression - base model 3')
        evaluation_featured_model(train=df_train[df_train.year.isin([start_year, end_year])],\
                                  test=df_test,\
                                  x_features=[rank,\
                                              'Y_year',\
                                              'log_weeks_on_charts',\
                                              'log_jump_time'],\
                                  y_feature=pop)
        
        print('\nlinear regression - base model 4')
        evaluation_featured_model(train=df_train[df_train.year.isin([start_year, end_year])],\
                                  test=df_test,\
                                  x_features=[rank,\
                                              'Y_year',\
                                              'log_weeks_on_charts',\
                                              'log_jump_time',\
                                              'log_max_jump_duration'],\
                                  y_feature=pop)
        
        print('\nlinear regression - base model 4')
        evaluation_featured_model(train=df_train[df_train.year.isin([start_year, end_year])],\
                                  test=df_test,\
                                  x_features=[rank,\
                                              'Y_year',\
                                              'log_weeks_on_charts',\
                                              'log_jump_time',\
                                              'log_max_jump_duration',\
                                              'all_time_greatest_artist'],\
                                  y_feature=pop)
        
        print('\nlinear regression - advanced model')
        features = [rank, 'Y_year', 'log_weeks_on_charts', 'log_jump_time', 'log_max_jump_duration']
        features.extend(fit_features)
        evaluation_featured_model(train=df_train[df_train.year.isin([start_year, end_year])], test=df_test,x_features=features, y_feature=pop)
        
        print('\nlinear regression - all as one model')
        evaluation_featured_model(train=df_train[df_train.year.isin([start_year, end_year])], test=df_test,x_features=[rank, 'Y_year', 'log_weeks_on_charts', 'log_jump_time', 'log_max_jump_duration','new_feature'], y_feature=pop)
        
        print('\nlinear regression - sum all feature model')
        evaluation_featured_model(train=df_train[df_train.year.isin([start_year, end_year])], test=df_test,x_features=[rank, 'Y_year', 'log_weeks_on_charts', 'log_jump_time', 'log_max_jump_duration','sum_feature'], y_feature=pop)
        print('\n')

In [None]:
print(df_train.loc[(df_train['songs-used-in-commercials'] == 1)].shape[0])
print(df_train.loc[(df_train['songs-used-in-movies'] == 1)].shape[0])
print(df_train.loc[(df_train['songs-used-in-tv-shows'] == 1)].shape[0])
print(df_train.loc[(df_train['songs-used-in-movies'] == 1) |\
                   (df_train['songs-used-in-tv-shows'] == 1) |\
                   (df_train['songs-used-in-commercials'] == 1)].shape[0])

In [None]:
df_features.keys()

In [None]:
corr = df_train[list(df_features.keys())].corr()
corr.style.background_gradient()

In [None]:
features = [rank, 'Y_year', 'log_weeks_on_charts', 'log_jump_time', 'log_max_jump_duration']
features.extend(fit_features)
features.remove('fit-songs-for-wedding-anniversaries')
corr = df_train[features].corr()
corr.style.background_gradient()