In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder,MinMaxScaler
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, StratifiedKFold,KFold


In [41]:
from sklearn.tree import DecisionTreeRegressor

In [3]:
#DATA PATH
DATA_PATH = './data/'


In [4]:
imputed_df = pd.read_csv(DATA_PATH + 'imputed_df.csv',index_col=False)

In [6]:
imputed_df.columns

Index(['Popularity', 'Artist_followers', 'Track_number', 'Tracks_in_album',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       ...
       'viral rap', 'vlaamse kinderliedje', 'vocal harmony group', 'volkspop',
       'world chill', 'zimdancehall', 'zouk riddim', 'year', 'month', 'day'],
      dtype='object', length=1229)

In [44]:
imputed_df.columns

Index(['Popularity', 'Artist_followers', 'Track_number', 'Tracks_in_album',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       ...
       'viral rap', 'vlaamse kinderliedje', 'vocal harmony group', 'volkspop',
       'world chill', 'zimdancehall', 'zouk riddim', 'year', 'month', 'day'],
      dtype='object', length=1229)

In [8]:
list(imputed_df.columns)

['Popularity',
 'Artist_followers',
 'Track_number',
 'Tracks_in_album',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acoustics',
 'instrumentalness',
 'liveliness',
 'valence',
 'tempo',
 'duration_ms',
 'time_signature',
 'Days_since_release',
 'Explicit_false',
 'Explicit_true',
 'album',
 'compilation',
 'single',
 'syuzhet_norm',
 'bing_norm',
 'afinn_norm',
 'nrc_norm',
 'syuzhet',
 'bing',
 'afinn',
 'nrc',
 'anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'sadness',
 'surprise',
 'trust',
 'negative',
 'positive',
 'n_words',
 'anger_norm',
 'anticipation_norm',
 'disgust_norm',
 'fear_norm',
 'joy_norm',
 'sadness_norm',
 'surprise_norm',
 'trust_norm',
 'negative_norm',
 'positive_norm',
 'negative_bog_jr',
 'positive_bog_jr',
 'Bayes',
 'Negative_Bayes',
 'Neutral_Bayes',
 'Positive_Bayes',
 'Celebrate',
 'Desire',
 'Explore',
 'Fun',
 'Hope',
 'Love',
 'Nostalgia',
 'Thug',
 'bing_norm_negative',
 'bing_norm_neutral',
 'bing_norm_positi

In [21]:
countries = list(imputed_df.columns)[69:104]

# Modeling

## 1. Linear Regression

In [29]:
skf = StratifiedKFold(n_splits=5, shuffle=True,random_state=42)

In [42]:
def print_scores(data,label,data_type,model,features,country):
    pipe_cls = None
    data = data[data[country]==1]
    if(model == 'linear regression'):
        pipe_cls = Pipeline([('preprocess',StandardScaler()),
                            ('clf', linear_model.LinearRegression())])
    elif(model == 'Decision tree regression'):
        pipe_cls = Pipeline([('preprocess',StandardScaler()),
                            ('clf', DecisionTreeRegressor())])
        
    kf = KFold(n_splits=5,random_state=42,shuffle=True)
    
    r2 = cross_val_score(pipe_cls, data[features], data[label],cv=kf,scoring='r2')
    
    print("data type: {}\nlabel: {}\nmodel : {}\n country: {}".format(data_type,label,model,country))
    print("r2 score: {}".format(np.mean(r2)))
    print()
    
    variance = cross_val_score(pipe_cls, data[features], data[label],cv=kf,scoring='explained_variance')
    
    print("data type: {}\nlabel: {}\nmodel : {}\n country: {}".format(data_type,label,model,country))
    print("variance score: {}".format(np.mean(variance)))
    print()
    
    rmse = cross_val_score(pipe_cls, data[features], data[label],cv=kf,scoring='neg_root_mean_squared_error')
    
    print("data type: {}\nlabel: {}\nmodel : {}\n country: {}".format(data_type,label,model,country))
    print("rmse score: {}".format(-1*np.mean(rmse)))
    print()
    
    

In [31]:
features_columns = [column for column in imputed_df.columns if column!='Popularity']

In [32]:
len(features_columns)

1228

In [33]:
for country in countries:
    print_scores(imputed_df,'Popularity','imputed_df','linear regression',features_columns,country)

data type: imputed_df
label: Popularity
model : linear regression
 country: Argentina
r2 score: -4.808593981663946e+22

data type: imputed_df
label: Popularity
model : linear regression
 country: Argentina
variance score: -4.79689031050006e+22

data type: imputed_df
label: Popularity
model : linear regression
 country: Argentina
rmse score: 2755125881638297.5

data type: imputed_df
label: Popularity
model : linear regression
 country: Australia
r2 score: -5.532453262881367e+22

data type: imputed_df
label: Popularity
model : linear regression
 country: Australia
variance score: -5.528982944378582e+22

data type: imputed_df
label: Popularity
model : linear regression
 country: Australia
rmse score: 2597360676050038.5

data type: imputed_df
label: Popularity
model : linear regression
 country: Austria
r2 score: -1.8096857239324643e+23

data type: imputed_df
label: Popularity
model : linear regression
 country: Austria
variance score: -1.8093210735566873e+23

data type: imputed_df
label: 

KeyboardInterrupt: 

## 1. Decision tree Regression

In [43]:
for country in countries:
    print_scores(imputed_df,'Popularity','imputed_df','Decision tree regression',features_columns,country)

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Argentina
r2 score: 0.37018563795930515

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Argentina
variance score: 0.3710338079975878

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Argentina
rmse score: 12506.157297055015

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Australia
r2 score: -0.016197613933810074

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Australia
variance score: 0.05589583000502683

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Australia
rmse score: 14466.374789142885

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Austria
r2 score: 0.14631292564943874

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Austria
variance score: 0.1353080719

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Norway
r2 score: 0.2545699627366632

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Norway
variance score: 0.26285620960527123

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Norway
rmse score: 9602.489391746436

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Peru
r2 score: 0.5022673618808392

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Peru
variance score: 0.4867362632065295

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Peru
rmse score: 13135.280477475002

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Philippines
r2 score: 0.3434847962383857

data type: imputed_df
label: Popularity
model : Decision tree regression
 country: Philippines
variance score: 0.2175991035559377

data type: imp

KeyboardInterrupt: 

In [34]:
print(-1.9850923944007732e+20)

-1.9850923944007732e+20


In [40]:
2e2

200.0

In [None]:
LinearRegression()