In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder,MinMaxScaler
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, StratifiedKFold,KFold


In [3]:
from tqdm.notebook import tqdm

In [4]:
import pickle

In [5]:
from sklearn.tree import DecisionTreeRegressor

In [6]:
#DATA PATH
DATA_PATH = './data/'

#MODELS_PATH
MODELS_PATH = './models/'


In [7]:
df_feature_engineered = pd.read_csv(DATA_PATH + 'df_feature_engineered.csv',index_col=False)

In [8]:
df_feature_engineered.columns

Index(['Popularity', 'Artist_followers', 'Track_number', 'Tracks_in_album',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       ...
       'viral rap', 'vlaamse kinderliedje', 'vocal harmony group', 'volkspop',
       'world chill', 'zimdancehall', 'zouk riddim', 'year', 'month', 'day'],
      dtype='object', length=1229)

In [9]:
df_feature_engineered.columns

Index(['Popularity', 'Artist_followers', 'Track_number', 'Tracks_in_album',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       ...
       'viral rap', 'vlaamse kinderliedje', 'vocal harmony group', 'volkspop',
       'world chill', 'zimdancehall', 'zouk riddim', 'year', 'month', 'day'],
      dtype='object', length=1229)

In [10]:
list(df_feature_engineered.columns)

['Popularity',
 'Artist_followers',
 'Track_number',
 'Tracks_in_album',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acoustics',
 'instrumentalness',
 'liveliness',
 'valence',
 'tempo',
 'duration_ms',
 'time_signature',
 'Days_since_release',
 'Explicit_false',
 'Explicit_true',
 'album',
 'compilation',
 'single',
 'syuzhet_norm',
 'bing_norm',
 'afinn_norm',
 'nrc_norm',
 'syuzhet',
 'bing',
 'afinn',
 'nrc',
 'anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'sadness',
 'surprise',
 'trust',
 'negative',
 'positive',
 'n_words',
 'anger_norm',
 'anticipation_norm',
 'disgust_norm',
 'fear_norm',
 'joy_norm',
 'sadness_norm',
 'surprise_norm',
 'trust_norm',
 'negative_norm',
 'positive_norm',
 'negative_bog_jr',
 'positive_bog_jr',
 'Bayes',
 'Negative_Bayes',
 'Neutral_Bayes',
 'Positive_Bayes',
 'Celebrate',
 'Desire',
 'Explore',
 'Fun',
 'Hope',
 'Love',
 'Nostalgia',
 'Thug',
 'bing_norm_negative',
 'bing_norm_neutral',
 'bing_norm_positi

In [11]:
countries = list(df_feature_engineered.columns)[69:104]

In [12]:
len(countries)

35

# Modeling

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import warnings
from sklearn.neighbors import KNeighborsRegressor
warnings.filterwarnings('ignore')

In [14]:
cols_with_na = [val for val in df_feature_engineered.columns if df_feature_engineered[val].isna().sum() > 0]

In [15]:
cols_with_na

['Days_since_release',
 'syuzhet_norm',
 'bing_norm',
 'afinn_norm',
 'nrc_norm',
 'syuzhet',
 'bing',
 'afinn',
 'nrc',
 'anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'sadness',
 'surprise',
 'trust',
 'negative',
 'positive',
 'n_words',
 'anger_norm',
 'anticipation_norm',
 'disgust_norm',
 'fear_norm',
 'joy_norm',
 'sadness_norm',
 'surprise_norm',
 'trust_norm',
 'negative_norm',
 'positive_norm',
 'negative_bog_jr',
 'positive_bog_jr',
 'Bayes',
 'Negative_Bayes',
 'Neutral_Bayes',
 'Positive_Bayes',
 'Celebrate',
 'Desire',
 'Explore',
 'Fun',
 'Hope',
 'Love',
 'Nostalgia',
 'Thug']

In [16]:

num_transformer_w_standardizer = Pipeline([('imputer',SimpleImputer(missing_values=np.nan,strategy='mean')),
('scaler', StandardScaler())])

num_transformer_no_scaling = Pipeline([('imputer',SimpleImputer(missing_values=np.nan,strategy='mean'))])

num_transformer_w_minmaxscaler = Pipeline([('imputer',SimpleImputer(missing_values=np.nan,strategy='mean')),
                                           ('scaler', MinMaxScaler())])

cat_transformer = Pipeline([
('imputer', SimpleImputer(missing_values=np.nan,strategy='most_frequent')),('onehot',OneHotEncoder(handle_unknown='ignore'))])





In [17]:
preprocessor_standardscaler = ColumnTransformer([
        ('num', num_transformer_w_standardizer, cols_with_na)])

# preprocessor_noscaler = ColumnTransformer([
#         ('num', num_transformer_no_scaling, quantitave_attr),
#         ('cat', cat_transformer, cateorical_attributes)])


# preprocessor_minmaxscaler = ColumnTransformer([
#         ('num', num_transformer_w_minmaxscaler, quantitave_attr),
#         ('cat', cat_transformer, cateorical_attributes)])

In [18]:
def print_scores(data,label,data_type,features,country):
    best_model = None
    best_r2_score = float('-inf')
    best_mae_score = float('-inf')
    best_rmse_score = float('-inf')
    data = data[data[country]==1]
    country_df = pd.DataFrame(columns = ['Model','R2','MAE','RMSE'])
    model_param_pairs = [(linear_model.Lasso(),
                          [{'reg__alpha':[0,0.5,1],'reg__fit_intercept':[True,False]}]),
                          (linear_model.Ridge(),
                           [{'reg__alpha':[0,0.5,1],'reg__fit_intercept':[True,False]}]),
                          (DecisionTreeRegressor(),
                           [{'reg__criterion':["squared_error", "friedman_mse", "absolute_error", "poisson"],'reg__splitter':["random","best"]}]),
                          (RandomForestRegressor(),
                           [{'reg__criterion':["squared_error", "friedman_mse", "absolute_error", "poisson"],'reg__max_features':["sqrt","log2"]}]),
                          (SVR(),
                           [{'reg__kernel':["linear", "poly", "rbf", "sigmoid", "precomputed"],'reg__gamma':["scale","auto"]}]),
                          (KNeighborsRegressor(),
                           [{'reg__n_neighbors':[3, 5, 9, 14],'reg__weights':["uniform","distance"]}])]
    
    for model_param_pair in tqdm(model_param_pairs):
        model = model_param_pair[0]
        print(model)
        model_no_params = model
        
        param_grid = model_param_pair[1]
        
        pipe_reg = Pipeline([('preprocess',preprocessor_standardscaler),
                            ('reg', model)])


        kf = KFold(n_splits=5,random_state=42,shuffle=True)      
        grid_search = GridSearchCV(pipe_reg,param_grid,cv=kf,scoring = ['r2'],refit='r2')


        grid_search.fit(data[features], data[label])
        r2 = grid_search.best_score_
            

        #create pipeline with best params identified
        pipe_reg_best = Pipeline([('preprocess',preprocessor_standardscaler),
                            ('reg', model)])

        pipe_reg_best.set_params(**grid_search.best_params_)
    #     print(pipe_reg_best)


    #     return grid_search


    #     print(data[features])
    #     print(data[label])
        
        #r2 score
#         r2 = cross_val_score(pipe_reg_best, data[features], data[label],cv=kf,scoring='r2')

#         print(r2)

#         print("data type: {}\nlabel: {}\nmodel : {}\n country: {}".format(data_type,label,model_name,country))
#         print("r2 score: {}".format(np.mean(r2)))
#         print()

    #     variance = cross_val_score(pipe_cls, data[features], data[label],cv=kf,scoring='explained_variance')

    #     print("data type: {}\nlabel: {}\nmodel : {}\n country: {}".format(data_type,label,model,country))
    #     print("variance score: {}".format(np.mean(variance)))
    #     print()


        rmse_scores = cross_val_score(pipe_reg_best, data[features], data[label],cv=kf,scoring='neg_root_mean_squared_error')
            
#         print("data type: {}\nlabel: {}\nmodel : {}\n country: {}".format(data_type,label,model,country))
#         print("rmse score: {}".format(-1*np.mean(rmse)))
#         print()
        rmse = -1*np.mean(rmse_scores)
        mae_scores = cross_val_score(pipe_reg_best, data[features], data[label],cv=kf,scoring='neg_mean_absolute_error')
        mae = -1*np.mean(mae_scores)
        
        country_df = country_df
        
        
        country_df.loc[len(country_df.index)] = [str(model_no_params), r2, mae, rmse]
        
        if(grid_search.best_score_ > best_r2_score):
            best_r2_score = grid_search.best_score_
            best_model = str(model)
            #save model
            with open(MODELS_PATH + country + '_best_model.pkl','wb') as f:
                pickle.dump(grid_search,f)
                
            best_mae_score = mae
            best_rmse_score = rmse
            
            
    print()

    return country_df,best_model,best_r2_score,best_rmse_score,best_mae_score


In [19]:
features_columns = [column for column in df_feature_engineered.columns if column!='Popularity']

In [20]:
len(features_columns)

1228

In [21]:
countries

['Argentina',
 'Australia',
 'Austria',
 'Belgium',
 'Brazil',
 'Canada',
 'Chile',
 'Colombia',
 'Costa Rica',
 'Denmark',
 'Ecuador',
 'Finland',
 'France',
 'Germany',
 'Global',
 'Indonesia',
 'Ireland',
 'Italy',
 'Malaysia',
 'Mexico',
 'Netherlands',
 'New Zealand',
 'Norway',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Singapore',
 'Spain',
 'Sweden',
 'Switzerland',
 'Taiwan',
 'Turkey',
 'UK',
 'USA']

In [None]:
country_wise_results = dict()
overall_results_df = pd.DataFrame(columns = ['Country','Best model','Best r2','Best RMSE','Best MAE'])

for country in countries[:16]:
    print(country)
    country_df,best_model,best_r2,best_rmse,best_mae = print_scores(df_feature_engineered,'Popularity','imputed_df',features_columns,country)
    country_wise_results[country] = country_df
    overall_results_df.loc[len(overall_results_df)] = [country,best_model,best_r2,best_rmse,best_mae]
    

Argentina


  0%|          | 0/6 [00:00<?, ?it/s]

Lasso()
Ridge()
DecisionTreeRegressor()
RandomForestRegressor()
SVR()
KNeighborsRegressor()

Australia


  0%|          | 0/6 [00:00<?, ?it/s]

Lasso()
Ridge()
DecisionTreeRegressor()
RandomForestRegressor()
SVR()
KNeighborsRegressor()

Austria


  0%|          | 0/6 [00:00<?, ?it/s]

Lasso()
Ridge()
DecisionTreeRegressor()
RandomForestRegressor()
SVR()
KNeighborsRegressor()

Belgium


  0%|          | 0/6 [00:00<?, ?it/s]

Lasso()
Ridge()
DecisionTreeRegressor()
RandomForestRegressor()


In [299]:
country_wise_results['Argentina']

Unnamed: 0,Model,R2,MAE,RMSE
0,Lasso(alpha=1),-0.011117,10815.150837,16316.535696
1,Ridge(alpha=1),-0.011197,10815.940604,16317.162875
2,"DecisionTreeRegressor(criterion='poisson', spl...",-0.513048,12233.543863,19408.983964
3,RandomForestRegressor(criterion='friedman_mse'...,-0.211668,11321.773527,17834.38989
4,SVR(kernel='linear'),-0.213614,8517.610958,17870.572222
5,KNeighborsRegressor(n_neighbors=14),-0.02766,10582.346388,16436.383578


In [301]:
overall_results_df

Unnamed: 0,Country,Best model,Best r2,Best RMSE,Best MAE
0,Argentina,Lasso(alpha=1),-0.011117,16316.535696,10815.150837


### Predictions -

In [306]:
# load
country_model = None
with open(MODELS_PATH + 'Argentina_best_model.pkl', 'rb') as f:
    country_model = pickle.load(f)

In [311]:
sample_data = df_feature_engineered[df_feature_engineered['USA']==1]

In [312]:
sample_data

Unnamed: 0,Popularity,Artist_followers,Track_number,Tracks_in_album,danceability,energy,key,loudness,mode,speechiness,...,viral rap,vlaamse kinderliedje,vocal harmony group,volkspop,world chill,zimdancehall,zouk riddim,year,month,day
1,8.00,11427104.0,1,1,0.767,0.709,1.0,-4.470,1.0,0.3360,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,11.0,5.0
17,581.20,16931568.0,7,12,0.572,0.530,6.0,-8.521,0.0,0.0654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,4.0,7.0
40,100.25,5910376.0,1,1,0.488,0.538,6.0,-4.974,1.0,0.0760,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,8.0,3.0
59,56.00,8092392.0,7,11,0.515,0.917,1.0,-7.312,1.0,0.0417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,9.0,15.0
81,62.40,17132813.0,3,10,0.593,0.712,2.0,-6.325,0.0,0.0286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,5.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89414,247.20,1262102.0,14,15,0.469,0.080,2.0,-15.489,1.0,0.0425,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020.0,4.0,17.0
89420,28.00,711265.0,5,7,0.711,0.462,10.0,-7.132,0.0,0.0550,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016.0,12.0,23.0
89421,364.65,5004777.0,1,1,0.615,0.871,1.0,-5.308,1.0,0.0894,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,4.0,14.0
89422,4919.35,5647016.0,1,10,0.863,0.576,5.0,-5.687,0.0,0.2390,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016.0,8.0,26.0


In [317]:
predictions = country_model.predict(sample_data[features_columns])
sample_data['Predicted Popularity in '+ 'Argentina'] = predictions

In [320]:
sample_data.sort_values('Predicted Popularity in Argentina',ascending=False)

Unnamed: 0,Popularity,Artist_followers,Track_number,Tracks_in_album,danceability,energy,key,loudness,mode,speechiness,...,vlaamse kinderliedje,vocal harmony group,volkspop,world chill,zimdancehall,zouk riddim,year,month,day,Predicted Popularity in Argentina
86362,31.20,4222467.0,1,1,0.662,0.866,6.0,-3.328,1.0,0.1590,...,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,1.0,12.0,33133.851999
87078,31.20,4222467.0,1,1,0.663,0.875,6.0,-3.301,1.0,0.1830,...,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,1.0,12.0,33133.851999
86870,409.60,385840.0,3,46,0.698,0.649,8.0,-6.764,1.0,0.4150,...,0.0,0.0,0.0,0.0,0.0,0.0,2015.0,9.0,25.0,28793.187881
49024,38761.20,10919803.0,6,24,0.907,0.633,2.0,-5.145,1.0,0.1840,...,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,1.0,26.0,28552.997011
81553,353.35,5004777.0,13,13,0.531,0.797,10.0,-7.018,1.0,0.1540,...,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,5.0,5.0,25929.981109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87018,43.20,3724826.0,2,14,0.621,0.854,1.0,-3.697,1.0,0.3200,...,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,9.0,7.0,-8177.984660
28788,164.80,6028580.0,3,10,0.368,0.505,4.0,-7.208,1.0,0.0285,...,0.0,0.0,0.0,0.0,0.0,0.0,1994.0,11.0,1.0,-9022.789850
84908,395.20,11968318.0,10,10,0.701,0.485,11.0,-10.305,0.0,0.3640,...,0.0,0.0,0.0,0.0,0.0,0.0,2016.0,12.0,9.0,-10898.057601
50869,791.75,61128.0,10,14,0.549,0.449,9.0,-13.645,0.0,0.1040,...,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,2.0,9.0,-11478.964275
