# Predicting Tempo from Lyrics

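#### Summary

This notebook trains an XGBoost regressor, tuned with Hyperopt, to predict a song's `tempo`. **TODO:** add a short summary of the findings here.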


-------------

In [1]:
# Classic Packages
import pandas as pd

# Modeling 
from xgboost import plot_importance

#Plotting
import matplotlib.pyplot as plt

# Model Modules
from modules.feature_engineering import *
from modules.feature_reduction import *
from modules.model_build import *
from modules.clean_data import *
from modules.hyperopt_xgboost import *
from modules.hyperopt_hyperparameter import *

## Load Data

In [2]:
# Relative path to the CSV that the next cell reads with pd.read_csv.
data_link = '../build_training/spotify_artist_info.csv'

In [3]:
# Read the CSV and build the modeling data frame plus its companion feature table.
# NOTE(review): the saved output of this cell is a KeyError for the columns
# 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise' -- confirm the CSV
# at data_link contains them, otherwise no later cell can run on a fresh kernel.
song_df, song_features = generate_model_dataframe(pd.read_csv(data_link))

KeyError: "['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'] not in index"

In [None]:
# Dependent variable, kept as a one-element list; passed to set_dependent_variable
# and as dv= to create_testing_dataframes further down.
y_var = ['tempo']

## Preprocessing

### Feature Engineering

In [None]:
# fix genres
# clean_genres hands back both the data frame and the feature table, so both are rebound here.
song_df, song_features = clean_genres(song_df, song_features)

In [None]:
# release date -> release year
# Both the data frame and the feature table are replaced by the helper's return values.
song_df, song_features = release_year(song_df, song_features)

In [None]:
# Register y_var with the feature table; only song_features is returned and rebound.
# NOTE(review): presumably this flags the target so it is excluded from the model inputs -- confirm in modules.
song_features = set_dependent_variable(song_df, song_features, y_var)

In [None]:
# Keep only rows that have both a POSITIVE value (the lyric NLP score) and an
# acousticness value, then renumber the index from zero.
has_required_values = song_df.POSITIVE.notna() & song_df.acousticness.notna()
song_df = song_df[has_required_values].reset_index(drop=True)

### Data Cleaning

In [None]:
# Take 'artist_name', 'genres' and 'album_type' out of the feature set; the helper
# returns both the data frame and the feature table.
song_df, song_features = remove_columns_from_feature_set(song_df, song_features, ['artist_name','genres','album_type'])

In [None]:
# Only the feature table is passed and returned here; song_df is left untouched by this cell.
# NOTE(review): which columns count as "id" columns is decided inside the helper -- confirm in modules.
song_features = remove_id_columns_from_feature_set(song_features)

In [None]:
# Clean the continuous numeric columns; the helper returns a new song_df.
# NOTE(review): the cleaning rule (e.g. imputation) is not visible here -- see modules.clean_data.
song_df = clean_continuous_numeric_columns(song_df, song_features)

In [None]:
# Clean the ordinal categorical columns; the helper returns a new song_df.
song_df = clean_ordinal_categorical_columns(song_df, song_features)

In [None]:
# Count missing values per column (checked before the binary-column cleaning below).
song_df.isna().sum()

In [None]:
# Clean the binary categorical columns; the helper returns a new song_df.
song_df = clean_binary_categorical_columns(song_df, song_features)

In [None]:
# check max number of missing values per column
# A result of 0 means no column has any missing values left after the cleaning steps.
song_df.isna().sum().max()

In [None]:
## feature correlation
# Update the feature table using a correlation threshold of 0.95.
# NOTE(review): presumably features correlated above the threshold are dropped from the
# model set -- confirm in modules.feature_reduction.
song_features = highly_correlated_features(song_df, song_features, corr_threshold = 0.95)

## Model Building

In [None]:
## split data into train/test
# First split: hold out the test set. Only the first return value (`training`) and the
# fifth/sixth (X_test, y_test) are kept.
# NOTE(review): `training` is assumed to be the non-test portion of song_df, still carrying
# the 'lyric_location', 'key' and y_var columns -- confirm against create_testing_dataframes.
training, _, _, _, X_test, y_test, _, _ = create_testing_dataframes(
    song_df,
    ids=['lyric_location'],
    stratify_by=['key'],
    dv=y_var,
    nfolds=5,
)

# split training into train & validation
# Second split: run on `training`, not on the full song_df. Splitting song_df again would
# leave `training` unused and let held-out test rows land in the train/validation sets.
_, _, X_train, y_train, X_val, y_val, _, _ = create_testing_dataframes(
    training,
    ids=['lyric_location'],
    stratify_by=['key'],
    dv=y_var,
    nfolds=5,
)

In [None]:
# Feature search space: built from the names in the 'feature' column of song_features
# for rows where 'in_model' equals 1.
feature_space = create_hyperopt_feature_space(song_features[song_features['in_model'] == 1]['feature'].tolist())
# Hyperparameter search space for the 'xgb_regressor' model type.
hyperparameter_space = create_hyperopt_hyperparameters_space('xgb_regressor')

In [None]:
## train model

# Hyperopt_XGB receives the train and validation splits plus both search spaces.
hyperopt_regression_model = Hyperopt_XGB('xgb_regressor',
                                         X_train,
                                         y_train,
                                         X_val,
                                         y_val,
                                         feature_space,
                                         hyperparameter_space)

# optimize returns a pair: the chosen settings (feature_list) and the fitted model.
# NOTE(review): max_evals is presumably the number of Hyperopt trials; 1500 may make a
# full Restart & Run All slow -- confirm and consider a config constant.
feature_list, regression_model = hyperopt_regression_model.optimize(max_evals = 1500)

## Model Evaluation

In [None]:
# Keep the entries of feature_list whose value is 1, skipping any key that is also a
# hyperparameter name (feature_list evidently mixes both kinds of keys).
model_columns = [
    name
    for name, selected in feature_list.items()
    if selected == 1 and name not in hyperparameter_space.keys()
]

In [None]:
# Display the columns selected for the final model.
model_columns

In [None]:
# Score the held-out test set with the selected columns and feed predictions and actuals
# to lift_chart.
test_predictions = regression_model.predict(X_test[model_columns])
test_actuals = list(y_test.iloc[:, 0])
lift_table = lift_chart(test_predictions, test_actuals)
# NOTE(review): only the second dimension of lift_chart's return value is displayed here --
# confirm this is intended rather than showing the chart/table itself.
lift_table.shape[1]

In [None]:
## TODO: add a residual plot (actual minus predicted tempo on the test set)

In [None]:
## Feature importance
# xgboost's plot_importance draws the fitted model's feature importances; the trailing
# semicolon suppresses the Axes repr in the cell output.
plot_importance(regression_model);

## SHAP Plots