## Modelling schema

This notebook provides a schema for building the data transformation pipeline used for modelling: the raw data needs to be transformed into a model-ready form.  
This notebook serves as a guide; feel free to modify the code however you want - it only serves to explain the transformation logic.

In [1]:
import os
import sys

# Set before the `etl` import below so that importing it does not write
# bytecode (.pyc) files.
sys.dont_write_bytecode = True

import numpy as np
import pandas as pd

# Custom transformers used by the preprocessing pipeline (presumably a
# project-local module; its source is not part of this notebook).
from etl import FrequencyEncoder, CircleOfFifthsEncoding, ConvertNull

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from dotenv import load_dotenv
# Loads variables from a .env file into the process environment; the call
# returns False when nothing was loaded.
load_dotenv()

False

loading raw data

In [3]:
# Directory holding the raw CSV. It can be overridden with the DATA_DIR
# environment variable (load_dotenv() in the imports cell also picks it up from
# a .env file); when unset or empty the original relative path is used.
# Joining with '' guarantees a trailing separator, so `DATA_DIR + DATA_FILE`
# keeps working even for an override written without one.
DATA_DIR = os.path.join(
    os.environ.get('DATA_DIR') or '../Development/gulas16/Data/',
    '',
)

# File name of the raw dataset inside DATA_DIR.
DATA_FILE = 'spotify_tracks_kaggle_weekly.csv'

In [4]:
# Build the path with os.path.join instead of string concatenation so the read
# does not depend on DATA_DIR ending with a path separator.
data = pd.read_csv(os.path.join(DATA_DIR, DATA_FILE))

Data splitting -> working with predefined $X_{train}$

In [5]:
# Seed handed to train_test_split so the train/test partition is reproducible.
RANDOM_STATE = 21
# Share of the rows held out as the test set (passed as test_size).
TEST_SIZE = 0.1

In [6]:
# Separate the raw frame into the target series and the feature matrix.
y = data['popularity']
X = data.drop(columns='popularity')

In [7]:
# Hold out TEST_SIZE of the rows for testing; RANDOM_STATE pins the split so
# it is the same on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=RANDOM_STATE, test_size=TEST_SIZE)

Dropping columns defined in EDA

In [8]:
# Columns removed from the feature matrices before modelling (selected during
# EDA); judging by their names these are identifiers, URLs and free-text titles.
drop_columns = ['track_id', 'artwork_url', 'track_url', 'track_name', 'album_name']

In [9]:
# Remove the EDA-rejected columns from both splits. errors='ignore' makes the
# cell safe to re-run after the columns are already gone.
X_train, X_test = (
    features.drop(columns=drop_columns, errors='ignore')
    for features in (X_train, X_test)
)

transformation pipeline

In [4]:
# Name of the target column (it is split off from the features when X and y
# are created).
target = 'popularity'

# Column groups, one per encoding strategy wired into the ColumnTransformer.
onehot_col = ['language']
circle_of_fifths_col = ['key']
artist_name_col = ['artist_name']

# Numeric features to be standardised. select_dtypes(include='number') is used
# instead of `dtypes != object` because the latter also lets string-dtype,
# categorical, boolean and datetime columns through, which StandardScaler
# cannot (or should not) scale. 'key' is excluded because it has its own
# encoder; 'mode' is excluded and not assigned to any transformer.
numeric_columns = list(
    X_train.select_dtypes(include='number').columns.difference(['key', 'mode'])
)

# Columns handed to ConvertNull(columns=...) as the first preprocessing step.
nan_columns = ['acousticness', 'danceability', 'energy', 'liveness', 'speechiness', 'tempo', 'valence']

NameError: name 'X_train' is not defined

In [3]:
# Numeric features are only standardised.
# NOTE(review): there is no imputation step although SimpleImputer and
# KNNImputer are imported - confirm whether ConvertNull can leave missing
# values that need to be filled before modelling.
numeric_pipeline = Pipeline(steps=[
    ('scaling', StandardScaler()),
])

# Artist names are encoded by FrequencyEncoder and the result is standardised.
artist_name_pipeline = Pipeline(steps=[
    ('encoding', FrequencyEncoder()),
    ('scaling', StandardScaler()),
])

# Per-column-group transformations. Columns that belong to none of the groups
# are discarded (remainder='drop').
transformations = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a language that was not present when the
        # encoder was fitted is encoded as all zeros instead of raising when
        # held-out data is transformed. Output on the fitted data is unchanged.
        ('onehot_encoding',
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         onehot_col),
        ('trigonometric_encoding', CircleOfFifthsEncoding(), circle_of_fifths_col),
        ('artist_encoding', artist_name_pipeline, artist_name_col),
        # The step name keeps its original spelling so that any existing
        # references to it (e.g. nested parameter names) stay valid.
        ('nummeric_processing', numeric_pipeline, numeric_columns),
    ],
    remainder='drop',
)

# Full preprocessing: ConvertNull runs first on nan_columns, then the
# column-wise transformations are applied.
preprocessing = Pipeline(steps=[
    ('null_values', ConvertNull(columns=nan_columns)),
    ('transformation', transformations),
])

NameError: name 'onehot_col' is not defined

transformation of data

In [12]:
# Fit the preprocessing pipeline on the training features and show the
# transformed result as a DataFrame (columns are positional, not named).
pd.DataFrame(data=preprocessing.fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500000,8.660254e-01,-1.136780,...,0.705974,-1.247168,-0.452275,-0.482662,0.021763,0.057611,0.411660,0.285070,-0.998566,0.577459
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.500000,-8.660254e-01,-0.751348,...,-0.232340,0.438704,-0.476210,0.788392,0.024609,-0.028825,-1.271181,0.285070,-0.773930,0.681340
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.000000,1.224647e-16,-0.558060,...,-0.086448,-0.321994,-0.476210,-0.111938,0.024680,-0.573281,1.007857,0.285070,-0.910996,0.889102
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.000000e+00,0.226529,...,-0.375460,0.594956,-0.476210,0.611857,0.025782,-0.443182,-0.632847,0.285070,-0.221858,0.473579
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.500000,-8.660254e-01,-0.323599,...,0.796792,-1.066245,-0.476053,-0.676263,0.023992,-0.534073,0.069415,0.285070,-0.773930,0.265817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56080,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.866025,-5.000000e-01,-0.878301,...,-0.943066,-0.860651,-0.476210,-0.447355,0.023165,-0.537637,-0.457916,-5.671029,-1.291735,0.577459
56081,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.866025,-5.000000e-01,-0.301868,...,0.090251,0.944466,-0.476167,-0.476778,0.023966,-0.353181,1.907194,0.285070,1.415320,0.369698
56082,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.500000,-8.660254e-01,-0.377353,...,-1.217829,1.413220,2.429753,0.152865,0.023702,-0.249815,0.356159,0.285070,0.082733,0.992982
56083,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.866025,5.000000e-01,-0.211514,...,0.522147,-0.794861,-0.475891,-0.653313,0.023778,-0.173181,-1.515118,0.285070,-0.023873,-0.149706
