# MLOps using only standard Python

## Configuration

In [17]:
data_directory = 'data'
init_data_url = 'https://github.com/wodecki/datasets/blob/main/genres_classification/genres_mod.parquet?raw=true'
cleaned_data_filename = 'processed_data.csv'

## If required - download environment packages

In [18]:
# !conda install pandas os pyarrow fastparquet

## Import libraries

In [19]:
import pandas as pd
import numpy as np
import os

## Prepare folders

In [20]:
if not os.path.exists(data_directory):
    os.makedirs(data_directory)

## 1. Load data

In [21]:
df = pd.read_parquet(init_data_url)
df.sample(5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,duration_ms,time_signature,genre,song_name,title
4007,0.279,0.277,11,-17.168,0,0.085,0.773,0.91,0.138,0.0819,165.792,audio_features,259765,4,Dark Trap,Kenopsia,
17352,0.735,0.482,8,-9.445,1,0.139,0.00623,0.0144,0.448,0.315,181.995,audio_features,217933,4,RnB,In My Feelings,
36462,0.56,0.943,1,-4.606,0,0.08,0.00181,0.151,0.0975,0.0382,174.034,audio_features,309306,4,dnb,,Liquid Drum & Bass
39486,0.228,0.843,11,-5.711,0,0.0818,0.297,0.424,0.0958,0.0423,149.692,audio_features,233600,4,hardstyle,,Hardstyle by Q-dance
24223,0.805,0.943,11,-6.503,0,0.0563,0.000749,0.868,0.679,0.856,126.0,audio_features,249545,4,techhouse,,Dirtybird//Techhouse by Mack\


## Explore the data

In [22]:
print('''
Data has {} columns and {} rows\n\n 
Data types:\n{}\n\n
Data summary:\n{}'''
.format(df.columns.values, df.shape[0], df.dtypes, df.describe()))


Data has ['danceability' 'energy' 'key' 'loudness' 'mode' 'speechiness'
 'acousticness' 'instrumentalness' 'liveness' 'valence' 'tempo' 'type'
 'duration_ms' 'time_signature' 'genre' 'song_name' 'title'] columns and 42896 rows

 
Data types:
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
duration_ms           int64
time_signature        int64
genre                object
song_name            object
title                object
dtype: object


Data summary:
       danceability        energy           key      loudness          mode  \
count  42896.000000  42896.000000  42896.000000  33726.000000  42896.000000   
mean       0.639336      0.762560      5.368566     -6.463899      0.549492   
std        0.1

In [23]:
df.isnull().sum()
# We can ommit empty song name and titles and fill this with empty values as they are not a part of our analysis.
# It does not provide any value to our case.

danceability            0
energy                  0
key                     0
loudness             9170
mode                    0
speechiness             0
acousticness            0
instrumentalness        0
liveness                0
valence                 0
tempo                   0
type                    0
duration_ms             0
time_signature          0
genre                   0
song_name           21085
title               21817
dtype: int64

## Data cleansing

In [24]:
df_clean = df.copy()
print('Number of rows before cleansing: {}'.format(df_clean.size))

Number of rows before cleansing: 729232


### Deduplication

In [25]:
print('Data size before deduplication: {}'.format(df_clean.shape[0]))
df_clean = df_clean.drop_duplicates().reset_index(drop=True)
print('Data size after deduplication: {}'.format(df_clean.shape[0]))

Data size before deduplication: 42896
Data size after deduplication: 41975


### Handle NaN values

In [26]:
print('Number of NaN titles before replacement: {}'.format(df_clean['title'].isnull().sum()))
df_clean['title'].fillna(value='', inplace=True)
print('Number of NaN titles after replacement: {}'.format(df_clean['title'].isnull().sum()))

Number of NaN titles before replacement: 21240
Number of NaN titles after replacement: 0


In [27]:
print('Number of NaN song names before replacement: {}'.format(df_clean['song_name'].isnull().sum()))
df_clean['song_name'].fillna(value='', inplace=True)
print('Number of NaN song names after replacement: {}'.format(df_clean['song_name'].isnull().sum()))

Number of NaN song names before replacement: 20741
Number of NaN song names after replacement: 0


In [28]:
loudness_median = df_clean['loudness'].median()

print('Number of NaN loudness before replacement: {}'.format(df_clean['loudness'].isnull().sum()))
df_clean['loudness'].fillna(value=loudness_median, inplace=True)
print('Number of NaN loudness after replacement: {}'.format(df_clean['loudness'].isnull().sum()))

Number of NaN loudness before replacement: 9007
Number of NaN loudness after replacement: 0


In [29]:
print(df_clean.isnull().sum())
print(df_clean.shape)

danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
type                0
duration_ms         0
time_signature      0
genre               0
song_name           0
title               0
dtype: int64
(41975, 17)


## Feature engineering

In [30]:
df_clean['text_feature'] = df_clean['title'] + ' ' + df_clean['song_name']

## Save cleaned data

### Compare data dimensionality

In [31]:
print('Data before: {}\nData after: {}'.format(df.shape, df_clean.shape))

Data before: (42896, 17)
Data after: (41975, 18)


In [32]:
data_file = os.path.join(data_directory, cleaned_data_filename)
df_clean.to_csv(data_file, index=False)

In [33]:
df_clean.describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
count,41975.0,41975.0,41975.0,41975.0,41975.0,41975.0,41975.0,41975.0,41975.0,41975.0,41975.0,41975.0,41975.0
mean,0.63938,0.762855,5.371674,-6.418807,0.549184,0.13644,0.095894,0.284569,0.21419,0.35652,147.42499,251211.48324,3.972555
std,0.156495,0.183793,3.666697,2.609506,0.497581,0.126124,0.170647,0.371246,0.175699,0.233086,23.79437,103140.264753,0.268073
min,0.0651,0.000243,0.0,-32.929,0.0,0.0227,1e-06,0.0,0.0107,0.0187,57.967,25600.0,1.0
25%,0.524,0.632,1.0,-7.559,0.0,0.0491,0.00172,0.0,0.0996,0.161,129.9155,179998.0,4.0
50%,0.646,0.804,6.0,-6.235,1.0,0.0754,0.0163,0.00645,0.135,0.321,144.962,224951.0,4.0
75%,0.766,0.923,9.0,-5.0455,1.0,0.193,0.106,0.725,0.2945,0.521,160.976,302013.0,4.0
max,0.988,1.0,11.0,3.148,1.0,0.946,0.988,0.989,0.988,0.988,220.29,913052.0,5.0
