In [2]:
from sklearn import preprocessing
import pandas as pd
from functions import *

In [3]:
# read in the Top100 and Top20 of all 8 countries into 2 dataframes
top20 = pd.read_csv('data/Top20_8_countries_backward.csv', header=0, index_col=False)
top100 = pd.read_csv('data/Top100_8_countries_backward.csv', header=0, index_col=False)

In [4]:
# create 2 new dataframe for the Top100 and Top20 of Japan 
top20_jp = top20.query('region == "Japan"').copy()
top100_jp = top100.query('region == "Japan"').copy()

In [5]:
# show row and column count
display(top20_jp.shape, top100_jp.shape)

(29155, 8)

(145790, 8)

In [6]:
# show column names
top20_jp.columns.to_list

<bound method IndexOpsMixin.tolist of Index(['title', 'rank', 'date', 'artist', 'track_id', 'region', 'trend',
       'streams'],
      dtype='object')>

In [8]:
# split the dataframes
top20_jp_streams = top20_jp.copy()
top20_jp_rest = top20_jp.copy()
top100_jp_streams = top100_jp.copy()
top100_jp_rest = top100_jp.copy()

In [9]:
# remove duplicated columns for later merge 
top20_jp_rest.drop('streams', axis=1, inplace=True)
top20_jp_streams.drop(['title', 'rank', 'date', 'artist', 'track_id', 'region', 'trend'], axis=1, inplace=True)
top100_jp_rest.drop('streams', axis=1, inplace=True)
top100_jp_streams.drop(['title', 'rank', 'date', 'artist', 'track_id', 'region', 'trend'], axis=1, inplace=True)

In [10]:
# check the top of the dataframes
display(top20_jp_rest.head(3), top20_jp_streams.head(3), top100_jp_rest.head(3), top100_jp_streams.head(3))

Unnamed: 0,title,rank,date,artist,track_id,region,trend
40,Closer,1,2017-01-01,"The Chainsmokers, Halsey",7BKLCZ1jbUBVqRi2FVlTVw,Japan,SAME_POSITION
41,I Feel It Coming,2,2017-01-01,"The Weeknd, Daft Punk",5GXAXm5YOmYT0kL5jHvYBt,Japan,MOVE_UP
42,Let Me Love You,3,2017-01-01,"DJ Snake, Justin Bieber",4pdPtRcBmOSQDlJ3Fk945m,Japan,MOVE_DOWN


Unnamed: 0,streams
40,6342.0
41,5697.0
42,5149.0


Unnamed: 0,title,rank,date,artist,track_id,region,trend
200,Closer,1,2017-01-01,"The Chainsmokers, Halsey",7BKLCZ1jbUBVqRi2FVlTVw,Japan,SAME_POSITION
201,I Feel It Coming,2,2017-01-01,"The Weeknd, Daft Punk",5GXAXm5YOmYT0kL5jHvYBt,Japan,MOVE_UP
202,Let Me Love You,3,2017-01-01,"DJ Snake, Justin Bieber",4pdPtRcBmOSQDlJ3Fk945m,Japan,MOVE_DOWN


Unnamed: 0,streams
200,6342.0
201,5697.0
202,5149.0


In [11]:
# normalizing, scaling the min to 0 and the max to 1
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(top20_jp_streams)
scaled_t20 = pd.DataFrame(d, columns=top20_jp_streams.columns)

In [12]:
# the same for the Top100
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(top100_jp_streams)
scaled_t100 = pd.DataFrame(d, columns=top100_jp_streams.columns)

In [13]:
# reset the index starting from 0 for the Top100
top100_jp_rest.reset_index(drop=True, inplace=True)

In [14]:
# check the dataframe
display(scaled_t100.head(3), top100_jp_rest.head(3))

Unnamed: 0,streams
0,0.014273
1,0.012515
2,0.011021


Unnamed: 0,title,rank,date,artist,track_id,region,trend
0,Closer,1,2017-01-01,"The Chainsmokers, Halsey",7BKLCZ1jbUBVqRi2FVlTVw,Japan,SAME_POSITION
1,I Feel It Coming,2,2017-01-01,"The Weeknd, Daft Punk",5GXAXm5YOmYT0kL5jHvYBt,Japan,MOVE_UP
2,Let Me Love You,3,2017-01-01,"DJ Snake, Justin Bieber",4pdPtRcBmOSQDlJ3Fk945m,Japan,MOVE_DOWN


In [15]:
# reset the index for the Top100
top20_jp_rest.reset_index(drop=True, inplace=True)

In [16]:
# check the dataframe
display(scaled_t20.head(3), top20_jp_rest.head(3))

Unnamed: 0,streams
0,0.008905
1,0.007137
2,0.005635


Unnamed: 0,title,rank,date,artist,track_id,region,trend
0,Closer,1,2017-01-01,"The Chainsmokers, Halsey",7BKLCZ1jbUBVqRi2FVlTVw,Japan,SAME_POSITION
1,I Feel It Coming,2,2017-01-01,"The Weeknd, Daft Punk",5GXAXm5YOmYT0kL5jHvYBt,Japan,MOVE_UP
2,Let Me Love You,3,2017-01-01,"DJ Snake, Justin Bieber",4pdPtRcBmOSQDlJ3Fk945m,Japan,MOVE_DOWN


In [17]:
# show row and column count
display(top20_jp_rest.shape, scaled_t20.shape, top100_jp_rest.shape, scaled_t100.shape)

(29155, 7)

(29155, 1)

(145790, 7)

(145790, 1)

In [18]:
# recombine the normalized data
t20_final = pd.concat([top20_jp_rest, scaled_t20], axis=1)
t100_final = pd.concat([top100_jp_rest, scaled_t100], axis=1)

In [19]:
# show row and column count
display(t20_final.shape, t100_final.shape)

(29155, 8)

(145790, 8)

In [20]:
# check the last rows of the dataframe
display(t20_final.tail(3), t100_final.tail(3))

Unnamed: 0,title,rank,date,artist,track_id,region,trend,streams
29152,LIKEY,18,2018-01-31,TWICE,4Eb5cLWxKQUoUQaKUYSRgI,Japan,MOVE_DOWN,0.027183
29153,愛唄,19,2018-01-31,GReeeeN,7iXHLtPosbHvglVSJ4rxLq,Japan,SAME_POSITION,0.026257
29154,君が好き,20,2018-01-31,Shota Shimizu,5Bg0HkfaRYRx8A7tMrjT9Z,Japan,MOVE_UP,0.025969


Unnamed: 0,title,rank,date,artist,track_id,region,trend,streams
145787,rockstar,98,2018-01-31,"Post Malone, 21 Savage",7wGoVu4Dady5GV0Sv4UIsx,Japan,MOVE_UP,0.013431
145788,Thunder,99,2018-01-31,Imagine Dragons,0tKcYR2II1VCQWT79i5NrW,Japan,MOVE_DOWN,0.013387
145789,First Love,100,2018-01-31,Hikaru Utada,3MmAtS5q33iKDcRvu6U5p3,Japan,MOVE_DOWN,0.013295


In [21]:
# export the normalized data to csv
save_to_csv(t20_final,'Top20_Japan_normalized')
save_to_csv(t100_final,'Top100_Japan_normalized')

PosixPath('/Users/inthesea/Downloads/Top100_Japan_normalized.csv')