In [None]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns

### Import the Scrapy JSON file

In [None]:
# Load the scraped review records into the raw, untouched frame
df_raw = pd.read_json(path_or_buf="scraped_reviews.json")

In [None]:
# Number of scraped records
df_raw.shape[0]

In [None]:
#df_raw.head()

In [None]:
# Summary statistics of the raw frame (describe() covers numeric columns by default)
df_raw.describe()

__Plot a histogram of what we are trying to predict ("Score")__

In [None]:
# Distribution of the prediction target; the trailing ';' hides the Axes repr.
# NOTE(review): distplot is deprecated in recent seaborn releases - consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(df_raw["score"]);

## Tidy up some of the data

### Convert "publish_date" to datetime format and drop the old column

In [None]:
# Parse the timestamp strings into a datetime64 column.
# The format has no AM/PM marker, so the hour must be read with %H (00-23);
# the previous %I (01-12) rejects hours 13-23 and 00 and reads 12 as midnight.
df_raw["pub_date"] = pd.to_datetime(df_raw["publish_date"], format="%Y-%m-%dT%H:%M:%S")

In [None]:
# The string column is no longer needed after parsing.
# Note: this rebinding makes the cell non-idempotent - a second run fails
# because 'publish_date' has already been removed.
df_raw = df_raw.drop(columns=['publish_date'])

### Drop reviews whose artist is "None" (compilations, original soundtracks, etc.)

In [None]:
# Most frequent artist values (value_counts sorts descending by default)
df_raw['artist'].value_counts().head()

In [None]:
# Keep only reviews whose artist is not the literal string "None".
# .copy() makes df an independent frame, so the column assignments in later
# cells do not operate on a slice of df_raw (SettingWithCopyWarning).
df = df_raw[df_raw.artist != "None"].copy()

In [None]:
# Rows remaining after the filter
df.shape[0]

In [None]:
# Sanity check: this should display an empty frame
df[df.artist.eq("None")]

### Create a new column "author_review_count"

In [None]:
# Number of reviews per author, keyed by author name
author_counts = df.author.value_counts()
author_count_dict = author_counts.to_dict()

In [None]:
# Attach each author's total review count to every one of their rows
df['author_review_count'] = df.author.map(author_count_dict)

The df can now be filtered for reviews whose author has written 10 or more reviews

In [None]:
#df.loc[df['author_review_count'] > 9]

### Create a new column "total_number_releases"

In [None]:
# Number of reviewed releases per artist, keyed by artist name
artist_counts = df.artist.value_counts()
artist_count_dict = artist_counts.to_dict()

In [None]:
# Attach each artist's total number of reviewed releases to every one of their rows
df['total_number_releases'] = df.artist.map(artist_count_dict)

The df can now be filtered on the number of releases per artist (e.g. artists with more than 1 release)

In [None]:
#df.loc[df['total_number_releases'] > 19]

### Create a new column "release_number"

In [None]:
# Rank every review's pub_date within its artist, then join the ranks back on
# the row index. Both frames carry a 'pub_date' column, so the merge suffixes
# them: 'pub_date_x' is the original date, 'pub_date_y' is the rank.
# Note: not idempotent - after one run 'pub_date' no longer exists under that name.
release_rank = df[['artist', 'pub_date']].groupby(['artist']).rank()
df = df.merge(release_rank, left_index=True, right_index=True)

In [None]:
#df.head()

Copy "pub_date_y" (the rank) into a new integer column "release_number", then drop "pub_date_y"

In [None]:
# Store the within-artist rank as an integer release counter.
# NOTE(review): rank() was called with its default tie handling, so releases
# sharing a pub_date may get fractional ranks that this cast truncates - confirm
# that is acceptable.
df['release_number'] = df.pub_date_y.astype(int)

In [None]:
# The float rank column is redundant once release_number exists
df = df.drop(columns=['pub_date_y'])

In [None]:
#df.loc[df['artist'] == 'Animal Collective'].sort_values('release_number')

### Create a new column "cum_mean_score" (intended as the artist's prior mean score)

In [None]:
#df.head()

In [None]:
#df.loc[df['artist'] == 'Animal Collective']

In [None]:
# Placeholder column, overwritten row by row in the next cell.
# Created explicitly as float (rather than copying score's dtype) because the
# column must be able to hold NaN for artists' first releases.
df['cum_mean_score'] = np.zeros(len(df), dtype=float)

In [None]:
# For every review, the artist's mean score over releases that came strictly
# BEFORE it (release_number smaller than the current one).
#
# Changes from the previous version of this cell:
#   * The comparison is '<' instead of '<='. With '<=' the current review's
#     own score - the value being predicted - was averaged into the feature.
#   * Values are written with a single .loc assignment instead of the chained
#     df[col][i] = ... form, which is unreliable (SettingWithCopyWarning) and
#     has no effect under pandas copy-on-write.
#   * Each artist's rows are selected once via groupby instead of re-scanning
#     the whole frame for every row.
# Assumes the row index is unique (as the previous per-label writes did).
prior_mean_by_row = {}
for _, artist_reviews in df.groupby('artist', sort=False):
    rel_nos = artist_reviews['release_number']
    scores = artist_reviews['score']
    for row_label, curr_rel_no in rel_nos.items():
        # A first release has no earlier rows; the mean of an empty selection
        # is NaN, so it needs no special case.
        prior_mean_by_row[row_label] = scores[rel_nos < curr_rel_no].mean()

artist_prior_means = pd.Series(prior_mean_by_row, dtype=float)
df.loc[artist_prior_means.index, 'cum_mean_score'] = artist_prior_means

In [None]:
# Rank every review's date within its author, then join the ranks back on the
# row index. Both frames carry a 'pub_date_x' column, so the merge suffixes
# them: 'pub_date_x_x' is the date, 'pub_date_x_y' is the rank.
# Note: not idempotent - after one run 'pub_date_x' no longer exists under that name.
review_rank = df[['author', 'pub_date_x']].groupby(['author']).rank()
df = df.merge(review_rank, left_index=True, right_index=True)

In [None]:
#df.loc[df['author'] == 'Philip Sherburne']

In [None]:
# Store the within-author rank as an integer review counter.
# NOTE(review): rank() was called with its default tie handling, so reviews
# sharing a date may get fractional ranks that this cast truncates - confirm
# that is acceptable.
df['review_number'] = df.pub_date_x_y.astype(int)

In [None]:
# The float rank column is redundant once review_number exists
df = df.drop(columns=['pub_date_x_y'])

In [None]:
#df.loc[df['author'] == 'Philip Sherburne']

In [None]:
# Placeholder column, overwritten row by row in the next cell.
# Created explicitly as float (rather than copying score's dtype) because the
# column must be able to hold NaN for authors' first reviews.
df['cum_mean_score_author'] = np.zeros(len(df), dtype=float)

In [None]:
# For every review, the author's mean score over reviews they wrote strictly
# BEFORE it (review_number smaller than the current one).
#
# Changes from the previous version of this cell:
#   * The comparison is '<' instead of '<='. With '<=' the current review's
#     own score - the value being predicted - was averaged into the feature.
#   * Values are written with a single .loc assignment instead of the chained
#     df[col][i] = ... form, which is unreliable (SettingWithCopyWarning) and
#     has no effect under pandas copy-on-write.
#   * Each author's rows are selected once via groupby instead of re-scanning
#     the whole frame for every row.
# Assumes the row index is unique (as the previous per-label writes did).
author_prior_mean_by_row = {}
for _, author_reviews in df.groupby('author', sort=False):
    rev_nos = author_reviews['review_number']
    scores = author_reviews['score']
    for row_label, curr_rev in rev_nos.items():
        # A first review has no earlier rows; the mean of an empty selection
        # is NaN, so it needs no special case.
        author_prior_mean_by_row[row_label] = scores[rev_nos < curr_rev].mean()

author_prior_means = pd.Series(author_prior_mean_by_row, dtype=float)
df.loc[author_prior_means.index, 'cum_mean_score_author'] = author_prior_means

In [None]:
#df.loc[df['author'] == 'Philip Sherburne']

In [None]:
#df = df.drop('author_mean_score', axis=1)

In [None]:
# Preview the engineered columns
df.head(n=5)

### Pull in additional data ("spotify_popularity" from Spotify API)

Refer to Jupyter Notebook "spotipy.ipynb"

In [None]:
# NOTE(review): pickle.load can execute arbitrary code - only load a
# spotify.pkl that was produced by a trusted run of "spotipy.ipynb".
with open("spotify.pkl", mode='rb') as spotify_file:
    spotify_dictionary = pickle.load(spotify_file)

In [None]:
# Number of entries loaded from spotify.pkl
len(spotify_dictionary)

In [None]:
# Look up each row's artist in the loaded Spotify mapping
df['spotify_popularity'] = df.artist.map(spotify_dictionary)

In [None]:
#df.head()

In [None]:
# How many rows have no Spotify popularity value
int(df['spotify_popularity'].isna().sum())

### Create a pickle of this dataframe for future processing

In [None]:
# Persist the engineered frame for the downstream notebooks
with open('review2_2_df.pkl', mode='wb') as out_file:
    pickle.dump(df, out_file)