# Amazon Musical Instruments Review

In [1]:
import numpy as np, pandas as pd
import os, sys
import math
import shutil
import zipfile
import string
import random

# Paths and Variables

In [2]:
# Stem shared by every artefact this notebook writes (csv / zip names).
dataset_name = "musical_instruments"

In [3]:
# Raw download is read from input_dir; everything produced goes to output_dir.
input_dir, output_dir = "./raw", "./processed"

# Plain-CSV targets for the full set and for each split, built from one pattern.
outp_fname, outp_train_fname, outp_test_fname, outp_test_key_fname = (
    os.path.join(output_dir, f"{dataset_name}{split_suffix}.csv")
    for split_suffix in ("", "_train", "_test", "_test_key")
)

In [4]:
# File name of the raw reviews dump, looked up inside input_dir.
inp_fname = "Musical_instruments_reviews.csv"

# Read data into a DataFrame

In [5]:
# Load the raw reviews and take a first look at size and content.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
print(data.shape)
data.head()

(10261, 9)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [6]:
# Print per-column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10261 entries, 0 to 10260
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   reviewerID      10261 non-null  object 
 1   asin            10261 non-null  object 
 2   reviewerName    10234 non-null  object 
 3   helpful         10261 non-null  object 
 4   reviewText      10254 non-null  object 
 5   overall         10261 non-null  float64
 6   summary         10261 non-null  object 
 7   unixReviewTime  10261 non-null  int64  
 8   reviewTime      10261 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 721.6+ KB


In [7]:
# Column roles used by the rest of the notebook:
#   id_col     - key built from reviewerID and asin
#   target_col - star rating to predict
#   text_col   - combined summary + review text
id_col, target_col, text_col = "reviewerID_asin", "overall", "text"

In [8]:
# Class balance of the rating column.
rating_counts = data[target_col].value_counts()
rating_counts

5.0    6938
4.0    2084
3.0     772
2.0     250
1.0     217
Name: overall, dtype: int64

# Prepare Data

In [9]:
# Build the row id as "<reviewerID>_<asin>".
# Vectorised string concatenation replaces the row-wise apply(axis=1): it gives
# the same values without a Python-level loop over rows, and it also works on
# an empty frame (where apply(axis=1) returns a DataFrame that cannot be
# assigned to a single column).
data[id_col] = data["reviewerID"] + "_" + data["asin"]
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,reviewerID_asin
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014",A2IBPI20UZIR0U_1384719342
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013",A14VAT5EAX3D9S_1384719342
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013",A195EZSQDW3E21_1384719342
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014",A2C00NNG1ZQQG2_1384719342
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014",A94QU4C90B1AX_1384719342


In [10]:
# Build the model text as "<summary> <reviewText>".
# A few reviewText values are missing. Casting NaN with str() would inject the
# literal token "nan" into the text, so missing parts are treated as empty
# strings instead. Rows where both parts are present are produced exactly as
# before; rows with a missing part contain only the part that exists (no
# dangling separator).
summary_part = data["summary"].fillna("").astype(str)
review_part = data["reviewText"].fillna("").astype(str)
both_present = data["summary"].notna() & data["reviewText"].notna()

data[text_col] = (summary_part + " " + review_part).where(
    both_present, summary_part + review_part
)

In [11]:
# Keep only the id, the label and the combined text, in that order.
kept_columns = [id_col, target_col, text_col]
data = data.loc[:, kept_columns]
data.head()

Unnamed: 0,reviewerID_asin,overall,text
0,A2IBPI20UZIR0U_1384719342,5.0,"good Not much to write about here, but it does..."
1,A14VAT5EAX3D9S_1384719342,5.0,Jake The product does exactly as it should and...
2,A195EZSQDW3E21_1384719342,5.0,It Does The Job Well The primary job of this d...
3,A2C00NNG1ZQQG2_1384719342,5.0,GOOD WINDSCREEN FOR THE MONEY Nice windscreen ...
4,A94QU4C90B1AX_1384719342,5.0,No more pops when I record my vocals. This pop...


# Insert Id Column

In [12]:
# Fallback: if the frame has no id column yet, add a running integer id
# as the first column and show the result.
id_missing = id_col not in data.columns
if id_missing:
    row_count = len(data)
    data.insert(0, id_col, np.arange(row_count))
    print(data.head())

# Shuffle Data

In [13]:
# Randomly permute all rows; the fixed seed keeps the order reproducible.
shuffled = data.sample(frac=1, random_state=42)
data = shuffled
print(data.shape)
data.head()

(10261, 3)


Unnamed: 0,reviewerID_asin,overall,text
2507,AKAVVQMXSAIGX_B0002GWFEQ,5.0,GREAT solution for acoustic guitar I've been u...
5159,A6KYDNP84GGGJ_B000NGVQKO,4.0,Not crazy about them and took a long time to a...
932,A2MPM6M93OXIJT_B0002D0CLM,3.0,Not what I would grab to play with I recently ...
1190,ASJAKT8DJIAC5_B0002DV7U2,5.0,works great and easy to carry I have two of th...
2619,AJH2W783HOXZV_B0002GXZK4,5.0,Very Nice Guitar This guitar sounds awesome an...


# Utility to Save DF as a zipped file

In [14]:
def save_df_to_zipped_csv(df, ftype=None):
    """Write ``df`` as a single CSV member inside a zip archive in ``output_dir``.

    The archive is named ``<dataset_name>[_<ftype>].zip`` and contains one
    file, ``<dataset_name>[_<ftype>].csv``. The DataFrame index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    ftype : str, optional
        Split tag appended to the file stem (e.g. ``"train"``). When ``None``
        no suffix is added.

    Notes
    -----
    Reads the notebook-level ``dataset_name`` and ``output_dir`` variables.
    """
    suffix = f'_{ftype}' if ftype is not None else ''
    file_stem = f'{dataset_name}{suffix}'

    # to_csv does not create missing parent folders, so make sure the output
    # directory exists; otherwise a fresh checkout fails on the first save.
    os.makedirs(output_dir, exist_ok=True)

    compression_opts = dict(method='zip', archive_name=f'{file_stem}.csv')
    df.to_csv(
        os.path.join(output_dir, f'{file_stem}.zip'),
        index=False,
        compression=compression_opts,
    )

# Save Main Data File

In [15]:
# Persist the full shuffled dataset as a zipped CSV (no split suffix).
save_df_to_zipped_csv(data, ftype=None)

# Train Test Split

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; the seed fixes the partition.
test_size = 0.1
data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)

# The answer key keeps id + label; the test file itself carries no label.
data_test_key = data_test.loc[:, [id_col, target_col]].copy()
data_test = data_test.drop(target_col, axis=1)

print(data_train.shape, data_test.shape, data_test_key.shape)

(9234, 3) (1027, 2) (1027, 2)


In [17]:
# Write each split to its own zipped CSV: train, test, then the test answer key.
for split_tag, split_frame in (
    ("train", data_train),
    ("test", data_test),
    ("test_key", data_test_key),
):
    save_df_to_zipped_csv(split_frame, split_tag)