Amazon US Customer Reviews - Link Analysis

In [None]:
from google.colab import files
import os
import json
import getpass

# Ask how the Kaggle credentials will be provided: an uploaded JSON file or typed in by hand
use_upload = input('Do you want to upload the Kaggle credentials file? (y/n): ')
if use_upload.strip().lower() == 'y':
    # Upload the file through the Colab file picker
    uploaded = files.upload()

    # Rename the first uploaded file whose name contains "kaggle" to KaggleCredential.json
    for name in uploaded.keys():
        if 'kaggle' in name.lower():
            os.rename(name, 'KaggleCredential.json')
            break
    else:
        # No upload matched: stop here with a clear message instead of failing below on a
        # missing file, or silently reusing a KaggleCredential.json left by a previous run.
        raise FileNotFoundError(
            "No uploaded file has 'kaggle' in its name; "
            "upload your kaggle.json or answer 'n' to type the credentials."
        )
else:
    # Manually input the credentials (the key is read without echoing it)
    username = input('Enter your Kaggle username: ')
    password = getpass.getpass('Enter your Kaggle key: ')

    # Save the credentials to a JSON file
    credentials = {'username': username, 'key': password}
    with open('KaggleCredential.json', 'w') as f:
        json.dump(credentials, f)

# Load the credentials from the JSON file (same code path for both branches)
with open('KaggleCredential.json', 'r') as f:
    credentials = json.load(f)

# Expose the Kaggle username and key for the following cells
username = credentials['username']
password = credentials['key']
print(username)

In [2]:
import getpass
import json

# Save the credentials as kaggle.json inside the Kaggle configuration directory
credentials = {'username': username, 'key': password}
kaggle_dir = '/root/.kaggle'
# exist_ok makes the cell safe to re-run without a separate existence check
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json, 'w') as f:
    json.dump(credentials, f)

# Set the file permissions to read/write only for the owner.
# os.chmod acts on the exact path written above, so it cannot drift from
# kaggle_dir the way a second hard-coded path in a shell command could.
os.chmod(kaggle_json, 0o600)

In [None]:
import kaggle
# Search Kaggle through its command-line client; the IPython `!` capture stores
# the command's output lines in `datasets`, which is then displayed.
datasets = !kaggle datasets list -s 'Amazon US Customer Reviews'
datasets

In [None]:
# Show the files of the main dataset; one of these names is passed to the download call further down
kaggle.api.dataset_list_files('cynthiarempel/amazon-us-customer-reviews-dataset').files # list of the files in the main dataset

In [6]:
# Import necessary libraries
import kaggle
import zipfile

# Authenticate the Kaggle API client.
# NOTE(review): earlier cells already use the kaggle CLI and kaggle.api before this
# explicit call — confirm whether it is still required at this point.
kaggle.api.authenticate()

In [None]:
# Download a single file of the dataset; the extraction step that follows opens
# 'amazon_reviews_us_Digital_Software_v1_00.tsv.zip' from the working directory.
kaggle.api.dataset_download_file('cynthiarempel/amazon-us-customer-reviews-dataset','amazon_reviews_us_Digital_Software_v1_00.tsv') # pick any file name from the listing above

In [8]:
# Unpack the downloaded archive into the current working directory
# (the root directory when running in Colab).
archive_name = 'amazon_reviews_us_Digital_Software_v1_00.tsv.zip'
with zipfile.ZipFile(archive_name, 'r') as archive:
    archive.extractall()  # no target given: members land in the current directory

In [None]:
#!pip3 install pyspark
import pyspark
from pyspark.sql import SparkSession

import pyspark.sql.functions as f
from pyspark.sql import Window

# Create the Spark session for this notebook, or reuse the one already running
spark = (
    SparkSession.builder
    .appName("Malchiodi's Project")
    .getOrCreate()
)
spark

Product Linkage

In [14]:
# Read the extracted TSV as an RDD of raw text lines, split into at least 8 partitions.
# NOTE(review): if the file starts with a header row it is read as a data line too;
# presumably the later "more than one value" filter drops it — confirm.
df = spark.sparkContext.textFile('amazon_reviews_us_Digital_Software_v1_00.tsv', minPartitions=8) # import as rdd dataset
# Disabled experiment kept by the author: union of several input RDDs.
# NOTE(review): it refers to `df0`, which is not defined anywhere in this notebook.
#df1 = spark.sparkContext.textFile('amazon_reviews_us_Digital_Software_v1_00.tsv', minPartitions=8)
#df = spark.sparkContext.union([df0, df1])

In [15]:
def parse_data(line):
    """Split one tab-separated line and return its 2nd and 4th fields.

    The notebook uses these two columns as the customer id and the product id.
    Raises IndexError when the line has fewer than four fields.
    """
    columns = line.split("\t")
    customer_id = columns[1]
    product_id = columns[3]
    return customer_id, product_id

In [16]:
# Product linkage: key each review by customer and collect that customer's products in a list.
# (For customer linkage the pair is swapped first:
#  df.map(parse_data).map(lambda x: (x[1], x[0])).groupByKey().mapValues(list))
df = (
    df.map(parse_data)
    .groupByKey()
    .mapValues(list)
)

In [17]:
# Keep only the keys with more than one grouped value: here, customers linked to more
# than one product. A single value cannot form a link.
# (The former trailing .map(lambda x: x) was an identity pass and has been removed.)
df = df.filter(lambda x: len(x[1]) > 1)

In [18]:
import itertools
def combination(row):
    """Return every directed link between the distinct values grouped under one key.

    Example: (0, [1, 2, 3]) --> (1, 2), (1, 3), (2, 3), (2, 1), (3, 1), (3, 2)

    Args:
        row: a ``(key, values)`` pair as produced by ``groupByKey().mapValues(list)``;
            only ``values`` is used.

    Returns:
        A list of 2-tuples: each unordered pair of distinct values (first-seen
        order) followed by the same pairs reversed, so every link appears once
        in each direction. Empty when fewer than two distinct values exist.
    """
    # Drop repeated values while keeping first-seen order. A key listing the same
    # value twice would otherwise produce self-links such as (a, a) and duplicated edges.
    unique_values = list(dict.fromkeys(row[1]))
    forward = list(itertools.combinations(unique_values, 2))
    backward = [(b, a) for a, b in forward]
    return forward + backward

In [19]:
# Expand every grouped row into its links and flatten the per-row lists into one RDD of pairs
df = df.flatMap(combination)

In [20]:
# Number of distinct nodes appearing as a link source.
# Only the keys are needed, so count the distinct keys instead of shuffling
# every value through groupByKey() just to discard it.
total_nodes = df.keys().distinct().count()

In [21]:
# Link list: an RDD of (source, target) pairs, one per directed link
id2id = df
# compute the out-degree for each node; countByKey brings the counts back as a dictionary
id2degree = id2id.countByKey() # for each key, the number of links leaving it

In [22]:
# Start from the uniform distribution: every node gets probability 1 / total_nodes
prods = list(id2degree)
p = 1 / total_nodes
p2diz = {prod: p for prod in prods}

In [23]:
# Sparse transition matrix: one (i, j, M_ij) triple per link, with M_ij = 1 / out-degree(i)
P = id2id.map(
    lambda edge: (edge[0], edge[1], 1 / id2degree[edge[0]])
)
# Same entries with the two node ids swapped: (j, i, M_ij)
PT = P.map(
    lambda entry: (entry[1], entry[0], entry[2])
)

In [24]:
# Power iteration: multiply the transition matrix by the probability vector 70 times.
# PT feeds a separate Spark action on every pass, so keep it in memory; without
# persistence Spark may recompute the steps that build PT on each of the 70 passes.
PT.cache()
for i in range(70):
    new_p = (
        PT.map(lambda x: (x[0], x[2] * p2diz[x[1]]))  # (j, M_ij * p_i): share sent from i to j
        .reduceByKey(lambda x, y: x + y)              # sum of the shares received by j
        .collect()
    )
    # Overwrite the vector only after the whole step has been collected,
    # so every pass is computed from the previous pass's values.
    for idx, prb in new_p:
        p2diz[idx] = prb

In [25]:
# Rank the nodes from the highest to the lowest probability
p2diz_sort = sorted(
    p2diz.items(),
    key=lambda item: item[1],
    reverse=True,
)

In [None]:
# Report the 20 highest-ranked products (fewer when the ranking is shorter)
print('Most quoted products:')
for code, prob in p2diz_sort[:20]:
    print(f'With prob: {prob}, you take product with code: {code}')

Customer Linkage

In [27]:
# Re-read the TSV as a fresh RDD of raw text lines (at least 8 partitions) for the customer-linkage analysis
df = spark.sparkContext.textFile('amazon_reviews_us_Digital_Software_v1_00.tsv', minPartitions=8)

In [28]:
def parse_data(line):
    """Return the 2nd and 4th tab-separated fields of ``line`` as a tuple.

    These columns are used by the notebook as customer id and product id.
    A line with fewer than four fields raises IndexError.
    """
    parts = line.split("\t")
    return (parts[1], parts[3])

In [29]:
# Customer linkage: swap each pair to (product, customer), then collect the
# customers of every product in a list.
df = (
    df.map(parse_data)
    .map(lambda pair: (pair[1], pair[0]))
    .groupByKey()
    .mapValues(list)
)

In [30]:
# Keep only the keys with more than one grouped value: here, products linked to more
# than one customer. A single value cannot form a link.
# (The former trailing .map(lambda x: x) was an identity pass and has been removed.)
df = df.filter(lambda x: len(x[1]) > 1)

In [31]:
import itertools
def combination(row):
    """Build the directed links among the distinct values grouped under one key.

    Example: ('p', ['a', 'b', 'c']) --> (a, b), (a, c), (b, c), (b, a), (c, a), (c, b)

    Args:
        row: a ``(key, values)`` pair as produced by ``groupByKey().mapValues(list)``;
            the key itself is not used.

    Returns:
        A list of 2-tuples containing each unordered pair of distinct values
        (first-seen order) and then the same pairs reversed. Empty when fewer
        than two distinct values are present.
    """
    # Remove repeated values (order preserved): the same value listed twice under
    # one key would otherwise create self-links (a, a) and duplicated edges.
    distinct_values = list(dict.fromkeys(row[1]))
    pairs = list(itertools.combinations(distinct_values, 2))
    mirrored = [(second, first) for first, second in pairs]
    return pairs + mirrored

In [32]:
# Turn every grouped row into its links and flatten the per-row lists into a single RDD of pairs
df = df.flatMap(combination)

In [33]:
# Number of distinct nodes appearing as a link source.
# Counting the distinct keys avoids shuffling all the values through
# groupByKey() only to throw them away.
total_nodes = df.keys().distinct().count()

In [34]:
# Link list: an RDD of (source, target) pairs, one per directed link
id2id = df
# compute the out-degree for each node: countByKey returns how many pairs start from each key
id2degree = id2id.countByKey()

In [35]:
# Uniform starting vector: each node begins with probability 1 / total_nodes
prods = list(id2degree)
p = 1 / total_nodes
p2diz = dict.fromkeys(prods, p)

In [36]:
# Sparse transition matrix as (i, j, M_ij) triples, where M_ij = 1 / out-degree(i)
P = id2id.map(
    lambda link: (link[0], link[1], 1 / id2degree[link[0]])
)
# Same triples with the two node ids swapped: (j, i, M_ij)
PT = P.map(
    lambda triple: (triple[1], triple[0], triple[2])
)

In [None]:
# Power iteration: multiply the transition matrix by the probability vector 70 times.
# PT is the input of a separate Spark action on every pass, so persist it once;
# otherwise Spark may recompute the steps that build PT on each of the 70 passes.
PT.cache()
for i in range(70):
    new_p = (
        PT.map(lambda x: (x[0], x[2] * p2diz[x[1]]))  # (j, M_ij * p_i): share sent from i to j
        .reduceByKey(lambda x, y: x + y)              # sum of the shares received by j
        .collect()
    )
    # Write the new values only once the full step is collected, so each pass
    # reads the vector produced by the previous one.
    for idx, prb in new_p:
        p2diz[idx] = prb

In [None]:
# Order the nodes by decreasing probability
p2diz_sort = sorted(
    p2diz.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

In [None]:
# Report the 20 highest-ranked customers (fewer when the ranking is shorter)
print('Most quoted customers:')
for code, prob in p2diz_sort[:20]:
    print(f'With prob: {prob}, you are similar to customer with code: {code}')