## DVD Recommendation

In [3]:
import glob
import random
import re
import nltk
from stemming.porter2 import stem
from nltk.corpus import stopwords
import string
import numpy as np
import pandas as pd
import csv
import networkx as nx
from matplotlib import pylab
from matplotlib import pyplot as plt
import scipy.sparse

In [4]:
Metadata_File = open('amazon-meta.txt', 'r', encoding = 'utf-8', errors = 'ignore')
Amazon_Products = {}

In [5]:
# Initialize variables
(Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)

# Process each line in the file
for line in Metadata_File:
    line = line.strip()

    # Parsing the file contents
    if line.startswith("Id"):
        Id = line[3:].strip()
    elif line.startswith("ASIN"):
        ASIN = line[5:].strip()
    elif line.startswith("title"):
        Title = ' '.join(line[6:].strip().split())
    elif line.startswith("group"):
        Group = line[6:].strip()
    elif line.startswith("salesrank"):
        SalesRank = line[10:].strip()
    elif line.startswith("similar"):
        ls = line.split()
        Copurchased = ' '.join(ls[2:])
    elif line.startswith("categories"):
        ls = line.split()
        Categories = ' '.join((Metadata_File.readline()).lower() for i in range(int(ls[1].strip())))
        Categories = re.compile('[%s]' % re.escape(string.digits + string.punctuation)).sub(' ', Categories)
        Categories = ' '.join(set(Categories.split()) - set(stopwords.words("english")))
        Categories = ' '.join(stem(word) for word in Categories.split())
    elif line.startswith("reviews"):
        ls = line.split()
        TotalReviews = ls[2].strip()
        AvgRating = ls[7].strip()
    elif line == "":
        if ASIN:
            Amazon_Products[ASIN] = {
                'Id': Id,
                'Title': Title,
                'Categories': ' '.join(set(Categories.split())),
                'Group': Group,
                'Copurchased': Copurchased,
                'SalesRank': int(SalesRank),
                'TotalReviews': int(TotalReviews),
                'AvgRating': float(AvgRating),
                'DegreeCentrality': DegreeCentrality,
                'ClusteringCoeff': ClusteringCoeff
            }
        # Reset variables for the next product
        (Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)

# Close the file
Metadata_File.close()

In [7]:
Amazon_DVD = {}
for asin, metadata in Amazon_Products.items():
    if (metadata['Group'] == 'DVD'):
        Amazon_DVD[asin] = Amazon_Products[asin]

In [8]:
for asin, metadata in Amazon_DVD.items():
    Amazon_DVD[asin]['Copurchased'] = ' '.join([cp for cp in metadata['Copurchased'].split() if cp in Amazon_DVD.keys()])

In [9]:
Amazon_DVD

{'0790747324': {'Id': '21',
  'Title': 'The Time Machine',
  'Categories': 'warner today time amazon p dori art theme categori genr adventur dvds futurist hous lloyd monster travel countri titl studio intern featur pal drama sebastian alan georg sci special yvett director costum taylor specialti bissel outlet f h helmor l b fi mutant fantasi deal home paul cabot actor scienc tom kingdom rod c free classic video unit dvd fiction general young whit mimieux store com actress',
  'Group': 'DVD',
  'Copurchased': 'B00007JMD8 6305350221 B00004RF9B B00005JKFR B00005NG6A',
  'SalesRank': 795,
  'TotalReviews': 140,
  'AvgRating': 4.5,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0.0},
 'B000056PNC': {'Id': '37',
  'Title': 'Mark Messier - Leader, Champion & Legend',
  'Categories': 'featur dvd biographi hockey general genr special titl sport',
  'Group': 'DVD',
  'Copurchased': '',
  'SalesRank': 46018,
  'TotalReviews': 7,
  'AvgRating': 3.5,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0.0

In [13]:
# Initialize the graph for DVD co-purchases
dvd_co_purchase_graph = nx.Graph()

# Loop through each DVD item and its metadata
for dvd_id, details in Amazon_DVD.items():
    dvd_co_purchase_graph.add_node(dvd_id)  # Add each DVD as a node
    for related_dvd in details['Copurchased'].split():
        clean_id = related_dvd.strip()
        dvd_co_purchase_graph.add_node(clean_id)  # Add related DVDs as nodes
        similarity_measure = 0  # Initialize similarity score

        # Create sets of categories for the current and related DVD
        current_dvd_categories = set(details['Categories'].split())
        related_dvd_categories = set(Amazon_DVD[clean_id]['Categories'].split())

        # Calculate intersection and union of categories
        categories_intersection = current_dvd_categories & related_dvd_categories
        categories_union = current_dvd_categories | related_dvd_categories

        # Compute similarity score if intersection is non-empty
        if categories_intersection:
            similarity_measure = round(len(categories_intersection) / len(categories_union), 2)

        # Add edge with similarity score as weight
        dvd_co_purchase_graph.add_edge(dvd_id, clean_id, weight=similarity_measure)

In [14]:
# Calculate the degree centrality for each node in the graph
degree_centralities = nx.degree(dvd_co_purchase_graph) 

# Iterate through each node in the dvd co-purchase graph
for dvd_id in nx.nodes(dvd_co_purchase_graph):
    dvd_metadata = Amazon_DVD[dvd_id]  # Retrieve metadata for the DVD
    dvd_metadata['Degree_Centrality'] = int(degree_centralities[dvd_id])  # Assign degree centrality

    # Construct an ego graph for the current node
    dvd_ego_graph = nx.ego_graph(dvd_co_purchase_graph, dvd_id, radius=1) 

    # Calculate and assign the clustering coefficient
    dvd_metadata['Clustering_Coefficient'] = round(nx.average_clustering(dvd_ego_graph), 2)
    
    # Update the DVD metadata
    Amazon_DVD[dvd_id] = dvd_metadata


In [15]:
# Opening and preparing the file for writing DVD data
dvd_data_file = open('amazon-dvd-data.txt', 'w', encoding='utf-8', errors='ignore')

# Writing the header line to the file
header = ["ID", "ASIN", "Title", "Category", "Group", "Co-purchase", "Sales Rank", "Review Count", "Average Rating", "Degree Centrality", "Clustering Coefficient"]
dvd_data_file.write("\t".join(header) + "\n")

# Iterating over the Amazon DVD data to write each item's metadata
for dvd_asin, details in Amazon_DVD.items():
    dvd_data_file.write("\t".join([
        details['Id'], 
        dvd_asin, 
        details['Title'], 
        details['Categories'], 
        details['Group'], 
        details['Copurchased'], 
        str(details['SalesRank']), 
        str(details['TotalReviews']), 
        str(details['AvgRating']), 
        str(details['Degree_Centrality']), 
        str(details['Clustering_Coefficient'])
    ]) + "\n")

# Closing the file after writing is complete
dvd_data_file.close()


In [16]:
# Opening a file to write the weighted edge list of the DVD co-purchase graph
dvd_copurchase_edge_list_file = open("amazon-dvd-copurchase-edges.edgelist", 'wb')

# Writing the weighted edge list to the file using NetworkX
nx.write_weighted_edgelist(dvd_co_purchase_graph, dvd_copurchase_edge_list_file)

# Closing the file after writing
dvd_copurchase_edge_list_file.close()


In [17]:
# Opening the file containing Amazon DVD data for reading
dvd_data_file = open('amazon-dvd-data.txt', 'r', encoding='utf-8', errors='ignore')

# Dictionary to store DVD information
amazon_dvd_data = {}

# Skipping the header line
dvd_data_file.readline()

# Iterating over each line in the file to parse DVD information
for record in dvd_data_file:
    fields = record.split("\t")  # Splitting the line into fields
    dvd_info = {}  # Dictionary to hold individual DVD metadata

    # Assigning metadata fields to the dictionary
    dvd_info['ID'] = fields[0].strip()
    dvd_asin = fields[1].strip()
    dvd_info['Title'] = fields[2].strip()
    dvd_info['Category'] = fields[3].strip()
    dvd_info['Group'] = fields[4].strip()
    dvd_info['Co-purchase'] = fields[5].strip()
    dvd_info['Sales Rank'] = int(fields[6].strip())
    dvd_info['Review Count'] = int(fields[7].strip())
    dvd_info['Average Rating'] = float(fields[8].strip())
    dvd_info['Degree of Centrality'] = int(fields[9].strip())
    dvd_info['Clustering Coefficient'] = float(fields[10].strip())

    # Storing the DVD information in the main dictionary
    amazon_dvd_data[dvd_asin] = dvd_info

# Closing the file
dvd_data_file.close()


In [18]:
# Opening the file containing the DVD co-purchase edge list
dvd_copurchase_list_file = open("amazon-dvd-copurchase-edges.edgelist", "rb")

# Reading the weighted edge list to create a graph
dvd_copurchase_network = nx.read_weighted_edgelist(dvd_copurchase_list_file)

# Closing the file after reading
dvd_copurchase_list_file.close()


In [21]:
print("Recommendations for Customers Who Bought This DVD:")
Customer_Purchased_Asin = 'B00005R1O7'
print("ASIN: ", Customer_Purchased_Asin)
print("Title: ", amazon_dvd_data[Customer_Purchased_Asin]['Title'])
print("Sales Rank: ", amazon_dvd_data[Customer_Purchased_Asin]['Sales Rank'])
print("Total Reviews: ", amazon_dvd_data[Customer_Purchased_Asin]['Review Count'])
print("Average Rating: ", amazon_dvd_data[Customer_Purchased_Asin]['Average Rating'])
print("Degree of Centrality: ", amazon_dvd_data[Customer_Purchased_Asin]['Degree of Centrality'])
print("Clustering Coefficient: ", amazon_dvd_data[Customer_Purchased_Asin]['Clustering Coefficient'])


Recommendations for Customers Who Bought This DVD:
ASIN:  B00005R1O7
Title:  The Crawling Eye (Widescreen European Edition)
Sales Rank:  1899
Total Reviews:  42
Average Rating:  4.0
Degree of Centrality:  80
Clustering Coefficient:  0.57


In [22]:
n = Customer_Purchased_Asin
#Creating an ego graph
ego = nx.ego_graph(dvd_co_purchase_graph, n, radius = 1)
#converting ego graph into a new graph for further processing
Purchased_ASIN_Ego_Graph = nx.Graph(ego)

In [23]:
threshold = 0.5 # finding the nodes having similarity measure based on category above the threshold value
Purchased_ASIN_Ego_Trim_Graph = nx.Graph()
for f, t, e in Purchased_ASIN_Ego_Graph.edges(data = True):
    if e['weight'] >= threshold: 
        Purchased_ASIN_Ego_Trim_Graph.add_edge(f, t)

In [24]:
Purchased_ASIN_Neighbors = Purchased_ASIN_Ego_Trim_Graph.neighbors(Customer_Purchased_Asin)

In [25]:
ASIN_Meta = []
for asin in Purchased_ASIN_Neighbors:
    ASIN = asin
    Title = Amazon_DVD[ASIN]['Title']
    SalesRank = Amazon_DVD[ASIN]['SalesRank']
    TotalReviews = Amazon_DVD[ASIN]['TotalReviews']
    AvgRating = Amazon_DVD[ASIN]['AvgRating']
    DegreeCentrality = Amazon_DVD[ASIN]['DegreeCentrality']
    ASIN_Meta.append((ASIN, Title, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff))

In [26]:
Top5_ByAbgRating_ThenByTotalReviews = sorted(ASIN_Meta, key = lambda x: (x[4], x[3]), reverse = True)[:5]

In [31]:
print()
print("Top 5 Recommendations By Average Rating Then By TotalReviews for Users Purchased The DVD: ")
print('ASIN\t', 'Title\t', 'SalesRank\t', 'TotalReviews\t', 'AvgRating\t', 'DegreeCentrality\t', 'ClusteringCoeff')
for asin in Top5_ByAbgRating_ThenByTotalReviews:
    print(asin)

print()


Top 5 Recommendations By Average Rating Then By TotalReviews for Users Purchased The DVD: 
ASIN	 Title	 SalesRank	 TotalReviews	 AvgRating	 DegreeCentrality	 ClusteringCoeff
('B000059PPL', 'The Day the Earth Caught Fire', 5337, 16, 4.5, 6, 0.0)
('B00004WGCA', 'They Came from Beyond Space', 41155, 13, 3.5, 3, 0.0)
('6305772681', 'Devil Girl from Mars', 11644, 13, 3.5, 5, 0.0)
('B00005A0Q0', 'They Came from Beyond Space', 16133, 13, 3.5, 3, 0.0)
('B00004W19F', 'First Spaceship on Venus', 8052, 30, 3.0, 9, 0.0)

