## Importing the required modules

In [49]:
import string
import re
from nltk.corpus import stopwords
from stemming.porter2 import stem
import networkx 
from operator import itemgetter

## Accessing the meta-data

In [2]:
Metadata_File = open('amazon-meta.txt', 'r', encoding = 'utf-8', errors = 'ignore')
Amazon_Products = {}

In [3]:
(Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = \
    ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)

for line in Metadata_File:
   
    line = line.strip()

    if(line.startswith("Id")):
        Id = line[3:].strip()

    elif(line.startswith("ASIN")):
        ASIN = line[5:].strip()

    elif(line.startswith("title")):
        Title = line[6:].strip()
        Title = ' '.join(Title.split())

    elif(line.startswith("group")):
        Group = line[6:].strip()

    elif(line.startswith("salesrank")):
        SalesRank = line[10:].strip()

    elif(line. startswith("similar")):
        ls = line.split()
        Copurchased = ' '.join([c for c in ls[2:]])

    elif(line.startswith("categories")):
        ls = line.split()
        Categories = ' '.join((Metadata_File.readline()).lower() for i in range(int(ls[1].strip())))
        Categories = re.compile('[%s]' % re.escape(string.digits + string.punctuation)).sub(' ', Categories)
        Categories = ' '.join(set(Categories.split()) - set(stopwords.words("english")))
        Categories = ' '.join(stem(word) for word in Categories.split())

    elif(line.startswith("reviews")):
        ls = line.split()
        TotalReviews = ls[2].strip()
        AvgRating = ls[7].strip()

    elif(line == ""):
        try:
            MetaData = {}
            if (ASIN != ""):
                Amazon_Products[ASIN] = MetaData   
            MetaData['Id'] = Id
            MetaData['Title'] = Title
            MetaData['Categories'] = ' '.join(set(Categories.split()))
            MetaData['Group'] = Group
            MetaData['Copurchased'] = Copurchased
            MetaData['SalesRank'] = int(SalesRank)
            MetaData['TotalReviews'] = int(TotalReviews)
            MetaData['AvgRating'] = float(AvgRating)
            MetaData['DegreeCentrality'] = DegreeCentrality
            MetaData['ClusteringCoeff'] = ClusteringCoeff
        except NameError:
            continue
    (Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = \
        ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)

Metadata_File.close()

## Filtering out data related to book records

In [4]:
Amazon_Books = {}
for asin, metadata in Amazon_Products.items():
    if (metadata['Group'] == 'Book'):
        Amazon_Books[asin] = Amazon_Products[asin]

In [5]:
for asin, metadata in Amazon_Books.items():
    Amazon_Books[asin]['Copurchased'] = ' '.join([cp for cp in metadata['Copurchased'].split() if cp in Amazon_Books.keys()])

In [6]:
Amazon_Books

{'0827229534': {'Id': '1',
  'Title': 'Patterns of Preaching: A Sermon Sampler',
  'Categories': 'spiritu religion subject sermon christian book preach clergi',
  'Group': 'Book',
  'Copurchased': '0804215715 156101074X 0687023955 0687074231 082721619X',
  'SalesRank': 396585,
  'TotalReviews': 2,
  'AvgRating': 5.0,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0.0},
 '0738700797': {'Id': '2',
  'Title': 'Candlemas: Feast of Flames',
  'Categories': 'spiritu religion subject base book witchcraft wicca earth',
  'Group': 'Book',
  'Copurchased': '0738700827 1567184960 1567182836 0738700525 0738700940',
  'SalesRank': 168596,
  'TotalReviews': 12,
  'AvgRating': 4.5,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0.0},
 '0486287785': {'Id': '3',
  'Title': 'World War II Allied Fighter Planes Trading Cards',
  'Categories': 'hobbi garden subject craft book general home',
  'Group': 'Book',
  'Copurchased': '',
  'SalesRank': 1270652,
  'TotalReviews': 1,
  'AvgRating': 5.0,
  'DegreeCentr

## Adding recommendation property

In [20]:
Copurchase_Graph = networkx.Graph() 
for asin, metadata in Amazon_Books.items():
    Copurchase_Graph.add_node(asin) 
    for a in metadata['Copurchased'].split():
        Copurchase_Graph.add_node(a.strip()) 
        similarity = 0 
        n1 = set((Amazon_Books[asin]['Categories']).split())
        n2 = set((Amazon_Books[a]['Categories']).split()) 
        n1In2 = n1 & n2           
        n1Un2 = n1 | n2
        if(len(n1In2)) > 0:
            similarity = round(len(n1In2) / len(n1Un2), 2) 
            Copurchase_Graph.add_edge(asin, a.strip(), weight = similarity)

In [23]:
dc = networkx.degree(Copurchase_Graph) 
for asin in networkx.nodes(Copurchase_Graph):
    metadata = Amazon_Books[asin] 
    metadata['DegreeCentrality'] = int(dc[asin]) 
    ego = networkx.ego_graph(Copurchase_Graph, asin, radius = 1) 
    metadata['ClusteringCoeff'] = round(networkx.average_clustering(ego), 2)
    Amazon_Books[asin] = metadata 

In [24]:
Amazon_Books_File = open('amazon-books.txt', 'w', encoding = 'utf-8', errors = 'ignore')

Amazon_Books_File.write("Id\t" + "ASIN\t" + "Title\t" + "Categories\t" + "Group\t" + "Copurchased\t" + "SalesRank\t" + "TotalReviews\t" + "AvgRating\t" "DegreeCentrality\t" + 
                        "ClusteringCoeff\n") 

for asin, metadata in Amazon_Books.items(): # converting the meta-data into txt file
     Amazon_Books_File.write(metadata['Id'] + "\t" + \
                             asin + "\t" +  \
                             metadata['Title'] + "\t" + \
                             metadata['Categories'] + "\t" + \
                             metadata['Group'] + "\t" +  \
                             metadata['Copurchased'] + "\t" + \
                             str(metadata['SalesRank']) + "\t" + \
                             str(metadata['TotalReviews']) + "\t" + 
                             str(metadata['AvgRating']) + "\t" + \
                             str(metadata['DegreeCentrality']) + "\t" + \
                             str(metadata['ClusteringCoeff']) + "\n")

Amazon_Books_File.close()

In [25]:
# writing the adjacency edge list
Amazon_Books_File = open("amazon-books-copurchase.edgelist", 'wb') 
networkx.write_weighted_edgelist(Copurchase_Graph, Amazon_Books_File) 
Amazon_Books_File.close() 

## Reading the text file

In [27]:
Books_File = open('amazon-books.txt', 'r', encoding = 'utf-8', errors = 'ignore')
Books = {}
Books_File.readline()
for line in Books_File:
    cell = line.split("\t") 
    MetaData = {}
    MetaData['Id'] = cell[0].strip() 
    ASIN = cell[1].strip()
    MetaData['Title'] = cell[2].strip() 
    MetaData['Categories'] = cell[3].strip() 
    MetaData['Group'] = cell[4].strip() 
    MetaData['Copurchased'] = cell[5].strip()
    MetaData['SalesRank'] = int(cell[6].strip())
    MetaData['TotalReviews'] = int(cell[7].strip())
    MetaData['AvgRating'] = float(cell[8].strip())
    MetaData['DegreeCentrality'] = int(cell[9].strip())
    MetaData['ClusteringCoeff'] = float(cell[10].strip())
    Books[ASIN] = MetaData 
Books_File.close()

In [28]:
Books_File = open("amazon-books-copurchase.edgelist", "rb") 
Copurchase_Graph = networkx.read_weighted_edgelist(Books_File) 
Books_File.close() 

## Giving Book id as input for recommending books

In [29]:
print("Looking for Recommendations for Customer Purchasing this Book: ")
print("---------------------------------------------------------------")
Purchased_ASIN = '0805047905'
print("ASIN = ", Purchased_ASIN)
print("Title = ", Books[Purchased_ASIN]['Title'])
print("SalesRank = ", Books[Purchased_ASIN]['SalesRank'])
print("TotalReviews = ", Books[Purchased_ASIN]['TotalReviews'])
print("AvgRating = ", Books[Purchased_ASIN]['AvgRating'])
print("DegreeCentrality = ", Books[Purchased_ASIN]['DegreeCentrality'])
print("ClusteringCoeff = ", Books[Purchased_ASIN]['ClusteringCoeff'])

Looking for Recommendations for Customer Purchasing this Book: 
---------------------------------------------------------------
ASIN =  0805047905
Title =  Brown Bear, Brown Bear, What Do You See?
SalesRank =  171
TotalReviews =  172
AvgRating =  5.0
DegreeCentrality =  213
ClusteringCoeff =  0.66


In [30]:
n = Purchased_ASIN
ego = networkx.ego_graph(Copurchase_Graph, n, radius = 1)
Purchased_ASIN_Ego_Graph = networkx.Graph(ego)

In [44]:
threshold = 0.5 # finding the nodes having similarity measure based on category above the threshold value
Purchased_ASIN_Ego_Trim_Graph = networkx.Graph()
for f, t, e in Purchased_ASIN_Ego_Graph.edges(data = True):
    if e['weight'] >= threshold: 
        Purchased_ASIN_Ego_Trim_Graph.add_edge(f, t)

In [45]:
Purchased_ASIN_Neighbors = Purchased_ASIN_Ego_Trim_Graph.neighbors(Purchased_ASIN)

In [46]:
ASIN_Meta = []
for asin in Purchased_ASIN_Neighbors:
    ASIN = asin
    Title = Amazon_Books[ASIN]['Title']
    SalesRank = Amazon_Books[ASIN]['SalesRank']
    TotalReviews = Amazon_Books[ASIN]['TotalReviews']
    AvgRating = Amazon_Books[ASIN]['AvgRating']
    DegreeCentrality = Amazon_Books[ASIN]['DegreeCentrality']
    ASIN_Meta.append((ASIN, Title, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff))

## Showing Top 5 Recommendations

In [47]:
Top5_ByAbgRating_ThenByTotalReviews = sorted(ASIN_Meta, key = lambda x: (x[4], x[3]), reverse = True)[:5]

In [51]:
print()
print("Top 5 Recommendations By AvgRating Then By TotalReviews for Users Purchased The Book: ")
print("--------------------------------------------------------------------------------------")
print('ASIN\t', 'Title\t', 'SalesRank\t', 'TotalReviews\t', 'AvgRating\t', 'DegreeCentrality\t', 'ClusteringCoeff')
for asin in Top5_ByAbgRating_ThenByTotalReviews:
    print(asin)

print()


Top 5 Recommendations By AvgRating Then By TotalReviews for Users Purchased The Book: 
--------------------------------------------------------------------------------------
ASIN	 Title	 SalesRank	 TotalReviews	 AvgRating	 DegreeCentrality	 ClusteringCoeff
('0152010661', 'Time for Bed', 3122, 87, 5.0, 60, 0.0)
('0694006246', 'Big Red Barn Board Book', 4457, 40, 5.0, 27, 0.0)
('1581170769', 'What Makes a Rainbow?: Pop-Up', 40821, 29, 5.0, 7, 0.0)
('0064435962', 'From Head to Toe', 187777, 22, 5.0, 5, 0.0)
('0694013013', 'From Head to Toe Board Book', 6026, 22, 5.0, 47, 0.0)

