-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractVendors.py
executable file
·136 lines (114 loc) · 5.29 KB
/
extractVendors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import sys
import re
import os
import pandas as pd
from pandas import ExcelWriter
import urllib.request
from bs4 import BeautifulSoup
def ExtractVendors(product,out_dir='./out',log_dir=None):
if(log_dir):
if not os.path.exists(log_dir):
os.makedirs(log_dir)
log_file = open(log_dir + "extractProducts.log", "a")
old_stdout = sys.stdout
sys.stdout = log_file
prod_name = product.Name # Product is a tuple : [Index, Name, URL, subCategory, Category, Industry]
prod_url = product.URL
prod_subCat = product.subCategory
prod_cat = product.Category
prod_indus = product.Industry
# Query the URL for its html
try:
prod_page = urllib.request.urlopen(prod_url)
except Exception as e:
print(f"Error fetching product URL\n{e}\nSkipping ..")
prod_page = None
vendors = pd.DataFrame(columns={'Name','URL','Phone','Address','Category','Industry'})
if(prod_page):
# Parsing the URL page into BeautifulSoup format
prod_soup = BeautifulSoup(prod_page, 'html.parser')
# Extracting vendor box from the webpage
vendor_list = prod_soup.find('ul', attrs={'id': 'm'})
vendors_box = vendor_list.find_all('li', attrs={'id': re.compile('LST')})
for vendor_box in vendors_box:
try:
vendor_info_box = vendor_box.find(
'div', attrs={'class': 'r-cl b-gry'})
vendor_info = vendor_info_box.find('a', href=True)
vendor_name = vendor_info.getText().strip()
vendor_url = vendor_info['href']
# print(vendor_url.text)
# print(vendor_url['href'])
try:
vendor_number = vendor_info_box.find('div', attrs={'id': re.compile('mobenq')}).getText()
vendor_number = vendor_number.strip('cCaAlL')
# print(vendor_number)
# print(vendor_number.text)
except Exception as e:
print(f"!! !! Error fetching vendor phone\n{e}\n")
vendor_number = 'Not Found'
try:
vendor_address = vendor_info_box.find('p', attrs={'class': 'sm clg'}).getText().strip()
# print(vendor_address.text)
except Exception as e:
print(f"!! Error fetching vendor address\n{e}\n")
vendor_address = 'Not Found'
vendors = vendors.append({'Name': vendor_name,
'URL': vendor_url,
'Phone': vendor_number,
'Address': vendor_address,
'Category': prod_cat,
'Industry': prod_indus},
ignore_index=True, sort=False)
except Exception as e:
print(f"\n!!!!!!!!!!!!! Error fetching vendor data\n{e} .. SKIPPING ...\n")
if (len(vendors.index) > 0):
vendors.sort_values('Name', inplace=True)
vendors.drop_duplicates('URL', inplace=True)
print(f'\nFound : {len(vendors.index)} vendors in {prod_name} : creating file ...')
prod_out_dir = out_dir + '/' + prod_indus + '/' + prod_cat + '/' + prod_subCat
vendors_file = prod_out_dir + '/vendorsOf_' + prod_name + '.xlsx'
if not os.path.exists(prod_out_dir):
os.makedirs(prod_out_dir)
writer = ExcelWriter(vendors_file) # TO DO : adapt script to write multiple sheets per file, one industry per file
try:
vendors.to_excel(writer, index=False)
writer.save()
print(f"Saved vendors of {prod_name} at:\n{vendors_file}\n")
writer.close()
except Exception as e:
print(f"\n***** Error saving to {vendors_file}\n{e}\nSkipping ..")
del writer
else:
print(f"\nNo vendors found at {prod_url} : SKIPPING ..\n")
if(log_dir):
sys.stdout = old_stdout
log_file.close()
return vendors
g_vendors = pd.DataFrame(columns={'Name','URL','Phone','Address','Category','Industry'})
file_name = './out/all_products.xlsx'
products = pd.read_excel(file_name)
i = 1
ttl_prods = len(products.index)
for product in products.itertuples():
if i < 17175:
print("Skipping scraped product URL...")
else:
vendors = ExtractVendors(product)
g_vendors = g_vendors.append(vendors, ignore_index=False)
del vendors
print(f"\n\n%%% {i*100/ttl_prods}% of products scanned ..")
i += 1
g_vendors.sort_values('Name',inplace=True)
g_vendors.drop_duplicates('URL',inplace=True)
writer = ExcelWriter('./out/all_vendors.xlsx')
try:
g_vendors.to_excel(writer, index=False, encoding='UTF-8')
writer.save()
writer.close()
print(f"Found {len(g_vendors.index)} distinct vendors !\nRecap file saved as XLSX at {file_name}")
except Exception as e:
print(f"Error saving recap file as XLSX")
writer.close()
g_vendors.to_csv('./out/all_vendors.csv',index=False,sep='|',encoding='UTF-8')
print(f"Recap file saved as CSV at ./out/all_vendors.csv\nDONE")