-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractProducts.py
executable file
·140 lines (111 loc) · 6.21 KB
/
extractProducts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import sys
import re
import os
import pandas as pd
from pandas import ExcelWriter
import urllib.request
from bs4 import BeautifulSoup
def ExtractProducts(base_url, category, out_dir='./out', log_dir=None):
    """Scrape every sub-category and product listed on one category page.

    Parameters
    ----------
    base_url : str
        Site root; relative ``href`` values found on the page are joined to it.
    category : object
        Record exposing ``.Name``, ``.URL`` and ``.Industry`` attributes
        (e.g. a namedtuple row — original comment: Tuple = [Index, URL,
        Industry, Name]).
    out_dir : str
        Root of the Excel output tree: ``out_dir/<Industry>/<Name>/...``.
    log_dir : str or None
        When given, stdout is redirected to ``<log_dir>/extractProducts.log``
        (appended) for the duration of the call.

    Returns
    -------
    pandas.DataFrame
        All products found; the sub-category rows are concatenated in as
        well before returning (preserving the original behaviour).
    """
    old_stdout = sys.stdout
    log_file = None
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
        # BUG FIX: the original did `log_dir + "extractProducts.log"`,
        # silently gluing the filename onto the directory name when
        # log_dir lacked a trailing slash. os.path.join is safe either way.
        log_file = open(os.path.join(log_dir, "extractProducts.log"), "a")
        sys.stdout = log_file
    try:
        return _scrape_category(base_url, category, out_dir)
    finally:
        # BUG FIX: always restore stdout and close the log file, even when
        # scraping raises — the original left sys.stdout pointing at the
        # (leaked) log file on any unexpected exception.
        sys.stdout = old_stdout
        if log_file is not None:
            log_file.close()


def _append_row(df, row):
    """Append one dict row to *df* (pandas>=2 replacement for the removed
    ``DataFrame.append``)."""
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True, sort=False)


def _write_excel(df, path):
    """Write *df* to *path* as a single-sheet xlsx without the index column.

    Uses ExcelWriter as a context manager: ``ExcelWriter.save`` was removed
    in pandas 2.0; ``__exit__`` saves and closes the file.
    """
    # TO DO : adapt script to write multiple sheets per file, one industry per file
    with ExcelWriter(path) as writer:
        df.to_excel(writer, index=False)


def _scrape_category(base_url, category, out_dir):
    """Do the actual page fetch, parse and Excel writing; see
    :func:`ExtractProducts` for the public contract."""
    category_name = category.Name
    category_url = category.URL
    category_s_industry = category.Industry  # Tuple = [Index, URL, Industry, Name]
    cat_out_dir = os.path.join(out_dir, category_s_industry, category_name)
    try:
        category_page = urllib.request.urlopen(category_url)
        category_soup = BeautifulSoup(category_page, 'html.parser')
        category_box = category_soup.find_all(
            'section', attrs={'class': 'ctgry'})
    except Exception as e:
        # Best-effort scrape: a dead category page is logged and skipped,
        # but the (empty) summary files are still written below.
        print(
            f"++++++++++++++++++++++ WARNING : {e} : Skipping cat {category_url}")
        category_box = []
    subCategories = pd.DataFrame(columns=['Name', 'URL', 'Category', 'Industry'])
    products = pd.DataFrame(columns=['Name', 'URL', 'subCategory', 'Category', 'Industry'])
    for cat in category_box:
        try:
            subCategory_box = cat.find_all('li', attrs={'class': 'box'})
        except Exception as e:
            print(f"Error fetching subCategory_box from {category_name} ... Skipping")
            subCategory_box = []
        for subCategory in subCategory_box:
            products_per_subCat = pd.DataFrame(
                columns=['Name', 'URL', 'Available SKUs', 'subCategory', 'Category', 'Industry'])
            try:
                link = subCategory.find('a', attrs={'class': 'GNTitle title'})
                subCategory_name = link.getText().strip()
                subCategories = _append_row(subCategories, {
                    'Name': subCategory_name,
                    'URL': base_url + link['href'],
                    'Category': category_name,
                    'Industry': category_s_industry})
            except Exception as e:
                # Unnamed sub-category: bucket its products under a fixed label.
                print(f"Error fetching subCategory data\n{e}\nProceeding to products in subCategory")
                subCategory_name = 'Other products'
            subCat_out_dir = os.path.join(cat_out_dir, subCategory_name)
            products_box = []
            try:
                products_box.extend(
                    subCategory.find_all('div', attrs={'class': 'lik'}, recursive=True))
            except Exception as e:
                print(f"\n\nError fetching products from {subCategory_name}\n{e}\n SKIPPINg ...")
            for prod_box in products_box:
                try:
                    prod_link = prod_box.find('a', href=True)
                    row = {'Name': prod_link.getText().strip(),
                           'URL': base_url + prod_link['href'],
                           'subCategory': subCategory_name,
                           'Category': category_name,
                           'Industry': category_s_industry}
                    # Same row goes into the per-sub-category sheet and the
                    # category-wide accumulator.
                    products_per_subCat = _append_row(products_per_subCat, row)
                    products = _append_row(products, row)
                except Exception as e:
                    print(f"Error fetching product from {subCategory_name}\n{e} : SKIPPING ..\n")
            os.makedirs(subCat_out_dir, exist_ok=True)
            _write_excel(products_per_subCat, os.path.join(subCat_out_dir, 'products.xlsx'))
            print(f"Saved products of {subCategory_name} data at {subCat_out_dir}/products.xlsx\n")
    os.makedirs(cat_out_dir, exist_ok=True)
    subCategories.sort_values('Name', inplace=True)
    subCategories.drop_duplicates('URL', inplace=True)
    _write_excel(subCategories, os.path.join(cat_out_dir, 'subCategories.xlsx'))
    print(f"Found {len(subCategories.index)} subCategories TOTAL in category : {category_name}\nSaved data at {cat_out_dir}/subCategories.xlsx")
    # NOTE(review): the original appends the sub-category rows into the
    # returned products frame before deduplicating — preserved as-is.
    products = pd.concat([products, subCategories], ignore_index=True, sort=False)
    products.sort_values('Name', inplace=True)
    products.drop_duplicates('URL', inplace=True)
    _write_excel(products, os.path.join(cat_out_dir, 'all_products.xlsx'))
    print(f"Found {len(products.index)} products TOTAL in category : {category_name}\nSaved data at {cat_out_dir}/all_products.xlsx\n")
    return products
# categories = pd.DataFrame(columns=['Name', 'URL','Industry'])
# categories = categories.append({'Name': 'Test CAT',
# 'URL':'/indianexporters/glue.html',
# 'Industry': 'Test Industry'},
# ignore_index=True)
# sellers = pd.DataFrame(columns=['Name', 'URL', 'Phone', 'Address','Category','Industry'])
# sellers = ExtractProducts(sellers,categories,base_url)
# print(sellers.iloc[:10,:])