# Imposter Express

The goal of this project is to find imposter reviews on AliExpress.

In [47]:
#Import some relevant packages

#For pulling things from AliExpress
import pickle
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
from datetime import datetime
from bs4 import BeautifulSoup
import csv
import requests

#For working with SQL in Python
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

#For generally exploring things within Python
import pandas as pd
import matplotlib as mpl

import os

## Scrape Data
### Pull Cookies
This bit of code has you log into AliExpress in Firefox and pulls the relevant cookies.

In [35]:
# Selenium-driven Firefox window; get_cookies() and set_cookies() below read this global.
browser = webdriver.Firefox()


def get_cookies():
    """Open the AliExpress login page, wait for a manual login, and save the cookies.

    Uses the module-level ``browser``. Blocks on ``input()`` until the user
    confirms that the login form has been submitted, then pickles the
    browser's cookies to ``cookies.pickle`` in the current working directory.
    """
    browser.get("https://login.aliexpress.com/buyer.htm?return=https%3A%2F%2Fwww.aliexpress.com%2F&random=CEA73DF4D81D4775227F78080B9B6126")
    print('input your username and password in Firefox and hit Submit')
    input('Hit Enter here if you have summited the form: <Enter>')
    cookies = browser.get_cookies()
    # Use a context manager so the file is flushed and closed before any
    # later cell tries to read it back.
    with open("cookies.pickle", "wb") as cookie_file:
        pickle.dump(cookies, cookie_file)


def set_cookies():
    """Restore the saved login cookies into ``browser`` and open the best-selling page.

    Reads ``cookies.pickle`` from the current working directory (the file
    written by ``get_cookies``) and adds every cookie to the module-level
    ``browser`` after loading the AliExpress home page.
    """
    browser.get("https://aliexpress.com")
    # cookies.pickle is produced locally by get_cookies(); never unpickle a
    # file that came from an untrusted source.
    with open("cookies.pickle", "rb") as cookie_file:
        cookies = pickle.load(cookie_file)
    for cookie in cookies:
        browser.add_cookie(cookie)
    browser.get("https://bestselling.aliexpress.com/en")


# In a notebook __name__ is '__main__', so the interactive login below runs
# every time this cell is executed.
if __name__ == '__main__':
    get_cookies()

input your username and password in Firefox and hit Submit
Hit Enter here if you have summited the form: <Enter>


### Scrape some data from AliExpress

With the login now in hand, the goal is to pull a bunch of data from AliExpress.

In [12]:
# Start a Firefox session and restore the login cookies saved in cookies.pickle.
driver = webdriver.Firefox()
driver.get("https://aliexpress.com")
# Close the pickle file deterministically instead of leaking the handle.
# cookies.pickle is written locally by get_cookies(); do not unpickle
# untrusted files.
with open("cookies.pickle", "rb") as cookie_file:
    cookies = pickle.load(cookie_file)
for cookie in cookies:
    driver.add_cookie(cookie)


def extract_product_urls_from_list_page(list_page_url):
    """Collect product links from a listing page by clicking through its category titles.

    Args:
        list_page_url: URL of the listing page, opened in the module-level
            ``driver``.

    Returns:
        set of str: the distinct ``href`` values found after clicking each
        ``span.title`` element that could be clicked.
    """
    driver.get(list_page_url)
    time.sleep(5)  # fixed pause before querying the page
    # find_elements(By..., ...) works on both old and current Selenium; the
    # find_elements_by_* helpers were removed in Selenium 4.3.
    cats = driver.find_elements(By.CSS_SELECTOR, 'span.title')

    all_links = set()
    for ind, cat in enumerate(cats):
        print(cat.text)
        try:
            cat.click()
        except Exception:
            # Best effort: skip entries that cannot be clicked.
            continue
        # The first entry is queried with a different selector from the rest.
        if ind == 0:
            items = driver.find_elements(By.CLASS_NAME, 'item-desc')
        else:
            items = driver.find_elements(By.CSS_SELECTOR, 'div.title > a')
        for item in items:
            link = item.get_attribute('href')
            # Elements without an href yield None; keep only real links.
            if link:
                all_links.add(link)
        time.sleep(2)
    return all_links


# Smoke run of the extractor on the all-products page; it executes whenever
# the cell runs (a notebook's __name__ is '__main__') and the returned set is
# discarded.
if __name__ == '__main__':
    extract_product_urls_from_list_page('https://www.aliexpress.com/all-wholesale-products.html?spm=2114.11010108.22.1.650c649bLCkkI1')

In [36]:
# Start a Firefox session and restore the login cookies saved in cookies.pickle.
# NOTE(review): this opens another Firefox window each time the cell runs;
# any earlier `driver` is rebound without being quit.
driver = webdriver.Firefox()
driver.get("https://aliexpress.com")
# Close the pickle file deterministically instead of leaking the handle.
with open("cookies.pickle", "rb") as cookie_file:
    cookies = pickle.load(cookie_file)
for cookie in cookies:
    driver.add_cookie(cookie)


def extract_product_urls_from_other_page(list_page_url):
    """Collect product links from a listing page without clicking anything.

    Args:
        list_page_url: URL of the listing page, opened in the module-level
            ``driver``.

    Returns:
        set of str: the distinct ``href`` values of the elements whose class
        name is ``product``.
    """
    driver.get(list_page_url)
    time.sleep(5)  # fixed pause before querying the page
    # find_elements(By..., ...) works on both old and current Selenium; the
    # find_elements_by_* helpers were removed in Selenium 4.3.
    items = driver.find_elements(By.CLASS_NAME, 'product')
    all_links = set()
    for item in items:
        link = item.get_attribute('href')
        # Elements without an href yield None; keep only real links.
        if link:
            all_links.add(link)
    time.sleep(2)
    return all_links


# Smoke run of the extractor defined in this cell on a category page. The
# call previously went to extract_product_urls_from_list_page, which left
# extract_product_urls_from_other_page unexercised.
if __name__ == '__main__':
    extract_product_urls_from_other_page('https://www.aliexpress.com/category/100003109/women-clothing-accessories.html?minPrice=&maxPrice=&isBigSale=n&isFreeShip=n&isNew=n&isFavorite=n&shipFromCountry=us&shipCompanies=&SearchText=&CatId=100003109&g=y&SortType=total_tranpro_desc&needQuery=n')

In [37]:

# Start a Firefox session and restore the login cookies saved in cookies.pickle.
# NOTE(review): this opens another Firefox window each time the cell runs;
# any earlier `driver` is rebound without being quit.
driver = webdriver.Firefox()
driver.get("https://aliexpress.com")
# Close the pickle file deterministically instead of leaking the handle.
with open("cookies.pickle", "rb") as cookie_file:
    cookies = pickle.load(cookie_file)
for cookie in cookies:
    driver.add_cookie(cookie)


def extract_product_info(product_url):
    """Scrape one product page into a flat dict.

    Loads ``product_url`` in the module-level ``driver`` and parses the
    page source with BeautifulSoup.

    Args:
        product_url: URL of the product page.

    Returns:
        dict with the keys ``product_id`` (``1`` when the hidden id input is
        missing), ``title``, ``description`` (JSON string of the property
        list), ``price``, ``discount_price`` (``None`` when absent),
        ``stars``, ``votes``, ``orders``, ``wishlists`` (always ``0``),
        ``is_free_shipping``, ``is_epacket``, ``primary_image_url``,
        ``store_id``, ``store_name``, ``store_start_date`` (``datetime``),
        ``store_feedback_score`` and ``store_positive_feedback_rate``
        (both ``-1`` when unavailable), ``category`` (breadcrumb joined with
        ``'||'``) and ``product_url``.

    Raises:
        AttributeError, TypeError, ValueError: when a mandatory element
            (title, price, rating, order count, image, store details) is
            missing from the page or its text cannot be parsed.
    """
    driver.get(product_url)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Test the element itself before reading its attribute; subscripting the
    # result of find() directly raised when the input was missing, so the
    # sentinel fallback below could never be reached.
    id_input = soup.find('input', {'id': 'hid-product-id'})
    product_id = id_input.get('value') if id_input is not None else None
    if product_id is None:
        product_id = 1  # sentinel the scraping loop checks before fetching reviews

    title = soup.find('h1', {'class': 'product-name'}).text
    # Keep only the text before the first '-' (presumably the low end of a
    # price range — TODO confirm).
    price = float(soup.find('span', {'id': 'j-sku-price'}).text.split('-')[0])

    discount_tag = soup.find('span', {'id': 'j-sku-discount-price'})
    if discount_tag is not None:
        discount_price = float(discount_tag.text.split('-')[0])
    else:
        discount_price = None

    attrs_dict = {}
    for item in soup.find_all('li', {'class': 'property-item'}):
        # 'propery-*' is spelled as in the original selectors (presumably
        # matching the site's markup — verify). [:-1] drops the last character
        # of the title text.
        name = item.find('span', {'class': 'propery-title'}).text[:-1]
        val = item.find('span', {'class': 'propery-des'}).text
        attrs_dict[name] = val
    description = json.dumps(attrs_dict)

    stars = float(soup.find('span', {'class': 'percent-num'}).text)
    votes = int(soup.find('span', {'itemprop': 'reviewCount'}).text)
    orders = int(soup.find('span', {'id': 'j-order-num'}).text.split()[0].replace(',', ''))
    wishlists = 0  # int(soup.find('span', {'id': 'j-wishlist-num'}).text.strip()[1:-1].split()[0])

    try:
        shipping_cost = soup.find('span', {'class': 'logistics-cost'}).text
        shipping_company = soup.find('span', {'id': 'j-shipping-company'}).text
    except Exception:
        # Best effort: if either element is missing, treat both as unknown.
        shipping_cost = ''
        shipping_company = ''
    is_free_shipping = shipping_cost == 'Free Shipping'
    is_epacket = shipping_company == 'ePacket'

    primary_image_url = soup.find('div', {'id': 'magnifier'}).find('img')['src']

    # Store id is whatever follows the last '.' in the store-number text.
    store_id = soup.find('span', {'class': 'store-number'}).text.split('.')[-1]
    store_name = soup.find('span', {'class': 'shop-name'}).find('a').text
    store_start_date = soup.find('span', {'class': 'store-time'}).find('em').text
    store_start_date = datetime.strptime(store_start_date, '%b %d, %Y')

    rank_tag = soup.find('span', {'class': 'rank-num'})
    if rank_tag is not None:
        store_feedback_score = int(rank_tag.text)
        store_positive_feedback_rate = float(soup.find('span', {'class': 'positive-percent'}).text[:-1]) * 0.01
    else:
        # Retry once after a refresh. The page must be re-parsed: querying the
        # old soup again can never find an element that was absent from it.
        driver.refresh()
        refreshed_soup = BeautifulSoup(driver.page_source, "html.parser")
        try:
            store_feedback_score = int(refreshed_soup.find('span', {'class': 'rank-num'}).text)
            store_positive_feedback_rate = float(refreshed_soup.find('span', {'class': 'positive-percent'}).text[:-1]) * 0.01
        except Exception:
            store_feedback_score = -1
            store_positive_feedback_rate = -1

    try:
        cats = [item.text for item in soup.find('div', {'class': 'ui-breadcrumb'}).find_all('a')]
        category = '||'.join(cats)
    except Exception:
        # Best effort: pages without a breadcrumb get an empty category.
        category = ''

    row = {
        'product_id': product_id,
        'title': title,
        'description': description,
        'price': price,
        'discount_price': discount_price,
        'stars': stars,
        'votes': votes,
        'orders': orders,
        'wishlists': wishlists,
        'is_free_shipping': is_free_shipping,
        'is_epacket': is_epacket,
        'primary_image_url': primary_image_url,
        'store_id': store_id,
        'store_name': store_name,
        'store_start_date': store_start_date,
        'store_feedback_score': store_feedback_score,
        'store_positive_feedback_rate': store_positive_feedback_rate,
        'category': category,
        'product_url': product_url
    }
    return row


#if __name__ == '__main__':
 #   extract_product_info('https://www.aliexpress.com/item/Hair-Accessories-Synthetic-Wig-Donuts-Bud-Head-Band-Ball-French-Twist-Magic-DIY-Tool-Bun-Maker/32457370321.html?scm=1007.13442.37932.0&pvid=f8b9f498-65d4-400f-a14f-38b4bba77546&tpp=1')

In [38]:
def extract_product_reviews(product_id, output_file_name, max_page=100):
    """Download the reviews of one product and append them to a CSV file.

    Pages through the mobile review endpoint (20 reviews per request),
    flattens every review to a fixed set of columns and appends the rows to
    ``output_file_name``.

    Args:
        product_id: product id; substituted into the request URL and stored
            in every row.
        output_file_name: path of the CSV file to append to. The header row
            is written only when the file does not exist yet or is empty, so
            repeated calls build one well-formed CSV.
        max_page: upper bound on the number of pages requested.

    Returns:
        list of dict: the flattened reviews. Empty when the first request
        does not return HTTP 200 or the product has no reviews; nothing is
        written to disk in that case.

    Raises:
        KeyError: if the first page lacks ``totalPage``/``evaViewList`` or a
            review lacks one of the mandatory fields (``anonymous``,
            ``buyerCountry``, ``buyerEval``, ``buyerFeedback``, ``evalDate``).
    """
    url_template = 'https://m.aliexpress.com/ajaxapi/EvaluationSearchAjax.do?type=all&index={}&pageSize=20&productId={}&country=US'
    print(product_id)
    reviews = []

    # The context manager closes the session (and its connections) when done.
    with requests.Session() as s:
        resp = s.get(url_template.format(1, product_id))
        if resp.status_code == 200:
            data = resp.json()
            total_page = min(data['totalPage'], max_page)
            reviews += data['evaViewList']

            for next_page in range(2, total_page + 1):
                print('{}\t{}/{}'.format(product_id, next_page, total_page))
                resp = s.get(url_template.format(next_page, product_id))
                try:
                    page_reviews = resp.json()['evaViewList']
                except Exception:
                    # Best effort: skip follow-up pages that cannot be decoded.
                    continue
                reviews += page_reviews

    filtered_reviews = []
    for review in reviews:
        images = review.get('images') or []
        thumbnails = review.get('thumbnails') or []
        filtered_reviews.append({
            'product_id': product_id,
            'anonymous': review['anonymous'],
            'buyerCountry': review['buyerCountry'],
            'buyerEval': review['buyerEval'],
            'buyerFeedback': review['buyerFeedback'],
            'buyerGender': review.get('buyerGender', ''),
            'buyerHeadPortrait': review.get('buyerHeadPortrait', ''),
            'buyerId': review.get('buyerId', ''),
            'buyerName': review.get('buyerName', ''),
            'evalDate': review['evalDate'],
            'image': images[0] if images else '',
            'logistics': review.get('logistics', ''),
            'skuInfo': review.get('skuInfo', ''),
            'thumbnail': thumbnails[0] if thumbnails else '',
        })

    # Write whenever there is at least one review (a product with exactly one
    # review used to be dropped by a `> 1` check).
    if filtered_reviews:
        # Decide on the header from the file's state rather than from one
        # hard-coded product id.
        needs_header = (not os.path.exists(output_file_name)
                        or os.path.getsize(output_file_name) == 0)
        # newline='' is what the csv module expects; utf-8 keeps non-ASCII
        # review text writable regardless of the platform default encoding.
        with open(output_file_name, 'a', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, list(filtered_reviews[0].keys()))
            if needs_header:
                dict_writer.writeheader()
            dict_writer.writerows(filtered_reviews)
    return filtered_reviews


#if __name__ == '__main__':
 #   extract_product_reviews('32457370321','some_reviews.csv')

In [63]:
#The goal of this cell is to call all of the scraper functions to actually scrape something

#Next-up: Categories/Home/Household-Cleaning

# Listing page to scrape and the CSV files this run writes to.
set_of_product_links = extract_product_urls_from_other_page('https://www.aliexpress.com/category/100006206/pet-products.html?spm=2114.search0103.0.0.5ea46101mwMcye&site=glo&SortType=total_tranpro_desc&g=y&tag=')
name_of_review_file = 'pet_reviews_001.csv'
name_of_product_info_file = 'pet_products_001.csv'

# Fresh browser session with the saved login cookies for the product pages.
driver = webdriver.Firefox()
driver.get("https://aliexpress.com")
# Close the pickle file deterministically instead of leaking the handle.
with open("cookies.pickle", "rb") as cookie_file:
    cookies = pickle.load(cookie_file)
for cookie in cookies:
    driver.add_cookie(cookie)

product_list = []
for current_link in set_of_product_links:
    try:
        product_dict = extract_product_info(current_link)
    except Exception as exc:
        # Best effort: skip pages that fail to load or parse, but report them.
        # Catching Exception (not a bare except) lets Ctrl-C stop the run.
        print('skipping {}: {!r}'.format(current_link, exc))
        continue
    if product_dict is not None:
        product_list.append(product_dict)
        current_id = product_dict['product_id']
        time.sleep(2)
        # 1 is the sentinel extract_product_info uses for "no product id".
        if current_id != 1:
            extract_product_reviews(current_id, name_of_review_file)
    time.sleep(2)

# Ring the bell of the terminal that hosts the kernel to signal completion.
os.system("printf '\a'") # or '\7'

32835243137
32835243137	2/3
32835243137	3/3
32804033384
32726788021
32839442854
32839442854	2/8
32839442854	3/8
32839442854	4/8
32839442854	5/8
32839442854	6/8
32839442854	7/8
32839442854	8/8
32830482857
32830482857	2/5
32830482857	3/5
32830482857	4/5
32830482857	5/5
32787592039
32787592039	2/5
32787592039	3/5
32787592039	4/5
32787592039	5/5
32779909272
32779909272	2/4
32779909272	3/4
32779909272	4/4
32837141505
32837141505	2/7
32837141505	3/7
32837141505	4/7
32837141505	5/7
32837141505	6/7
32837141505	7/7
32654842387
32654842387	2/2
32800168233
32800168233	2/4
32800168233	3/4
32800168233	4/4
32770427361
32770427361	2/3
32770427361	3/3
32844553211
32844553211	2/4
32844553211	3/4
32844553211	4/4
32802445220
32802445220	2/4
32802445220	3/4
32802445220	4/4
32580161345
32580161345	2/5
32580161345	3/5
32580161345	4/5
32580161345	5/5
32848063583
32848063583	2/6
32848063583	3/6
32848063583	4/6
32848063583	5/6
32848063583	6/6
32860406262
32860406262	2/7
32860406262	3/7
32860406262	4/7
32860406

0

In [31]:
# Open a new Firefox window on the AliExpress home page and rebind `driver` to it.
# NOTE(review): no cookies are loaded here, unlike the other setup cells — confirm this is intended.
driver = webdriver.Firefox()
driver.get("https://aliexpress.com")

In [7]:
# Display the scraped link set.
# NOTE(review): this echoes the whole set and the saved output is very long — consider showing only a sample.
set_of_product_links

{'https://www.aliexpress.com/store/product/1-Pcs-Delicate-Rose-Flower-Pendant-Necklace-Charm-Gold-Silver-Beauty-Rose-Jewelry-Necklace-For-Women/2310040_32818439807.html?ws_ab_test=searchweb0_0,searchweb201602_4_10065_10068_10130_10547_10546_10059_10548_315_10545_10696_100031_5017615_531_10084_10083_10103_451_10618_452_10307_5017715,searchweb201603_45,ppcSwitch_5&algo_expid=0a4954fd-946d-4bad-8af9-7e1bf0d6f16d-36&algo_pvid=0a4954fd-946d-4bad-8af9-7e1bf0d6f16d&priceBeautifyAB=0',
 'https://www.aliexpress.com/store/product/1Pc-Newest-JESUS-CROSS-Fashion-Pendant-Necklace-Jewelry-Stainless-Steel-Chain-Christian-Symbol-Nice-Gift-High/1826166_32820963692.html?ws_ab_test=searchweb0_0,searchweb201602_4_10065_10068_10130_10547_10546_10059_10548_315_10545_10696_100031_5017615_531_10084_10083_10103_451_10618_452_10307_5017715,searchweb201603_45,ppcSwitch_5&algo_expid=0a4954fd-946d-4bad-8af9-7e1bf0d6f16d-39&algo_pvid=0a4954fd-946d-4bad-8af9-7e1bf0d6f16d&priceBeautifyAB=0',
 'https://www.aliexpress.

In [18]:
# NOTE(review): the saved output shows this cell raising NameError; presumably
# it was run on a kernel where the scraping loop had not yet defined
# current_link — confirm whether this debugging cell is still needed.
current_link

NameError: name 'current_link' is not defined

In [64]:
# Freeze the scraped link set into a list and persist it with pickle.
list_of_product_links = [*set_of_product_links]
with open('pet_products_001.pkl', 'wb') as f:
    f.write(pickle.dumps(list_of_product_links))

In [79]:
# Number of distinct product links saved by the previous cell.
len(list_of_product_links)

48

In [65]:
# Write one CSV row per scraped product, with columns taken from the first row.
# The guard avoids truncating the file and then failing with IndexError when
# nothing was scraped.
if product_list:
    keys = list(product_list[0].keys())
    # name_of_product_info_file is set in the scraping cell ('pet_products_001.csv').
    # newline='' is what the csv module expects; utf-8 keeps non-ASCII titles writable.
    with open(name_of_product_info_file, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(product_list)
else:
    print('product_list is empty; nothing written')

In [None]:
# NOTE(review): forced_set_of_links is not defined in any cell visible here and
# this cell has no execution count — confirm whether it is still needed.
forced_set_of_links