# <center>An example of SNS</center>

- Threadless.com is a crowdsouring website for graphic designs.
- Desginers submit artworks and recieve ratings from the community within a seven-day period. 
- Designs with the best scores will be selected to print on T-shirts and other products for sale. 

### Webscraping objectives

- Get a sample of users and artifacts. Consider a sampling strategy. 
- Scrape artifact-level features.
- Scrape user-level features. 

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time


In [2]:
# Get five urls of pages as a sample of latest artifacts.

link="https://www.threadless.com/designs/archive?page="
num=list(range(1,6))
pages=[]
for i in num:
    page=link+str(i)
    pages.append(page)
print(pages)


['https://www.threadless.com/designs/archive?page=1', 'https://www.threadless.com/designs/archive?page=2', 'https://www.threadless.com/designs/archive?page=3', 'https://www.threadless.com/designs/archive?page=4', 'https://www.threadless.com/designs/archive?page=5']


In [3]:
# Get urls of all the designs in these pages
# To reduce the load to their server, will demonnstrate one page

designs=[]
for i in pages:
    print('working on page'+str(' ')+str(i))
    response=requests.get(i)
    soup=BeautifulSoup(response.content, "html.parser")
    links=soup.find('ol',class_='feed-archive th-grided')
    li=links.find_all('li',class_="old")
    for j in li:
        name=j.find("a")["href"]
        designs.append(name)
   

working on page https://www.threadless.com/designs/archive?page=1
working on page https://www.threadless.com/designs/archive?page=2
working on page https://www.threadless.com/designs/archive?page=3
working on page https://www.threadless.com/designs/archive?page=4
working on page https://www.threadless.com/designs/archive?page=5


In [4]:
designs[:5]

# can write out the sample of artifacts 
# with open('designs.csv', 'w') as csvfile:
#    writer=csv.writer(csvfile, delimiter=',')
#    writer.writerows(zip(designs))


# read in your sample
# raw_data_file = open("designs.csv", 'r')
# csv_data_file = csv.reader(raw_data_file, delimiter=',')
# designs = []
# for line in csv_data_file:
#     print(line[0])
#     designs.append(line[0])

['/designs/the-robot-ii',
 '/designs/betta-fish-blue',
 '/designs/lips-and-lashes-red',
 '/designs/lips-and-lashes-pink',
 '/designs/appendix-surgery-design-cute-appendectomy-two-thum']

In [18]:
# Get artifact level features
# For each design, get title, author, average score, number of scores, challenge name

rows=[]

for i in designs[:40]:
    try:
        url="https://www.threadless.com"+i
        response=requests.get(url)
        soup=BeautifulSoup(response.content, "html.parser")
        
        # initiate the variable for each period
        title=None
        author=None
        avg_score=None
        total_score=None
        
        ##title
        title=soup.select('div.submission-title h1')
        if title!=[]:
            title=title[0].text

        ##author
        author=soup.select('div.author-block a.author')
        if author!=[]:
            author=author[0].text

        ##score
        avg_score=soup.select('div.vote-avg span')
        if avg_score!=[]:
            avg_score=avg_score[0].text

        ##total scores
        total_score=soup.select('div.vote-count span')
        if total_score!=[]:
            total_score=total_score[0].text
        
        rows.append((title, author, avg_score, total_score))
        print((title, author, avg_score, total_score))
    
    except AttributeError:
        pass



('The Robot II', 'Kaiser_Paul', '3.43', '7')
('Betta Fish Blue', 'JulieErinDesign', '3.40', '5')
('Lips and Lashes Red', 'JulieErinDesign', '4.00', '1')
('Lips and Lashes Pink', 'JulieErinDesign', '3.00', '2')
('Appendix Surgery Design - Cute Appendectomy Two Thumbs Up Appendicitis Humor', 'iheartguts', '3.00', '2')
('Sherlock Bones', 'rachelpilmoor', '4.18', '17')
('Lucky Liver - Cute Liver Surgery Transplant Funny Cirrhosis Hepatitis Humor', 'iheartguts', '2.50', '2')
('Sherlock Bones', 'rachelpilmoor', '4.42', '19')
('Sherlock Bones', 'rachelpilmoor', '4.50', '10')
("Crappy Valentine's Day", 'Babsadee', '3.86', '14')
('Lepidoptera Trio No. 2', 'CourtneyKMann', '3.50', '2')
('Big Brain Energy', 'iheartguts', '3.50', '2')
('Recycle Your Head (Reversed)', 'writerlayne', '3.25', '4')
('Love Your Brain', 'iheartguts', '2.00', '2')
('Bigfoot', 'portokalis', '3.80', '5')
('Bigfoot', 'portokalis', '5.00', '1')
('Types of Ta-Tas - Cute Breasts All Shapes Colors Sizes Funny Mammogram Cancer A

In [14]:
# Question: How to scrape the challenge information?

# 1. challenge name
# 2. how many designs per challenge


# add your code here

for i in designs[-5:]:
    
    url="https://www.threadless.com"+i
    response=requests.get(url)
    soup=BeautifulSoup(response.content, "html.parser")
        
    challenge=soup.find("article",class_="about-the-challenge")
    title=challenge.select("li.challenge-title")[0].text
    num=challenge.select("i.fa-thumbs-up")[0].next_sibling.next_sibling.text
    print(title, num)



Threadless
 131611 designs

Shoes
 4267 designs

Threadless
 131611 designs

Y2K
 692 designs

Horror
 5452 designs


In [19]:
# get authors
authors=[row[1] for row in rows]
authors=filter(None, authors)
authors_unique=list(set(authors))
print(authors_unique)
len(authors_unique)

['portokalis', 'dandesign70', 'KreativK', 'ccelestec', 'hitechmom', 'iheartguts', 'JulieErinDesign', 'Queerandcreate', 'writerlayne', 'rachelpilmoor', 'VaporwaveAI', 'Mckennaii', 'CourtneyKMann', 'GoldenHeavens', 'shoppeser', 'Psychoslime', 'Babsadee', 'palaemon', 'Eclatbyjasline', 'GWART', 'Kaiser_Paul', 'lacychenault', 'Isaiahfx1420']


23

In [9]:
# For the designers we found, get the summary of their experience
full=[]

for i in authors_unique[:5]:
    url="https://www.threadless.com/@"+i
    time.sleep(5)
    response=requests.get(url)
    soup=BeautifulSoup(response.content, "html.parser")
    
    # find all stats
    stats=soup.select('div.stats ul')
    li=stats[0].find_all('li')
    
    line=[None] * 5
    for j in li:
        char=(j.text).strip()
        
        # threads
        if re.search("started",char):
            line[0]=char
            #line[1]=re.findall(r"[0-9.]+", char)[0]
            
        # submitted
        if re.search("submitted",char):
            line[1]=char   
            #line[1]=re.findall(r"[0-9.]+", char)[0]

        # scored
        if re.search("scored",char):
            line[2]=char
            #line[2]=re.findall(r"[0-9.]+", char)[0]
        
        # given
        if re.search("Given",char):
            line[3]=char
            #line[3]=re.findall(r"[0-9.]+", char)[0]

        # since
        if re.search("since",char):
            line[4]=char
            #line[4]=re.findall(r"[0-9.]+", char)[0]
    
    line.append(i)
    print(line)
    full.append(line)
                     

[None, '50 designs submitted', '876 designs scored', 'Avg Score Given: 2.08', 'Member since 2018', 'rachelpilmoor']
['1 thread started', '54 designs submitted', '1,151 designs scored', 'Avg Score Given: 2.99', 'Member since 2016', 'Babsadee']
[None, '14 designs submitted', None, 'Avg Score Given: 0.00', 'Member since 2016', 'iheartguts']
[None, '13 designs submitted', '11 designs scored', 'Avg Score Given: 5.00', 'Member since 2018', 'JulieErinDesign']
[None, '18 designs submitted', '734 designs scored', 'Avg Score Given: 2.70', 'Member since 2023', 'Kaiser_Paul']


In [10]:
# Question: how to scrape each designers' numbers of followers and following?



# add you code here
for i in authors_unique[:5]:
    url="https://www.threadless.com/@"+i
    time.sleep(5)
    response=requests.get(url)
    soup=BeautifulSoup(response.content, 'lxml')

    
    # get the section
    follow=soup.select("div.following li")
    following=follow[0].select("a span")[0].text
    follower=follow[1].select("a span")[0].text

    print(i, following, follower)



rachelpilmoor 63 82
Babsadee 23 315
iheartguts 0 248
JulieErinDesign 23 17
Kaiser_Paul 1 20


In [11]:
# Scrape the follower-followee network for each designer.
# Can we do this with beautifulsoup? 

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [21]:
relations=[]

for i in authors_unique[1:10]:
    
    i=i.replace(" ","%20")
    
    follower_url="https://www.threadless.com/@"+i+"/followers"
    following_url="https://www.threadless.com/@"+i+"/following"

    # close a pop ad
    opts = Options()
    opts.add_argument("user-agent=gene")
    driver = webdriver.Chrome(options=opts)

    # one's follower   
    driver.get(follower_url)  
    time.sleep(5)
    
    # you can scroll many times if not reaching the end
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  
    time.sleep(10)        
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one's followers send the following tie
            line=[name, i]
            print(line)
            relations.append(line)
    
    # one's follwing
    driver.get(following_url)
    time.sleep(10)   
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")                
    time.sleep(25)  
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one sends the following tie to those to follow
            line=[i, name]
            print(line)
            relations.append(line)
    driver.quit() 

['Queerandcreate', 'lacychenault']
['Queerandcreate', 'rachelpilmoor']
['Queerandcreate', 'rachelpilmoor']
