# <center>An example of SNS</center>

- Threadless.com is a crowdsouring website for graphic designs.
- Desginers submit artworks and recieve ratings from the community within a seven-day period. 
- Designs with the best scores will be selected to print on T-shirts and other products for sale. 

### Webscraping objectives

- Get a sample of users and artifacts. Consider a sampling strategy. 
- Scrape artifact-level features.
- Scrape user-level features. 

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time


In [2]:
# Get five urls of pages as a sample of latest artifacts.

link="https://www.threadless.com/designs/archive?page="
num=list(range(1,6))
pages=[]
for i in num:
    page=link+str(i)
    pages.append(page)
print(pages)


['https://www.threadless.com/designs/archive?page=1', 'https://www.threadless.com/designs/archive?page=2', 'https://www.threadless.com/designs/archive?page=3', 'https://www.threadless.com/designs/archive?page=4', 'https://www.threadless.com/designs/archive?page=5']


In [3]:
# Get urls of all the designs in these pages
# To reduce the load to their server, will demonnstrate one page

designs=[]
for i in pages:
    print('working on page'+str(' ')+str(i))
    response=requests.get(i)
    soup=BeautifulSoup(response.content, "html.parser")
    links=soup.find('ol',class_='feed-archive th-grided')
    li=links.find_all('li',class_="old")
    for j in li:
        name=j.find("a")["href"]
        designs.append(name)
   

working on page https://www.threadless.com/designs/archive?page=1
working on page https://www.threadless.com/designs/archive?page=2
working on page https://www.threadless.com/designs/archive?page=3
working on page https://www.threadless.com/designs/archive?page=4
working on page https://www.threadless.com/designs/archive?page=5


In [4]:
designs[:5]

# can write out the sample of artifacts 
# with open('designs.csv', 'w') as csvfile:
#    writer=csv.writer(csvfile, delimiter=',')
#    writer.writerows(zip(designs))


# read in your sample
# raw_data_file = open("designs.csv", 'r')
# csv_data_file = csv.reader(raw_data_file, delimiter=',')
# designs = []
# for line in csv_data_file:
#     print(line[0])
#     designs.append(line[0])

['/designs/sacred-tree-of-life',
 '/designs/colorful-origami-giraffe',
 '/designs/origami-giraffe',
 '/designs/y2k-is-here',
 '/designs/broken-angel-3']

In [15]:
# Get artifact level features
# For each design, get title, author, average score, number of scores, challenge name

rows=[]

for i in designs[:40]:
    try:
        url="https://www.threadless.com"+i
        response=requests.get(url)
        soup=BeautifulSoup(response.content, "html.parser")
        
        # initiate the variable for each period
        title=None
        author=None
        avg_score=None
        total_score=None
        
        ##title
        title=soup.select('div.submission-title h1')
        if title!=[]:
            title=title[0].text

        ##author
        author=soup.select('div.author-block a.author')
        if author!=[]:
            author=author[0].text

        ##score
        avg_score=soup.select('li.avg-score strong')
        if avg_score!=[]:
            avg_score=avg_score[0].text

        ##total scores
        total_score=soup.select('li.total-scores strong')
        if total_score!=[]:
            total_score=total_score[0].text
        
        rows.append((title, author, avg_score, total_score))
        print((title, author, avg_score, total_score))
    
    except AttributeError:
        pass



('Sacred tree of life', 'Bearpaws', '2.00', '2')
('Colorful Origami Giraffe', 'koalafish', '3.67', '3')
('Origami Giraffe', 'koalafish', '1.00', '1')
('Y2K is here', 'Producershirts', '2.00', '4')
('Broken Angel', 'RIZES', [], '0')
('wolf art', 'nafidie', '2.25', '4')
('Montana Rainbow', 'tanavegas', '1.00', '1')
("You're Perfect", 'losereputation', '3.25', '4')
('Dead inside But coffeimated AF', 'urbrand', '3.33', '3')
('coffee lover', 'urbrand', '3.50', '4')
('ENVIRONMENT LOGO CLEAN CITY', 'nafidie', '2.00', '4')
('coffee valentines', 'urbrand', '2.33', '3')
('ENVIRONMENT LOGO TREE', 'nafidie', '2.00', '3')
('Cloudy Day', 'braxxaz', '3.00', '2')
('Owl With coffee', 'urbrand', '2.67', '3')
('ENVIRONMENT LOGO HAND TREE', 'nafidie', '2.00', '3')
('Owl Valentines Day', 'urbrand', '3.00', '3')
('Owl Lovers', 'urbrand', '3.33', '3')
('Love owl', 'urbrand', '3.33', '3')
('Hello Sunshine', 'braxxaz', '1.00', '1')
("I'm Your Father Nooo Floppy Disc And USB Y2K", 'Smart-creator', '1.00', '3')


In [7]:
# Question: How to scrape the challenge information?

# 1. challenge name
# 2. how many designs per challenge


# add your code here





Threadless
 132137 designs

Y2K
 795 designs

Pride Forever
 2154 designs

Horror
 5505 designs

Threadless
 132137 designs


In [16]:
# get authors
authors=[row[1] for row in rows]
authors=filter(None, authors)
authors_unique=list(set(authors))
print(authors_unique)
len(authors_unique)

['JPArts26', 'Bearpaws', 'Smart-creator', 'losereputation', 'tanavegas', 'mattjh2', 'nafidie', 'GloopZ', 'koalafish', 'VaporwaveAI', 'Producershirts', 'braxxaz', 'Keejus', 'ClothingnCrypto', 'portokalis', 'Rabica', 'makart', 'urbrand', 'bojanvukovic', 'stotaz9', 'RIZES']


21

In [9]:
# For the designers we found, get the summary of their experience
full=[]

for i in authors_unique[:5]:
    url="https://www.threadless.com/@"+i
    time.sleep(5)
    response=requests.get(url)
    soup=BeautifulSoup(response.content, "html.parser")
    
    # find all stats
    stats=soup.select('div.stats ul')
    li=stats[0].find_all('li')
    
    line=[None] * 5
    for j in li:
        char=(j.text).strip()
        
        # threads
        if re.search("started",char):
            line[0]=char
            #line[1]=re.findall(r"[0-9.]+", char)[0]
            
        # submitted
        if re.search("submitted",char):
            line[1]=char   
            #line[1]=re.findall(r"[0-9.]+", char)[0]

        # scored
        if re.search("scored",char):
            line[2]=char
            #line[2]=re.findall(r"[0-9.]+", char)[0]
        
        # given
        if re.search("Given",char):
            line[3]=char
            #line[3]=re.findall(r"[0-9.]+", char)[0]

        # since
        if re.search("since",char):
            line[4]=char
            #line[4]=re.findall(r"[0-9.]+", char)[0]
    
    line.append(i)
    print(line)
    full.append(line)
                     

['5 threads started', '125 designs submitted', '45 designs scored', 'Avg Score Given: 4.69', 'Member since 2015', 'Producershirts']
[None, '11 designs submitted', None, 'Avg Score Given: 0.00', 'Member since 2022', 'urbrand']
[None, '47 designs submitted', '7 designs scored', 'Avg Score Given: 4.43', 'Member since 2019', 'braxxaz']
['2 threads started', '64 designs submitted', '21 designs scored', 'Avg Score Given: 1.52', 'Member since 2022', 'Bearpaws']
[None, '9 designs submitted', '1 design scored', 'Avg Score Given: 5.00', 'Member since 2023', 'nafidie']


In [10]:
# Question: how to scrape each designers' numbers of followers and following?



# add you code here




Producershirts 34 268
urbrand 0 6
braxxaz 0 41
Bearpaws 15 13
nafidie 1 4


In [11]:
# Scrape the follower-followee network for each designer.
# Can we do this with beautifulsoup? 

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [None]:
relations=[]

for i in authors_unique:
    
    i=i.replace(" ","%20")
    
    follower_url="https://www.threadless.com/@"+i+"/followers"
    following_url="https://www.threadless.com/@"+i+"/following"

    # close a pop ad
    opts = Options()
    opts.add_argument("user-agent=gene")
    driver = webdriver.Chrome(options=opts)

    # one's follower   
    driver.get(follower_url)  
    time.sleep(5)
    
    # you can scroll many times if not reaching the end
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  
    time.sleep(10)        
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one's followers send the following tie
            line=[name, i]
            print(line)
            relations.append(line)
    
    # one's follwing
    driver.get(following_url)
    time.sleep(10)   
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")                
    time.sleep(25)  
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one sends the following tie to those to follow
            line=[i, name]
            print(line)
            relations.append(line)
    driver.quit() 

['GloopZ', 'makart']
['Keejus', 'koalafish']
['ClothingnCrypto', 'koalafish']
['Keejus', 'VaporwaveAI']
['Keejus', 'braxxaz']
['portokalis', 'Keejus']
['Keejus', 'GloopZ']
['Keejus', 'koalafish']
['Keejus', 'portokalis']
['Keejus', 'makart']
['Keejus', 'braxxaz']
['Keejus', 'VaporwaveAI']
['ClothingnCrypto', 'koalafish']
['ClothingnCrypto', 'makart']
['Keejus', 'portokalis']
['portokalis', 'Keejus']
