# Interactive Twitter scrapping

In [1]:
import selenium
from time import sleep
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from dataclasses import dataclass, asdict

In [2]:
options = ChromeOptions()
options.add_argument('headless')
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
options.add_argument('window-size=1920x1080')
driver = Chrome(ChromeDriverManager().install(), chrome_options = options)

[WDM] - Current google-chrome version is 108.0.5359
[WDM] - Get LATEST driver version for 108.0.5359
[WDM] - Driver [/Users/quimpm/.wdm/drivers/chromedriver/mac64/108.0.5359.71/chromedriver] found in cache


 


  driver = Chrome(ChromeDriverManager().install(), chrome_options = options)
  driver = Chrome(ChromeDriverManager().install(), chrome_options = options)


In [3]:
driver.get("https://www.twitter.com/monkeydquim")

In [4]:
driver.current_url

'https://twitter.com/monkeydquim'

### Util Funcs and Data Structures

In [5]:
@dataclass(eq=True)
class User():
    name: str = None
    at: str = None
    desc: str = None
    profile_img: str = None
    banner_img: str = None
    geolocation: str = None
    join_date: str = None
    following: int = None
    followers: int = None

In [6]:
@dataclass(eq=True, unsafe_hash=True)
class Tweet():
    username: str = None
    at: str = None
    text: str = None
    views: int = None
    replies: int = None
    retweets: int = None
    likes: int = None
    imgs: (str) = None
    video: str = None
    link: str = None
    time: str = None

In [7]:
def scrape_wrapper(driver, default, func, args=None):
    retries = 3
    if args:
        list_args = list(args)
        list_args.insert(0, driver)
        args = tuple(list_args)
    else:
        args = (driver,)
    try:
        return func(*args)
    except:
        try:
            sleep(1)
            return func(*args)
        except:
            return default

### Get user profile info

In [8]:
def get_banner_image(driver):
    banner_img = driver.find_element('xpath', './/img[@draggable="true"]')
    return banner_img.get_attribute("src")

def get_username_and_at(driver):
    user_name = driver.find_element('xpath', './/div[@data-testid="UserName"]') 
    return user_name.text.splitlines()

def get_description(driver):
    user_desc = driver.find_element('xpath', './/div[@data-testid="UserDescription"]') 
    return user_desc.text

def get_location(driver):
    user_location = driver.find_element('xpath', './/span[@data-testid="UserLocation"]')
    return user_location.text

def get_join_date(driver):
    user_join_date = driver.find_element('xpath', './/span[@data-testid="UserJoinDate"]')
    return user_join_date.text

def get_following(driver, at):
    user_following = driver.find_element('xpath', './/a[@href="/'+at[1:]+'/following"]')
    return int(user_following.text.split()[0])

def get_followers(driver, at):
    user_followers = driver.find_element('xpath', './/a[@href="/'+at[1:]+'/followers"]')
    return int(user_followers.text.split()[0])
    
def get_user_information(driver):
    user = User()
    user.name, user.at = scrape_wrapper(driver, '', get_username_and_at)
    user.banner_image = scrape_wrapper(driver, '', get_banner_image)
    user.profile_img = "https://www.twitter.com/" + user.at[1:] + "/photo"
    user.desc = scrape_wrapper(driver, '', get_description)
    user.geolocation = scrape_wrapper(driver, '', get_location)
    user.join_date = scrape_wrapper(driver, '', get_join_date)
    user.following = scrape_wrapper(driver, 0, get_following, (user.at,))
    user.followers = scrape_wrapper(driver, 0, get_followers, (user.at,))
    return user
    

In [9]:
user = get_user_information(driver)
user

User(name='Quim*10^-12', at='@monkeydquim', desc='UDL- Computer engineering\n\nLearning code...', profile_img='https://www.twitter.com/monkeydquim/photo', banner_img=None, geolocation='Lleida, Catalunya', join_date='Joined September 2017', following=227, followers=71)

### Get Tweets

In [10]:
#Move to Tweets and replies
driver.find_element('xpath', '//a[@href="/'+user.at[1:]+'/with_replies"]').click()


### Tweet parsing

In [9]:
def get_username_and_date(driver, scraped_tweet):
    user_name = scraped_tweet.find_element('xpath', './/div[@data-testid="User-Names"]')
    values = user_name.text.splitlines()
    return values[0], values[1][1:], values[3]

def get_text(driver, scraped_tweet):
    text = scraped_tweet.find_element('xpath', './/div[@data-testid="tweetText"]')
    return text.text

def get_visualizations(driver, scraped_tweet):
    text = scraped_tweet.find_element('xpath', './/div[@data-testid="app-text-transition-container"]')
    return int(text.text)

def get_reply(driver, scraped_tweet):
    reply = scraped_tweet.find_element('xpath', './/div[@data-testid="reply"]').text
    return int(reply) if reply else 0

def get_retweet(driver, scraped_tweet):
    retweet = scraped_tweet.find_element('xpath', './/div[@data-testid="retweet"]').text
    return int(retweet) if retweet else 0
    
def get_like(driver, scraped_tweet):
    like = scraped_tweet.find_element('xpath', './/div[@data-testid="like"]').text
    return int(like) if like else 0

def get_views(driver, scraped_tweet):
    views = scraped_tweet.find_element('xpath', './/div[@role="group"]/div/a').text
    return int(views) if views else 0

def get_imgs(driver, scraped_tweet):
    imgs = scraped_tweet.find_elements('xpath', './/img[@alt="Image"]')
    return tuple([img.get_attribute("src") for img in imgs])

def get_link(driver, scraped_tweet):
    link = scraped_tweet.find_element('xpath', './/div[@data-testid="card.wrapper"]/div/a')
    return link.get_attribute("href")

def get_video(driver, scraped_tweet):
    video = scraped_tweet.find_element('xpath', './/video')
    return video.get_attribute("src")[5:]

def get_tweet_information(driver, scraped_tweet):
    tweet = Tweet()
    tweet.username, tweet.at, tweet.time = scrape_wrapper(driver, '', get_username_and_date, (scraped_tweet,))
    tweet.text = scrape_wrapper(driver, '', get_text, (scraped_tweet,))
    tweet.replies = scrape_wrapper(driver, 0, get_reply, (scraped_tweet,))
    tweet.retweets = scrape_wrapper(driver, 0, get_retweet, (scraped_tweet,))
    tweet.likes = scrape_wrapper(driver, 0, get_like, (scraped_tweet,))
    tweet.views = scrape_wrapper(driver, 0, get_views, (scraped_tweet,))
    tweet.imgs = scrape_wrapper(driver, (), get_imgs, (scraped_tweet,))
    tweet.link = scrape_wrapper(driver, '', get_link, (scraped_tweet,))
    tweet.video = scrape_wrapper(driver, '', get_video, (scraped_tweet,))
    return tweet

In [None]:
tweets = driver.find_elements('xpath', './/article[@data-testid="tweet"]')
scraped_tweet = tweets[0]
get_tweet_information(driver, scraped_tweet)

# Scrape profile algorithm

In [None]:
def scrape_profile(profile):
    options = ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
    driver = Chrome(ChromeDriverManager().install(), chrome_options = options)
    driver.get("https://www.twitter.com/"+profile)
    sleep(15)
    driver.find_element('xpath', './/a[@href="/'+profile+'/with_replies"]').click()
    user_info = get_user_information(driver)
    print(user_info)
    tweets_info={}
    scrolling = True
    last_position = driver.execute_script("return window.pageYOffset;")
    while scrolling:
        tweets = driver.find_elements('xpath', './/article[@data-testid="tweet"]')
        for tweet in tweets:
            try:
                scraped_tweet = get_tweet_information(driver, tweet)
                print("Tweet Scrapped")
            except:
                print("Failed to scrap one tweet")
            if not tweets_info.get(scraped_tweet.__hash__()):
                tweets_info[scraped_tweet.__hash__()] = scraped_tweet
        while True:
            scroll_attempt = 0
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            curr_position = driver.execute_script("return window.pageYOffset;")
            if last_position == curr_position:
                scroll_attempt += 1
                if scroll_attempt >= 3:
                    scrolling = False
                    break
                else:
                    sleep(2)
            else:
                last_position = curr_position
                break
    return user_info, tweets_info.values()
    
scrape_profile("monkeydquim")                
            
            

[WDM] - Current google-chrome version is 108.0.5359
[WDM] - Get LATEST driver version for 108.0.5359
[WDM] - Driver [/Users/quimpm/.wdm/drivers/chromedriver/mac64/108.0.5359.71/chromedriver] found in cache


 


  driver = Chrome(ChromeDriverManager().install(), chrome_options = options)
  driver = Chrome(ChromeDriverManager().install(), chrome_options = options)


User(name='Quim*10^-12', at='@monkeydquim', desc='UDL- Computer engineering\n\nLearning code...', profile_img='https://www.twitter.com/monkeydquim/photo', banner_img=None, geolocation='Lleida, Catalunya', join_date='Joined September 2017', following=227, followers=71)
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Tweet Scrapped
Failed to scrap one tweet
Tweet Scrapped
Tweet Scrapped
Failed to scrap one tweet
Failed to scrap one tweet
Tweet Sc