In [None]:
import re
import time
from datetime import date
from urllib.parse import quote

import emoji
import pandas as pd
from bs4 import BeautifulSoup as bs
from langdetect import DetectorFactory, detect, detect_langs
from pandas.tseries.offsets import DateOffset
from selenium import webdriver
from selenium.webdriver.common.by import By

# notebook display limits: show up to 1000 rows and 100 columns of a frame
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100

# Web Scraper

In [None]:
# specify query and create LinkedIn search url

query = 'enter query here'

# split() without an argument drops empty tokens, so leading, trailing or
# repeated spaces in the query cannot produce an empty keyword
keywords = query.split()

# percent-encode every keyword in full ('#' -> '%23', '&' -> '%26', ...) so
# no character of the query can be read as URL syntax, then join the
# keywords with an encoded space
url = (
    'https://www.linkedin.com/search/results/content/?keywords='
    + '%20'.join(quote(word, safe='') for word in keywords)
    + '&origin=SWITCH_SEARCH_VERTICAL&sid=rTP'
)

In [None]:
# launch chrome driver

# The driver path is no longer accepted as the first positional argument in
# newer Selenium 4 releases (that slot is now `options`), so passing
# 'chromedriver' there fails. Chrome() with no arguments still resolves
# "chromedriver" from PATH on older releases and lets Selenium locate the
# driver itself on newer ones.
browser = webdriver.Chrome()

In [None]:
# open the LinkedIn search url built above in the automated browser
# (the browser session needs to be logged in to LinkedIn beforehand)

browser.get(url)

In [None]:
# load all posts: keep scrolling to the bottom of the page until the page
# height stops growing

pause = 2          # seconds to wait for new posts after each scroll
max_failures = 5   # consecutive failed height reads tolerated before giving up

last_height = browser.execute_script("return document.body.scrollHeight")
failures = 0

while True:
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    time.sleep(pause)
    try:
        new_height = browser.execute_script("return document.body.scrollHeight")
    except Exception as e:
        # `Exception` (not a bare except) keeps the cell interruptible, and
        # the failure cap stops the loop from spinning forever when the
        # browser session is no longer usable
        failures += 1
        if failures >= max_failures:
            print("Stopped scrolling after {} failed height reads: {}".format(failures, e))
            break
        continue
    failures = 0

    if new_height == last_height:
        break
    last_height = new_height

In [None]:
# expand the comments of each post: click every "Load more comments"
# control found on the page once, pausing before and after each click

browser.execute_script("window.scrollTo(0, 0);")
load_more_xpath = "//span[text()='Load more comments']"
for load_more in browser.find_elements(By.XPATH, load_more_xpath):
    time.sleep(2)
    try:
        load_more.click()
    except Exception as click_error:
        # a control that cannot be clicked is reported and skipped
        print("Cannot click button: {}".format(click_error))
    time.sleep(2)

In [None]:
# extract browser source code and separate into individual containers for each post

page = browser.page_source

# page_source is already text, so it is parsed as-is (encoding it to bytes
# first only forces the parser to guess the charset again). "html.parser"
# names the standard-library parser explicitly; the bare "html" feature lets
# Beautiful Soup pick whichever parser happens to be installed.
linkedin_soup = bs(page, "html.parser")

# class attribute values of the post container variants; a multi-class
# string is matched against the exact class attribute value
post_classes = [
    "feed-shared-update-v2 feed-shared-update-v2--minimal-padding full-height relative feed-shared-update-v2--e2e artdeco-card",
    "feed-shared-update-v2 feed-shared-update-v2--minimal-padding full-height relative artdeco-card",
    "feed-shared-update-v2 feed-shared-update-v2--minimal-padding full-height relative feed-shared-update-v2--e2e feed-shared-update-v2--wrapped",
    "feed-shared-update-v2 feed-shared-update-v2--minimal-padding full-height relative feed-shared-update-v2--wrapped",
]

# collect the posts of every variant into one list, variant by variant
containers = []
for post_class in post_classes:
    containers.extend(linkedin_soup.find_all("div", {"class": post_class}))

In [None]:
# print number of posts scraped
# (the bare expression on the last line of the cell is what the notebook displays)

len(containers)

In [None]:
# extract features from source code

# class attribute values shared by several article layouts
_ARTICLE_DESC_CLASS = "update-components-article__description update-components-article__description--truncated t-12 t-black--light"
_ARTICLE_LINK_CLASS = "app-aware-link update-components-article__image-link tap-target"
_ARTICLE_IMG_CLASS = "ivm-view-attr__img--centered update-components-article__image lazy-image ember-view"
_ARTICLE_IMG_CLASS_ALT = "ivm-view-attr__img--centered ivm-view-attr__img update-components-article__image lazy-image ember-view"


def _stripped_text(node, tag, attrs):
    """Return the stripped text of the first `tag` under `node` matching `attrs`.

    Raises when `node` is None or nothing matches.
    """
    return node.find(tag, attrs).text.strip()


def _stripped_text_or(node, tag, attrs, default):
    """Like `_stripped_text`, but return `default` when the lookup fails."""
    # `Exception` instead of a bare except: a missing element is still
    # tolerated, but KeyboardInterrupt can stop the cell
    try:
        return _stripped_text(node, tag, attrs)
    except Exception:
        return default


def _first_attr(node, tag, class_names, attr):
    """Return `attr` of the first `tag` found under `node` for any of `class_names`.

    The class strings are tried in order; raises KeyError when none of them
    yields a tag carrying `attr`.
    """
    for class_name in class_names:
        try:
            return node.find(tag, {"class": class_name})[attr]
        except Exception:
            continue
    raise KeyError(attr)


def _media_fields(media_type, media_link='N/A', title='N/A', link='N/A', description='N/A'):
    """Build the five media/article columns of one post; unset fields are 'N/A'."""
    return {
        'Media Type': media_type,
        'Media Link': media_link,
        'Article Title': title,
        'Article Link': link,
        'Article Description': description,
    }


def _article_large_image(container):
    """Media fields of an article shared with a large preview image (raises if absent)."""
    article = container.find('div', {"class": "update-components-article--with-large-image"})
    link = article.find("a", {"class": _ARTICLE_LINK_CLASS})['href']
    title = _stripped_text(article, "span", {"dir": "ltr"})
    media_link = _first_attr(article, "img", [_ARTICLE_IMG_CLASS, _ARTICLE_IMG_CLASS_ALT], 'src')
    description = _stripped_text_or(article, 'h2', {'class': _ARTICLE_DESC_CLASS}, 'N/A')
    return _media_fields('Article (Large Image)', media_link, title, link, description)


def _article_no_image(container):
    """Media fields of an article shared without a preview image (raises if absent)."""
    article = container.find('div', {"class": "update-components-article--with-no-image"})
    title = _stripped_text(article, "h2", {"class": "t-14 update-components-article__title break-words t-bold t-black"})
    link = article.find("a", {"class": "app-aware-link update-components-article__meta flex-grow-1 full-width tap-target display-flex justify-space-between align-items-flex-start"})['href']
    description = _stripped_text_or(article, 'h2', {'class': _ARTICLE_DESC_CLASS}, 'N/A')
    return _media_fields('Article (No Image)', 'N/A', title, link, description)


def _article_small_image(container):
    """Media fields of an article shared with a small preview image (raises if absent)."""
    article = container.find('div', {"class": "update-components-article--with-small-image"})
    media_link = article.find("img", {"class": _ARTICLE_IMG_CLASS})['src']
    link = article.find("a", {"class": _ARTICLE_LINK_CLASS})['href']
    title = _stripped_text(article, "span", {"dir": "ltr"})
    return _media_fields('Article (Small Image)', media_link, title, link)


def _image(container):
    """Media fields of an image post (raises if absent)."""
    image = container.find('div', {"class": "update-components-image__container"})
    image_link = _first_attr(
        image,
        'img',
        [
            "ivm-view-attr__img--centered update-components-image__image lazy-image ember-view",
            "ivm-view-attr__img--centered update-components-image__image update-components-image__image--constrained lazy-image ember-view",
        ],
        'src',
    )
    return _media_fields('Image', image_link)


def _native_video(container):
    """Media fields of a video in the LinkedIn video container (raises if absent)."""
    video = container.find('div', {'class': "update-components-linkedin-video__container"})
    return _media_fields('Video', video.find('video', {'class': 'vjs-tech'})['src'])


def _external_video(container):
    """Media fields of a video in the external-video container (raises if absent)."""
    video = container.find('div', {'class': "feed-shared-external-video feed-shared-update-v2__content"})
    link = video.find('a', {'class': 'app-aware-link tap-target external-video-viewer__play-link play-video'})['href']
    return _media_fields('Video', link)


# tried in this order; the first extractor that succeeds decides the media type
_MEDIA_EXTRACTORS = (
    _article_large_image,
    _article_no_image,
    _article_small_image,
    _image,
    _native_video,
    _external_video,
)


def _extract_media(container):
    """Return the media/article columns of a post, or the 'None' media type."""
    for extractor in _MEDIA_EXTRACTORS:
        try:
            return extractor(container)
        except Exception:
            continue
    return _media_fields('None')


def _comment_texts(container):
    """Return the stripped text of every comment body in the post ([] on failure)."""
    try:
        bodies = container.find_all("div", {"class": "comments-comment-item-content-body break-words"})
        return [body.text.strip() for body in bodies]
    except Exception:
        return []


def _extract_post(container):
    """Return one post as a dict keyed by the column names of `dic`.

    'Query' and 'Query Date' are not included; the caller adds them.
    """
    date_posted = _stripped_text_or(container, 'span', {"aria-hidden": "true"}, "N/A")
    # the first aria-hidden span can be the "Follow" label instead of the post age
    if date_posted == 'Follow':
        date_posted = 'N/A'

    record = {
        'Name': _stripped_text_or(container, 'span', {"dir": "ltr"}, "N/A"),
        'Biography': _stripped_text_or(container, 'span', {"class": "update-components-actor__description t-12 t-normal t-black--light"}, ''),
        'Date': date_posted,
        'Text': _stripped_text_or(container, 'div', {"class": "update-components-text relative feed-shared-update-v2__commentary"}, ''),
        'Comments Text': _comment_texts(container),
        'Likes': _stripped_text_or(container, 'span', {"class": "social-details-social-counts__reactions-count"}, 0),
        'Comments': _stripped_text_or(container, 'li', {"class": "social-details-social-counts__item social-details-social-counts__comments"}, 0),
        'Reposts': _stripped_text_or(container, 'button', {"class": "ember-view t-black--light t-12 hoverable-link-text"}, 0),
    }
    record.update(_extract_media(container))
    return record


# one list per output column; every post appends exactly one value to each
dic = {
    'Name': [],
    'Biography': [],
    'Date': [],
    'Text': [],
    'Comments Text': [],
    'Likes': [],
    'Comments': [],
    'Reposts': [],
    'Media Type': [],
    'Media Link': [],
    'Article Title': [],
    'Article Link': [],
    'Article Description': [],
    'Query': [],
    'Query Date': []
}

today = date.today()

curr_date = today.strftime("%m/%d/%Y")

for container in containers:
    try:
        record = _extract_post(container)
    except Exception as e:
        # report the post instead of dropping it silently
        print("Skipping post that could not be parsed: {}".format(e))
        continue

    record['Query'] = query
    record['Query Date'] = curr_date

    # the record is complete before anything is appended, so a failure can
    # never leave the column lists with different lengths
    for column, value in record.items():
        dic[column].append(value)

# Data Cleaning

In [None]:
# print number of raw data points
# (the length of the 'Name' column is used as the number of scraped posts)

print('length of raw data: ', len(dic['Name']))

In [None]:
# clean text

def clean_text(text):
    """Return `text` with emoji, links and layout characters stripped.

    The string is passed through ``emoji.replace_emoji`` with its default
    replacement, every ``http...`` / ``www...`` token is removed up to the
    next whitespace, non-breaking spaces and newlines are deleted, and curly
    apostrophes are turned into plain ones.

    Parameters
    ----------
    text : str
        Raw scraped string.

    Returns
    -------
    str
        The cleaned string.
    """
    clean = emoji.replace_emoji(text)
    clean = re.sub(r'http\S+', '', clean)
    clean = re.sub(r'www\S+', '', clean)
    # NOTE(review): these two are deleted rather than replaced by a space, so
    # the words on either side are joined — confirm that is intended
    clean = clean.replace("\xa0", '')
    clean = clean.replace("\n", '')
    # "\u2019" is the curly apostrophe. "\u201a\u00c4\u00f4" is the same
    # character after its UTF-8 bytes were decoded as Mac Roman; matching
    # only that garbled form never normalised a real apostrophe.
    clean = clean.replace("\u2019", "'")
    clean = clean.replace("\u201a\u00c4\u00f4", "'")
    return clean

# columns that hold one free-text string per post
text_cols = [
    'Name',
    'Biography',
    'Text',
    'Article Title',
    'Article Description'
]

# overwrite each text column in place with its cleaned values
for column in text_cols:
    dic[column][:] = [clean_text(value) for value in dic[column]]

# 'Comments Text' holds a list of comment strings per post; clean each
# comment inside its own list
for post_comments in dic['Comments Text']:
    post_comments[:] = [clean_text(comment) for comment in post_comments]

df = pd.DataFrame(dic)

In [None]:
# remove duplicate posts

# two rows are treated as the same post when all of these columns match
dedupe_cols = [
    'Name',
    'Biography',
    'Date',
    'Text',
    'Media Type',
    'Article Title',
    'Article Description',
]
df = df.drop_duplicates(subset=dedupe_cols)
print('length after removing duplicates: ', len(df))

In [None]:
# remove posts with no text

# keep only the rows whose 'Text' is not the empty string
has_text = df['Text'].ne('')
df = df[has_text]
print('length after removing posts with no text: ', len(df))

In [None]:
# remove non-English posts

# langdetect is non-deterministic unless a seed is set; fixing it makes a
# re-run of the notebook keep the same posts
DetectorFactory.seed = 0


def detect_post_language(row):
    """Return the detected language code of a post, or 'unknown'.

    The post text is tried first, then the article description (unless it is
    the 'N/A' placeholder), then the author biography. The first field on
    which detection succeeds decides the result.
    """
    candidates = [row['Text']]
    if row['Article Description'] != 'N/A':
        candidates.append(row['Article Description'])
    candidates.append(row['Biography'])

    for candidate in candidates:
        try:
            return detect(candidate)
        except Exception:
            # detection failed on this field; fall back to the next one
            continue
    return 'unknown'


langs = [detect_post_language(row) for _, row in df.iterrows()]

# filter with a mask aligned to df's index instead of adding and deleting a
# temporary column on a filtered frame
is_english = pd.Series(langs, index=df.index, dtype=object) == 'en'
df = df[is_english]

print('length after removing non-English posts: ', len(df))

In [None]:
# clean features

def _to_count(column):
    """Convert scraped counter labels to integers.

    Every non-digit character is removed from the string form of each value;
    a value left without any digit becomes 0.
    """
    digits = column.astype(str).str.replace("[^0-9]", "", regex=True)
    return digits.replace('', '0').astype(int)


# Values assigned to the row yielded by iterrows() are not reliably written
# back to the frame, so the columns are rebuilt with vectorised operations on
# a copy instead.
df = df.copy()

df['Comments'] = _to_count(df['Comments'])
df['Reposts'] = _to_count(df['Reposts'])
df['Likes'] = _to_count(df['Likes'])

# strip the separator and the "Edited" marker from the scraped date text
df['Date'] = (
    df['Date']
    .str.replace(" •", "", regex=False)
    .str.replace(" Edited", "", regex=False)
)

# a comment list holding only an empty string means "no comments"
df['Comments Text'] = [
    [] if texts == [''] else texts
    for texts in df['Comments Text']
]

In [None]:
# format dates: turn the scraped relative age of each post (a number
# followed by a unit abbreviation) into an absolute date

# unit abbreviation -> DateOffset keyword
_AGE_UNITS = {
    'm': 'minutes',
    'h': 'hours',
    'd': 'days',
    'w': 'weeks',
    'mo': 'months',
    'yr': 'years',
}

# The unit must directly follow the number, so letters elsewhere in the text
# (e.g. the 'd' of "Edited") cannot be mistaken for a unit. 'mo' is listed
# before 'm' so months are not read as minutes.
_AGE_PATTERN = re.compile(r'(\d+)\s*(mo|yr|m|h|d|w)')


def _posted_on(query_date, age):
    """Return `query_date` minus the relative `age`, or pd.NaT if `age` cannot be parsed."""
    match = _AGE_PATTERN.search(age) if isinstance(age, str) else None
    if match is None:
        return pd.NaT
    amount = int(match.group(1))
    keyword = _AGE_UNITS[match.group(2)]
    return query_date - DateOffset(**{keyword: amount})


# NOTE(review): 'Query Date' is written as a date-only string, so minute and
# hour ages are subtracted from midnight and land on the previous day —
# confirm that is intended.
query_dates = pd.to_datetime(df['Query Date'])

dates = []
for query_date, age in zip(query_dates, df['Date']):
    posted = _posted_on(query_date, age)
    if posted is pd.NaT:
        print('could not parse post age: ', age)
    # unparseable ages keep their slot (as NaT) so the list stays aligned
    # with the rows of df
    dates.append(posted)

# index=df.index keeps each date on its own row (df no longer has a 0..n-1
# index after filtering); the explicit dtype keeps .dt usable when empty
dates = pd.Series(dates, index=df.index, dtype='datetime64[ns]').dt.strftime('%m/%d/%Y')

df = df.assign(Date=dates)

In [None]:
# display clean data
# (head() shows only the first rows of the cleaned frame)

df.head()

In [None]:
# This is a sanity check to make sure the scraper is working, and LinkedIn hasn't changed its source code.
# Make sure the different media types are captured, and that there aren't too many "None". 
# In large scrapes, all media types should appear. 
# Media Types: Article (Large Image), Article (Small Image), Article (No Image), Image, Video, None
# The output lists each media type found with the number of posts that have it.

df['Media Type'].value_counts()

In [None]:
# save scraped and cleaned data as json

# The query doubles as the file name, so characters that are not allowed in
# file names on common platforms (path separators, ':', '*', '?', '"', '<',
# '>', '|') are replaced with '_'. Queries without them keep their name.
output_name = re.sub(r'[\\/:*?"<>|]', '_', query) + '.json'
df.to_json(output_name)