# LinkedIn news feed classifier

### This notebook takes your LinkedIn news feed and classifies each post into 4 categories (Job lead, Event, Others and Advertisement)

The notebook is divided into 3 parts:
1. Import content
2. Classify content
3. Access the required category


The feed is imported with Selenium, which drives Chrome through chromedriver. The latest version of chromedriver is available at https://sites.google.com/chromium.org/driver/

## 1. Import content

In [None]:
# Required Imports
import time
import pandas as pd
import re as re
import pickle
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from IPython.core.display import display, HTML 
from IPython.display import IFrame
from sklearn.pipeline import Pipeline 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the saved LinkedIn credentials, or ask for them and save them for next time.
# Adapted from https://christophegaron.com/scraping-linkedin-posts-with-selenium-and-beautiful-soup/
#
# NOTE(review): the password is stored in plain text in the working directory;
# keep linkedin_credentials.txt out of version control.
from getpass import getpass

CREDENTIALS_FILE = "linkedin_credentials.txt"
USERNAME_PREFIX = "username="
PASSWORD_MARKER = ", password="

try:
    # The with-block closes the handle even if reading fails.
    with open(CREDENTIALS_FILE, "r") as f:
        contents = f.read()
    # Split once on the marker so a password containing "=" or "," survives.
    user_part, marker, password = contents.partition(PASSWORD_MARKER)
    if not marker or not user_part.startswith(USERNAME_PREFIX):
        raise ValueError("unrecognised credentials file format")
    username = user_part[len(USERNAME_PREFIX):]
    # A hand-edited file may end with a line break that is not part of the password.
    password = password.rstrip("\r\n")
except (OSError, ValueError):
    # File missing, unreadable or malformed: prompt, then (re)write it in the
    # same "username=..., password=..." format the reader above expects.
    username = input('Enter your linkedin username: ')
    # getpass keeps the typed password out of the saved cell output.
    password = getpass('Enter your linkedin password: ')
    with open(CREDENTIALS_FILE, "w") as f:
        f.write("username={}, password={}".format(username, password))

In [None]:
# Start Chrome through Selenium. With no arguments Selenium looks for
# chromedriver on PATH, which is what the old positional 'chromedriver'
# argument did; that positional driver path is no longer accepted by current
# Selenium 4 releases.
browser = webdriver.Chrome()

# Open the login page
browser.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')

# Fill in the login form. find_element(by, value) exists in both Selenium 3
# and 4, whereas the find_element_by_id shortcut was removed in Selenium 4.
# "id" is the value of selenium.webdriver.common.by.By.ID.
username_field = browser.find_element("id", "username")
username_field.send_keys(username)

password_field = browser.find_element("id", "password")
password_field.send_keys(password)
# Submit the login form from the password field.
password_field.submit()


In [None]:
# Simulate scrolling so that more posts get loaded into the page.
SCROLL_PAUSE_TIME = 1.5  # seconds to wait after each scroll
MAX_PAGES = 30           # maximum number of scrolls to perform
PAGES = 0                # number of scrolls performed so far

SCROLL_HEIGHT_JS = "return document.body.scrollHeight"
last_height = browser.execute_script(SCROLL_HEIGHT_JS)

for PAGES in range(1, MAX_PAGES + 1):
    # Jump to the current bottom of the page, then give it time to load.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)

    new_height = browser.execute_script(SCROLL_HEIGHT_JS)
    if new_height == last_height:
        # The page did not grow after this scroll, so stop early.
        break
    last_height = new_height

In [None]:
# Grab the HTML of the scrolled page
company_page = browser.page_source

# Parse it with Beautiful Soup. The page source is passed as text (no
# encode/decode round trip), and the parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed.
# (The former prettify() call was dropped: its return value was never used.)
linkedin_soup = bs(company_page, "html.parser")

# Find the post blocks: every element that has a data-urn attribute
containers = linkedin_soup.find_all(attrs={"data-urn": True})
print(len(containers))

In [None]:
# Keep only the containers whose markup includes the post-commentary <div>;
# containers without it are dropped.
COMMENTARY_DIV = '<div class="feed-shared-text relative feed-shared-update-v2__commentary" dir="ltr">'

updated_containers = [
    container
    for container in containers
    if COMMENTARY_DIV in str(container)
]
print(len(updated_containers))

In [None]:
# Wrap the list of containers as a single row, then transpose it so that each
# container ends up in its own row of a single column named "html_content".
HTML_COLUMN = ["html_content"]
dataframe_content = pd.DataFrame([updated_containers]).transpose()
dataframe_content.columns = HTML_COLUMN

In [None]:
# Preview the first five rows of the raw HTML column
dataframe_content.head(n=5)

In [None]:
def complete_cleaner(inputs):
    """
    Strip HTML markup and emoji/symbol characters from each item and
    lower-case what is left.

    Args:
        inputs: iterable of items to clean. Each item is converted with
            str() first, so parsed HTML elements and plain strings both work.

    Returns:
        A list with one cleaned, lower-cased string per input item, in
        input order.

    Notes:
        * Tags, character references and newlines are deleted, not replaced
          by a space, so text on either side is joined directly.
        * The symbol filter removes U+200D, U+231A, U+23CF, U+23E9 and every
          code point from U+24C2 upwards. That is the exact union of the
          emoji ranges this function has always used, and it also removes
          CJK text. It is kept unchanged on purpose: presumably the
          classifier was trained on text cleaned this way (TODO confirm).
    """
    # A tag, or a named / decimal / hex character reference. DOTALL lets a
    # tag whose attributes wrap over several lines be removed as one tag.
    tag_or_entity = re.compile(
        '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
        flags=re.DOTALL,
    )
    # Runs of emoji / symbol characters (see Notes for the exact set).
    symbol_run = re.compile("[\u200d\u231a\u23cf\u23e9\u24c2-\U0010ffff]+")

    cleaned_content = []
    for item in inputs:
        text = tag_or_entity.sub("", str(item))
        text = text.replace('\n', "")
        text = symbol_run.sub("", text)
        cleaned_content.append(text.lower())
    return cleaned_content

In [None]:
# Extract the cleaned text of each post: the first "break-words" span found
# in the row's HTML.
text_content = []
for post_html in dataframe_content["html_content"]:
    # Name the parser explicitly so bs4 does not have to guess one.
    each_row_soup = bs(str(post_html), "html.parser")
    containers_span_each_row = each_row_soup.find_all("span", {"class": "break-words"})
    # Only the first span is used, so only that one is cleaned.
    cleaned_spans = complete_cleaner(containers_span_each_row[:1])
    # A post without such a span gets an empty string instead of raising
    # IndexError, so the list keeps one entry per dataframe row.
    text_content.append(cleaned_spans[0] if cleaned_spans else "")

In [None]:
# Add the cleaned post text to the dataframe as a new "Text" column.
# text_content was built with one entry per dataframe row, in row order.
dataframe_content["Text"]=text_content

## 2. Classify content

# Load the model to predict

In [None]:
# Load the pickled classifier pipeline from the working directory.
# NOTE: unpickling can execute code stored in the file, so only load a
# pickle file that you trust.
CLASSIFIER_PATH = 'pipeline_classifier.pickle'
with open(CLASSIFIER_PATH, mode='rb') as model_file:
    pipeline_classifier = pickle.load(model_file)

In [None]:
# Run the classifier on the cleaned text and store its predictions in the
# "Label" column of the dataframe
predicted_labels = pipeline_classifier.predict(dataframe_content["Text"])
dataframe_content["Label"] = predicted_labels

In [None]:
# Preview the first five rows to confirm the "Label" column is present
dataframe_content.head(n=5)

In [None]:
# Bar chart of how many posts in the feed received each label
sns.set()
sns.countplot(data=dataframe_content, x="Label")
plt.show()

## 3. Access the required category

In [None]:
# Render the stored HTML of every post that carries the chosen label
def select_label(label):
    """
    Display the posts of one category inside the notebook.

    label selects the category: 1 = "Job lead", 2 = "Event", 3 = "Others",
    4 = "Advertisement". The html_content of all matching rows of
    dataframe_content is joined with spaces and rendered as HTML.
    Any other value prints a short usage hint instead.

    Always returns None.
    """
    categories = (
        (1, "Job lead"),
        (2, "Event"),
        (3, "Others"),
        (4, "Advertisement"),
    )
    for code, category in categories:
        if label == code:
            matching_rows = dataframe_content.loc[dataframe_content["Label"] == category]
            container_x = " ".join(str(x) for x in matching_rows["html_content"])
            display(HTML(container_x))
            return None
    # No category matched: tell the user which values are accepted.
    return print("Select label 1,2,3 or 4")

## Use the `select_label` function to pick the desired content.
    For Job leads, use select_label(1)
    For Events, use select_label(2)
    For Others, use select_label(3)
    For Advertisements, use select_label(4)

In [None]:
# Example: show the posts that were classified as job leads
select_label(label=1)