In [1]:
# Importing packages
import pandas as pd
import numpy as np
import re
from collections import OrderedDict 

In [2]:
# Read the paragraphs, keep only the first row for each distinct 'Text'
# value, and renumber the remaining rows from zero.
data = (
    pd.read_csv('Paras1.csv')
    .drop_duplicates(subset=['Text'])
    .reset_index(drop=True)
)
data

Unnamed: 0,Link,Company,Page Name,Snippet,Query,Text
0,https://www.microsoft.com/en-us/diversity/insi...,Microsoft,Inclusive Hiring at Microsoft,Our annual Ability Summit brings Microsoft emp...,disabled employees at Microsoft,With the unemployment rate for people with dis...
1,https://www.microsoft.com/en-us/diversity/insi...,Microsoft,Inclusive Hiring at Microsoft,Our annual Ability Summit brings Microsoft emp...,disabled employees at Microsoft,"From the very first days of our company, Micro..."
2,https://www.microsoft.com/en-us/diversity/insi...,Microsoft,Inclusive Hiring at Microsoft,Our annual Ability Summit brings Microsoft emp...,disabled employees at Microsoft,"Recruit, onboard and development of Neurodiver..."
3,https://www.microsoft.com/en-us/diversity/insi...,Microsoft,Inclusive Hiring at Microsoft,Our annual Ability Summit brings Microsoft emp...,disabled employees at Microsoft,Microsoft is striving to promote an inclusive ...
4,https://www.microsoft.com/en-us/diversity/insi...,Microsoft,Inclusive Hiring at Microsoft,Our annual Ability Summit brings Microsoft emp...,disabled employees at Microsoft,The Disability ERG represents employees with d...
...,...,...,...,...,...,...
8834,https://sway.office.com/ZIetX05BXMPoBLHk,Microsoft,Our Disability Inclusion Journey,,,Health and Wellness
8835,https://sway.office.com/ZIetX05BXMPoBLHk,Microsoft,Our Disability Inclusion Journey,,,Team Centric Training
8836,https://sway.office.com/ZIetX05BXMPoBLHk,Microsoft,Our Disability Inclusion Journey,,,Company-wide Training
8837,https://sway.office.com/ZIetX05BXMPoBLHk,Microsoft,Our Disability Inclusion Journey,,,Employee Community


In [3]:
# Defining list of keywords and keyword categories.
# Python concatenates adjacent string literals, so a missing comma silently
# merges two entries (e.g. 'hearing impaired' 'deaf' -> 'hearing impaireddeaf').
# Every entry below is therefore followed by a comma.
keywords = [
    'disabled people', 'disabled employee', 'family friendly benefits', 'parental benefits',
    'leave benefits', 'health benefits', 'disability benefits', 'employee benefits',
    'disability', 'disabled person', 'disabilities', 'autism', 'autistic',
    'accessibility', 'accessible', 'erg', 'employee resource group', 'inclusivity', 'inclusion',
    'inclusive', 'visually impaired', 'hearing impaired', 'deaf',
    'discriminate', 'discriminating', 'discrimination', 'equal employment', 'equal opportunity',
    'accommodation', 'accommodate', 'accommodations',
    'accommodating', 'diversity', 'diverse', 'ndeam', 'national disability employment awareness month',
    'a11y', 'abilities', 'paralysed', 'paralyzed',
    'paralysis', 'blind', 'blindness', 'culture', 'representation', 'dei', 'd&i', 'wheelchair',
    'neurodiverse', 'deafness', 'hire', 'hired', 'hiring', 'dream job', 'job shortage', 'new job',
    'job skills', 'job search', 'job hunt', 'jobs', 'job seekers', 'job opportunity', 'job opportunities',
    'interview', 'employment', 'employer', 'recruitment', 'recruiting', 'recruit',
]

# Keyword -> category mapping used to label paragraphs and pages.
# NOTE(review): 'discrimination' is in `keywords` but in no category, and
# 'resource group' is in 'culture' but not in `keywords` -- confirm whether
# that is intentional.
categories = {
    'disability': [
        'disabled people', 'disabled employee', 'disability', 'disabled person', 'disabilities',
        'autism', 'autistic',
        'accessibility', 'accessible', 'visually impaired', 'hearing impaired',
        'deaf', 'accommodation', 'accommodations', 'accommodate', 'accommodating', 'paralysis',
        'blind', 'blindness', 'ndeam', 'national disability employment awareness month',
        'a11y', 'abilities', 'paralysed', 'paralyzed', 'wheelchair',
        'neurodiverse', 'deafness',
    ],
    'hiring': [
        'family friendly benefits', 'parental benefits', 'leave benefits', 'health benefits',
        'disability benefits', 'employee benefits', 'equal employment', 'equal opportunity',
        'hire', 'hired', 'hiring', 'dream job', 'job shortage', 'new job',
        'job skills', 'job search', 'job hunt', 'jobs',
        'job seekers', 'job opportunity', 'job opportunities', 'interview', 'employment',
        'employer', 'recruitment', 'recruiting', 'recruit',
    ],
    'culture': [
        'erg', 'employee resource group', 'resource group', 'inclusivity', 'inclusion',
        'inclusive', 'discriminate',
        'discriminating', 'diversity', 'diverse', 'culture', 'representation', 'dei', 'd&i',
    ],
}

# Substrings looked for in a page URL to derive its source type.
source = ['news', 'blog', 'support', 'investor', 'csr', 'careers', 'privacy', 'jobs']

In [None]:
# Function to identify all keywords in text
def keys(str):
    """Return the words of every keyword found in the text.

    Each entry of the module-level ``keywords`` list is searched for as a
    whole word or phrase (``\\b`` boundaries). The search is case-sensitive,
    so the text should already be lower-cased, as the notebook's callers do.

    Parameters
    ----------
    str : str
        Text to scan.

    Returns
    -------
    str
        The individual words of all matched keywords, de-duplicated, sorted
        alphabetically and joined by single spaces. Multi-word keywords are
        split into their words. Empty string when nothing matches.
    """
    # re.escape makes each keyword match literally, so an entry containing a
    # regex metacharacter cannot change the pattern's meaning.
    hits = [kw for kw in keywords if re.search(r'\b' + re.escape(kw) + r'\b', str)]
    return ' '.join(sorted(set(' '.join(hits).split())))

# Function to get the count of words in the text
def getlen(str):
    """Return how many whitespace-separated tokens ``str`` contains."""
    tokens = str.split()
    return len(tokens)

# Function to check the relevance of text based on keywords
def isRelevant(str):
    """Flag whether the text mentions at least one keyword.

    Each entry of the module-level ``keywords`` list is searched for as a
    whole word or phrase (``\\b`` boundaries). The search is case-sensitive,
    so the text should already be lower-cased, as the notebook's callers do.

    Parameters
    ----------
    str : str
        Text to scan.

    Returns
    -------
    int
        1 if any keyword occurs in the text, otherwise 0.
    """
    # re.escape makes each keyword match literally; any() stops at the first hit.
    found = any(re.search(r'\b' + re.escape(kw) + r'\b', str) for kw in keywords)
    return 1 if found else 0

# Function to get article source type
def sourcetype(str):
    """Build the source-type string for a URL.

    Every entry of the module-level ``source`` list that ``re.search`` finds
    anywhere in ``str`` is kept, in list order, separated by single spaces.
    Returns an empty string when no entry is found.
    """
    pieces = (name + ' ' for name in source if re.search(name, str))
    return ''.join(pieces).strip()

# Function to identify all keyword categories in text
def tags(str):
    """Return the keyword categories whose keywords appear in the text.

    A category from the module-level ``categories`` dict is reported when at
    least one of its keywords occurs as a whole word or phrase (``\\b``
    boundaries). The search is case-sensitive, so the text should already be
    lower-cased, as the notebook's callers do.

    Parameters
    ----------
    str : str
        Text to scan.

    Returns
    -------
    str
        Matching category names among 'culture', 'disability' and 'hiring',
        in alphabetical order, joined by single spaces. Empty string when no
        category matches.
    """
    # The tuple is already in alphabetical order, so no sort is needed.
    # re.escape makes each keyword match literally.
    matched = [
        label
        for label in ('culture', 'disability', 'hiring')
        if any(re.search(r'\b' + re.escape(term) + r'\b', str) for term in categories[label])
    ]
    return ' '.join(matched)

In [None]:
# Derive the feature columns from each paragraph and its URL.
# The helper functions match case-sensitively against lower-case terms, so
# every value is lower-cased before being passed in.
data = data.assign(
    Relevance=data['Text'].map(lambda text: isRelevant(text.lower())),
    Length=data['Text'].map(lambda text: getlen(text.lower())),
    Labels=data['Text'].map(lambda text: tags(text.lower())),
    Keywords=data['Text'].map(lambda text: keys(text.lower())),
    Source=data['Link'].map(lambda url: sourcetype(url.lower())),
)

In [None]:
# Keep only paragraphs flagged as relevant that are longer than 7 words
data = data.loc[data['Relevance'].eq(1) & data['Length'].gt(7)]

In [None]:
# Display the current contents of `data`
data

In [None]:
# Drop paragraphs whose URL-derived source type is exactly one of
# 'privacy', 'support', 'investor', 'news investor privacy' or 'news investor'
data = data.loc[
    ~data['Source'].isin(
        ['privacy', 'support', 'investor', 'news investor privacy', 'news investor']
    )
]
data

In [None]:
# Number of paragraphs per label combination
data['Labels'].value_counts()

In [None]:
# Drop paragraphs labelled only 'disability' whose sole keyword is 'accessible';
# a row is kept when either of those two conditions fails.
data = data.loc[(data['Labels'] != 'disability') | (data['Keywords'] != 'accessible')]
data

In [None]:
# Number of distinct page URLs remaining (a missing URL counts as one value)
data['Link'].nunique(dropna=False)

In [None]:
# Write the filtered paragraphs to an Excel workbook, with the column
# header row and without the index column
data.to_excel('Relevant.xlsx', index=False, header=True)

In [None]:
# Concatenating paras to form article with relevant text only:
# one row per page, holding the joined text and the combined keyword words.
new_df = data[['Link', 'Company', 'Page Name', 'Source']].copy()
# All relevant paragraphs of a link, one per line, in their current order.
new_df['Text'] = data.groupby(['Link'])['Text'].transform(lambda x: '\n'.join(x))
# Combined keyword words of a link. They are sorted because plain set
# iteration order for strings changes between interpreter runs, which would
# make the exported file differ from run to run.
new_df['Keywords'] = (
    data.groupby(['Link'])['Keywords']
    .transform(lambda x: ' '.join(x))
    .apply(lambda x: ' '.join(sorted(set(x.split()))))
)
# Every paragraph row of a link now carries identical values; keep one.
new_df = new_df.drop_duplicates().reset_index(drop=True)
new_df

In [None]:
# Number of rows in new_df per company
new_df['Company'].value_counts()

In [None]:
# Label each page with the keyword categories found in its lower-cased text
new_df['Labels'] = new_df['Text'].map(lambda page_text: tags(page_text.lower()))
new_df

In [None]:
# Write the page-level frame to an Excel workbook, with the column
# header row and without the index column
new_df.to_excel('Pages.xlsx', index=False, header=True)

In [None]:
# Number of pages per label combination
new_df['Labels'].value_counts()

In [None]:
# Dividing articles for the 'Inclusive Hiring' and 'Employee Community' tabs.
# Pages whose labels include 'hiring' go to the first tab; pages labelled only
# with 'culture' and/or 'disability' go to the second.
inclusive_hiring = ['hiring', 'disability hiring', 'culture hiring', 'culture disability hiring']
employee_community = ['culture', 'culture disability', 'disability']
# .copy() makes each subset an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
inc_hiring = new_df.loc[new_df['Labels'].isin(inclusive_hiring)].copy().reset_index(drop=True)
emp_community = new_df.loc[new_df['Labels'].isin(employee_community)].copy().reset_index(drop=True)

In [None]:
# Display the pages selected for the 'Inclusive Hiring' tab
inc_hiring

In [None]:
# Text to show on the website: the first keyword-bearing paragraph of each page.
# A dedicated list is used so that the global `keywords` list, which keys()
# and isRelevant() read, is not overwritten by this cell.
inch_keywords = [
    term
    for category in ('hiring', 'disability', 'culture')
    for term in categories[category]
]


def inch_para(str):
    """Return the first paragraph of ``str`` that mentions a keyword.

    ``str`` is split on newlines. A paragraph qualifies when its lower-cased
    text contains any ``inch_keywords`` entry as a whole word or phrase and
    its first word is not 'written' (case-insensitive).

    Parameters
    ----------
    str : str
        Page text with one paragraph per line.

    Returns
    -------
    str
        The first qualifying paragraph with surrounding whitespace stripped,
        or an empty string when no paragraph qualifies.
    """
    for para in str.split('\n'):
        words = para.split()
        # Skip blank lines and paragraphs whose first word is 'written'.
        if not words or words[0].lower() == 'written':
            continue
        lowered = para.lower()
        # re.escape makes each keyword match literally.
        if any(re.search(r'\b' + re.escape(term) + r'\b', lowered) for term in inch_keywords):
            return para.strip()
    # Nothing qualified: return an empty string rather than raising IndexError.
    return ''


# assign() returns a new frame, so no column is set on a filtered slice.
inc_hiring = inc_hiring.assign(Show=inc_hiring['Text'].map(inch_para))

In [None]:
# Text to show on the website: the first keyword-bearing paragraph of each page.
# A dedicated list is used so that the global `keywords` list, which keys()
# and isRelevant() read, is not overwritten by this cell.
empcom_keywords = [
    term
    for category in ('culture', 'disability')
    for term in categories[category]
]


def empcom_para(str):
    """Return the first paragraph of ``str`` that mentions a keyword.

    ``str`` is split on newlines. A paragraph qualifies when its lower-cased
    text contains any ``empcom_keywords`` entry as a whole word or phrase and
    its first word is not 'written' (case-insensitive).

    Parameters
    ----------
    str : str
        Page text with one paragraph per line.

    Returns
    -------
    str
        The first qualifying paragraph with surrounding whitespace stripped,
        or an empty string when no paragraph qualifies.
    """
    for para in str.split('\n'):
        words = para.split()
        # Skip blank lines and paragraphs whose first word is 'written'.
        if not words or words[0].lower() == 'written':
            continue
        lowered = para.lower()
        # re.escape makes each keyword match literally.
        if any(re.search(r'\b' + re.escape(term) + r'\b', lowered) for term in empcom_keywords):
            return para.strip()
    # Nothing qualified: return an empty string rather than raising IndexError.
    return ''


# assign() returns a new frame, so no column is set on a filtered slice.
emp_community = emp_community.assign(Show=emp_community['Text'].map(empcom_para))

In [None]:
# Write one workbook per website tab, with the column header row and
# without the index column
inc_hiring.to_excel('Inclusive Hiring.xlsx', index=False, header=True)
emp_community.to_excel('Employee Community.xlsx', index=False, header=True)