# Hack the job market like a data scientist
## Problem - Find company leads with a suitable cultural fit
A job seeker must access the hidden job market to find 80% of available positions. Manual search methods can provide basic such as sector, company size, and hiring outlook. But cultural fit is difficult to assess:
- how cool is the company?
- are they innovative or conservative?
- do they have leadership or a "me too" approach?

## Solution - Follow other people's career journey
Collect names of firms with the desired cultural fit. For example, IDEO for "design thinking" or Gallop for "innovative approaches to leadership." Then scrape Linkedin search for people who were formerly at that company, and collect the list of their subsequent firms.

In [1]:
%matplotlib inline

import pandas as pd
import requests
import lxml.html
from lxml.html.clean import Cleaner
import re
import json
from time import sleep
from random import randint
import csv
import matplotlib.pyplot as plt

from os import listdir
from os.path import isfile, join


In [12]:
# list all files in directory
# load the first
# parse the html for person record
folder = 'data'
onlyfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
onlyfiles[0]

def load_connections():

    # get files in /data and supfolders
    folder = 'data'
    files = [f for f in listdir(folder) if (isfile(join(folder, f)) and f != '.DS_Store')]
    
    main_results = []
    
    print(files)

    
    
    for i in range(0, len(files)):
#    for i in range(0, 1):
        
        file = folder + '/' + files[i]

        f = open(file, 'r')
                
        json_start = '<code id="voltron_srp_main-content" style="display:none;"><!--'
        json_end = '--></code>'
        
        
        html = f.read()

        json_data = json.loads(html[(html.find(json_start) + len(json_start)):html.find(json_end)])
        
        #print(json_data)
        
        # typically 10 results, but 0 is the query
        count = len(json_data['content']['page']['voltron_unified_search_json']['search']['results'])
        json_people = json_data['content']['page']['voltron_unified_search_json']['search']['results'][1:count] #['person']['firstName'] #['person']

        keywords = ''
        
        try:
            keywords = json_data['content']['page']['voltron_unified_search_json']['search']['results'][0]['queryRewrite']['escapeHatchUrl'] #.find('keywords=')
            keywords = keywords[keywords.find('keywords=')+9:]
            keywords = keywords[:keywords.find('&')]
        except:
            print('oops')
        
        
        for person in json_people:
            
            # remove filters and other non-person data
            if 'person' in person:
                
                dat = {}
                
                dat['keywords'] = keywords
                dat['connections'] = person['person']['connectionCount']
                dat['degree'] = person['person']['distance']
                dat['first'] = person['person']['firstName']
                dat['last'] = person['person']['lastName']
                if 'sharedConnectionCount' in person['person']:
                    dat['shared_connections'] = person['person']['sharedConnectionCount']
                    dat['shared_ids'] = person['person']['sharedConnectionIds']
                    
                dat['industry'] = person['person']['fmt_industry']
                dat['location'] = person['person']['fmt_location']
                dat['id'] = person['person']['id']

                dat['type'] = 'Top'
                dat['heading'] = person['person']['fmt_headline']
                
                # company - anything after at
                company = person['person']['fmt_headline']

                bold_company = re.compile('<B>(.*?)</B>').findall(company)
                if bold_company:
                    company = bold_company[0]
                
                elif company.find('at ') > 0:
                    company = company[company.find('at ')+3:]
                
                dat['company'] = company

                # append and create new
                main_results.append(dat)
                dat = dat.copy()
                
                if len(person['person']['snippets']) < 1: print('ERROR: No Snippets')
                
                for snippet in person['person']['snippets']:
                    
                    
                    if 'bodyList' in snippet:
                        for item in snippet['bodyList']:  
                            dat['type'] = snippet['fieldName']
                            dat['heading'] = item
                            company = item
                            
                            bold_company = re.compile('<B>(.*?)</B>').findall(company)
                            if bold_company:
                                company = bold_company[0]
                            elif company.find('at ') > 0:
                                company = company[company.find('at ')+3:]

                            dat['company'] = company
                            
                            # append and create new
                            main_results.append(dat)
                            dat = dat.copy()

                    if 'heading' in snippet:
                        dat['type'] = snippet['fieldName']
                        dat['heading'] = snippet['heading']
                        
                        company = snippet['heading']
                        bold_company = re.compile('<B>(.*?)</B>').findall(company)
                        if bold_company:
                            company = bold_company[0]
                        elif company.find('at ') > 0:
                            company = company[company.find('at ')+3:]

                        dat['company'] = company

                        # append and create new
                        main_results.append(dat)
                        dat = dat.copy()
                        
                #main_results.append(dat)

            #print(json.dumps(person['person'], indent=4, sort_keys=True))

    print('Done processing files')
    
    df = pd.DataFrame.from_dict(main_results, dtype=None)
    
    return df


df = load_connections()


['gallup1.html', 'gallup10', 'gallup11', 'gallup12', 'gallup13', 'gallup14', 'gallup2', 'gallup3', 'gallup4', 'gallup5', 'gallup6', 'gallup7', 'gallup8', 'gallup9', 'ideo']
oops
ERROR: No Snippets
ERROR: No Snippets


KeyError: 'connectionCount'

In [13]:
df


Unnamed: 0,company,connections,degree,first,heading,id,industry,keywords,last,location,shared_connections,shared_ids,type
0,MMG,227,2,Kim,Proposal Writer at MMG,63102796,Marketing and Advertising,gallup,Rowse,Washington D.C. Metro Area,1,2353260,Top
1,Gallup,227,2,Kim,"Senior Consultant, Contract Services Advisor a...",63102796,Marketing and Advertising,gallup,Rowse,Washington D.C. Metro Area,1,2353260,Past
2,Crosby Marketing Communications,501,2,Meredith,Executive Vice President at Crosby Marketing C...,39098764,"Health, Wellness and Fitness",gallup,Williams,Washington D.C. Metro Area,1,2353260,Top
3,Gallup,501,2,Meredith,"Partner, Government Strategic Communications a...",39098764,"Health, Wellness and Fitness",gallup,Williams,Washington D.C. Metro Area,1,2353260,Past
4,Pew Research Center,501,2,Kyley,Research Methodologist at Pew Research Center,118193497,Research,gallup,McGeeney,Washington D.C. Metro Area,1,2353260,Top
5,Gallup,501,2,Kyley,Advanced Design and Analytics Consultant at <B...,118193497,Research,gallup,McGeeney,Washington D.C. Metro Area,1,2353260,Past
6,Senior Manager,501,2,James,Senior Manager,22668094,Management Consulting,gallup,Kilpatrick,Washington D.C. Metro Area,1,2353260,Top
7,Gallup,501,2,James,Senior Consultant - Client Development&#x2F; A...,22668094,Management Consulting,gallup,Kilpatrick,Washington D.C. Metro Area,1,2353260,Past
8,APOPO,501,2,Charles,US Director at APOPO,35590853,Nonprofit Organization Management,gallup,Richter,Washington D.C. Metro Area,3,34322719789992312003734,Top
9,Gallup,501,2,Charles,Business Development Consultant at <B>Gallup</B>,35590853,Nonprofit Organization Management,gallup,Richter,Washington D.C. Metro Area,3,34322719789992312003734,Past


In [3]:
#df[df.company == 'kgb']

In [87]:
blacklist_companies = ['Gallup', '42 Winks Productions', 'Edgerton.Life', 'Move Sleep, StrengthsFinder 2.0', 
                       'Fortune 500 Business Development Executive', 'CoachingMillennials', 'LifeCourse Associates', 'Versatile and Experienced Manager',
                       'Senior Strategic Consultant','Saeculum Research', 'Senior Consultant', 'GALLUP', 'Capture Strategy/Proposal Management Consultant',
                       'proactive people connector, always looking to help and ideate around achieving meaningful change', 'Organizational Effectiveness Management Consultant | Marketing Professional  | Market Analytics Enthusiast',
                       'Organizational Effectiveness &amp; Human Capital Optimization Consultant', 'Experienced Hospitality Professional','Audience Development Strategist','Venture Capital',
                       'Sr. Recruitment Consultant', 'CEO','Senior Director in Sales Management, dedicated to building and leading high-performing teams',
                       'Providing on-demand resources, consulting and training in Organizational Development; TEDx Speaker; Executive Coach','Senior Research Consultant','du /Gallup','Regional Director of Client Service','Education Consultant',
                       'Consultant','Market Research and Consumer Insights Professional','Director of Research, Applied Sociologist','Learning &amp; Development Advanced Consultant','Brand and Marketing Strategist','Chief of Party, RADP-S',
                       'Best Selling Author, Speaker &amp; Executive Leadership Advisor','I help professional services firms grow and scale','Researcher and Consultant - Industrial-Organizational Psychology',
                       'Researcher and Consultant - Industrial-Organizational Psychology']
df[((df.type != 'Education') & (~df.company.isin(blacklist_companies)))]['company'].value_counts()
#df.company.unique()

SBD, ACT-IAC Rocky Mountain Chapter Industry Chair                  2
PANDORA                                                             2
Southern Hospitality, Inc                                           2
Organization for Autism Research                                    2
Solutions By Design                                                 2
Prometric                                                           2
kgb                                                                 2
Louisiana Technology Park                                           2
GTM Research                                                        2
founder, ceo &amp; codebot @ wallet.AI                              2
United States Department of Defense                                 2
Zscaler                                                             2
I-Flow, a Kimberly-Clark Corporation                                1
The Coffman Organization                                            1
PwC Management Consu