In [1]:
'''Make sure to have requirements installed.'''
!pip3 install -r requirements.txt



In [2]:
from linkedin_api import Linkedin
import pandas as pd
import logging
from tqdm.notebook import tqdm

#### The LinkedIn API client could not be subclassed because the requirements of the underlying client are unknown. Nevertheless, a wrapper class is a good way to limit unneeded access and to structure data and requests. Built on top of this unofficial LinkedIn API, `LinkedIn_Requests` provides the functionality needed for this use case.

#### To see the full functionality available in the API, read the docs here: https://linkedin-api.readthedocs.io/en/latest/api.html

In [3]:
class LinkedIn_Requests:
    '''Narrow wrapper around an authenticated ``Linkedin`` client.

    Only the calls this notebook needs are exposed: person search,
    connection lookup, and name / e-mail retrieval.
    '''

    def __init__(self, email, pwd, logger):
        '''Authenticate and remember the authenticated user's display name.

        Args:
            email: Login e-mail handed to ``Linkedin``.
            pwd: Password handed to ``Linkedin``.
            logger: Object providing ``warning`` and ``error`` methods.
        '''
        self.logger = logger
        self.auth = Linkedin(email, pwd)
        my_profile = self.auth.get_user_profile()
        self.my_name = my_profile['miniProfile']['firstName'] + ' ' + my_profile['miniProfile']['lastName']

    def get_person_profile_ids(self, name, company):
        '''Return ``(urn_id, public_id)`` for the searched person, or ``None``.

        ``None`` is returned whenever the search handler decided to skip
        the entry (no usable search terms, or not exactly one match).
        '''
        search_result = self.__search_person_handler(name, company)
        if search_result:
            res = search_result[0]
            return res['urn_id'], res['public_id']
        return None

    def __search_person_handler(self, name, company):
        '''Validate the search terms, run the search and vet the result.

        Returns the raw search result list when it holds exactly one entry,
        otherwise logs an error and returns ``None``.
        '''
        # Non-string terms (e.g. a missing cell) cannot be joined into a
        # keyword query; skip them instead of failing inside the search.
        if not isinstance(name, str) or not isinstance(company, str):
            self.logger.error(f'Search terms must be strings, got {name!r} and {company!r}. Skipping.')
            return None

        # Non-ascii terms are still searched; the warning only flags them.
        # Only the encoding failure is caught so that unrelated errors are
        # not mislabelled as an encoding problem.
        try:
            name.encode('ascii')
            company.encode('ascii')
        except UnicodeEncodeError:
            self.logger.warning(f"The search terms {name} {company} contain non-ascii characters.")

        search_result = self.__search_person(name, company)
        number_of_matches = len(search_result)

        # Returns a value if and only if there is one search result.
        if number_of_matches == 1:
            return search_result
        self.logger.error(f'Found {number_of_matches} matches for {name} working at {company}. Skipping.')
        return None

    def __search_person(self, name, company):
        '''Perform a LinkedIn people search for ``name`` and ``company``.

        Using the general ``keywords`` parameter was found more reliable
        than the more specific parameters offered (e.g. ``keyword_company``).
        '''
        # NOTE(review): with limit=1 the search presumably yields at most one
        # hit, so the handler's "exactly one match" check cannot detect
        # ambiguous names - confirm whether that is intended.
        return self.auth.search_people(keywords=' '.join([name, company]), limit=1)

    def get_profile_connections(self, urn_id):
        '''Return the connections of ``urn_id`` as seen by this authenticated profile.'''
        return self.auth.get_profile_connections(urn_id)

    def get_email(self, _public_id=None, _urn_id=None):
        '''Return the ``email_address`` entry of the target user's contact info.'''
        contact_info = self.__get_profile_contact_info(_public_id, _urn_id)
        return contact_info['email_address']

    def get_name(self, _public_id=None, _urn_id=None):
        '''Return "<firstName> <lastName>" of the given user.'''
        profile = self.__get_profile(_public_id, _urn_id)
        return profile['firstName'] + ' ' + profile['lastName']

    def __get_profile(self, _public_id=None, _urn_id=None):
        '''Fetch data for a given LinkedIn profile. Pass a public_id or a urn_id.'''
        return self.auth.get_profile(public_id=_public_id, urn_id=_urn_id)

    def __get_profile_contact_info(self, _public_id=None, _urn_id=None):
        '''Fetch contact information for a given LinkedIn profile. Pass a public_id or a urn_id.'''
        return self.auth.get_profile_contact_info(public_id=_public_id, urn_id=_urn_id)

In [4]:
def read_input(path, required_cols = ['Primary Contact', 'Company Name']):
    '''Load the input CSV and check that the required columns survived.

    The file is decoded as latin1 and every column that contains at least
    one missing value is dropped before the check.

    Args:
        path: Location of the CSV file.
        required_cols: Column names that must be present after cleaning.

    Returns:
        The cleaned DataFrame.

    Raises:
        AssertionError: If any required column is absent.
    '''
    cleaned = pd.read_csv(path, encoding='latin1').dropna(axis=1)
    absent = [col for col in required_cols if col not in cleaned.columns]
    assert not absent, f'Required columns of {required_cols} are not all present in the input.'
    return cleaned

def read_config(path):
    '''Parse the plain-text config file.

    Blank lines (including whitespace-only ones) and lines whose first
    non-blank character is ``#`` are ignored. Of the remaining lines the
    first is the input file path, the second is the output file path, and
    every further line is one credential entry that is split on whitespace
    into at most two parts (so the second part may itself contain spaces).

    Args:
        path: Location of the config file.

    Returns:
        ``(in_file_path, out_file_path, auths)`` where ``auths`` is a list
        of lists, one per credential line.

    Raises:
        ValueError: If the file does not contain both path lines.
    '''
    in_file_path = None
    out_file_path = None
    auths = []
    with open(path, 'r') as config:
        for raw_line in config:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if in_file_path is None:
                in_file_path = line
            elif out_file_path is None:
                out_file_path = line
            else:
                # maxsplit=1 tolerates repeated spaces between the two
                # fields and keeps spaces inside the second field.
                auths.append(line.split(None, 1))
    if in_file_path is None or out_file_path is None:
        raise ValueError(f'Config file {path} must contain an input path line and an output path line.')
    return in_file_path, out_file_path, auths
            

In [5]:
class Target_LinkedIn_User:
    '''Holds the data collected for one target LinkedIn user.

    Stores the contact name, company, urn id and e-mail given at
    construction, plus a set of urn ids of that user's connections.
    '''

    def __init__(self, contact_name, company, urn_id, email):
        self.contact_name = contact_name
        self.company = company
        self.urn_id = urn_id
        self.email = email
        # A set, so a connection reported more than once is stored once.
        self.connection_urns = set()

    def add_connection_urn(self, urn_id):
        '''Record ``urn_id`` as a connection of this user.'''
        self.connection_urns.add(urn_id)

    def to_df(self, name_lookup):
        '''Return a one-row DataFrame describing this user.

        Args:
            name_lookup: Mapping from urn id to display name; it must have
                an entry for every stored connection urn.

        Returns:
            A DataFrame with the columns 'Primary Contact', 'Company Name',
            'urn id text', 'linkedin url', 'email' and 'connection'. The
            'connection' cell holds a list of hyperlink formulas.
        '''
        arr = self._to_arr(name_lookup)
        # The flat list becomes a single column; transposing turns it into
        # a single row while keeping the connection list in one cell.
        df = pd.DataFrame(arr).T
        df.columns = ['Primary Contact', 'Company Name', 'urn id text', 'linkedin url', 'email', 'connection']
        return df

    def _to_arr(self, name_lookup):
        '''Return the row values in the column order used by ``to_df``.'''
        return [self.contact_name,
                self.company,
                self.urn_id,
                self._to_li_hyperlink(self.contact_name, self.urn_id),
                self.email,
                [(self._to_li_hyperlink(name_lookup[urn_id], urn_id)) for urn_id in self.connection_urns]
               ]

    def _to_li_hyperlink(self, display_text, urn_id):
        '''Build a spreadsheet HYPERLINK formula pointing at the profile.'''
        # A double quote inside a formula string literal must be doubled;
        # otherwise a name such as: John "Johnny" Doe ends the literal early
        # and the formula becomes invalid.
        safe_text = str(display_text).replace('"', '""')
        return f'=HYPERLINK("https://www.linkedin.com/in/{urn_id}/","{safe_text}")'

In [8]:
def main(config_path):
    '''The driver of the program. Takes the config file.

    Reads the config and the input CSV, authenticates every configured
    account, looks up each input row on LinkedIn, collects the connections
    each authenticated account reports for the matched profile, and writes
    the merged result to the configured output path as an Excel file.
    Progress and failures are appended to ``logfile.log``.

    Args:
        config_path: Path of the config file understood by ``read_config``.

    Raises:
        ValueError: If the config file lists no credentials.
    '''
    # Set up a logger.
    Log_Format = "%(levelname)s %(asctime)s - %(message)s"
    logging.basicConfig(filename = "logfile.log",
                        filemode = "a",
                        format = Log_Format,
                        level = logging.INFO)
    logger = logging.getLogger()

    # Read in and set inputs from the config file.
    input_file_path, out_file_path, auth_input = read_config(config_path)
    pitchbook_input_csv = read_input(input_file_path)
    logger.info(f'Read Input. {pitchbook_input_csv.shape[0]} rows.')
    auths = [LinkedIn_Requests(email, pwd, logger) for email, pwd in auth_input]
    if not auths:
        # The first account performs every search, so at least one is needed.
        logger.error('No credentials found in the config file.')
        raise ValueError('The config file must list at least one set of credentials.')
    logger.info('Authenticated all users.')

    # The name_lookup prevents repeated API calls when connections overlap
    # between target users.
    name_lookup = {}
    connection_profiles = []

    # Find the target users and their connections for all entries in the input.
    for _, row in tqdm(pitchbook_input_csv.iterrows(), total=pitchbook_input_csv.shape[0]):
        contact_name, company = row['Primary Contact'], row['Company Name']

        res = auths[0].get_person_profile_ids(contact_name, company)
        if res is None:
            continue
        try:
            target_urn, _public_id = res
            email = auths[0].get_email(_urn_id = target_urn)
            target_user = Target_LinkedIn_User(contact_name, company, target_urn, email)
            name_lookup[target_urn] = contact_name

            # For a target user, find their connections to the authenticated users.
            for auth in auths:
                # Always query with the target's own urn. The connection urn
                # below uses a separate name so it cannot overwrite the
                # target's urn for the next authenticated user.
                connections = auth.get_profile_connections(target_urn)
                logger.info(f'Authenticated user {auth.my_name} has {len(connections)} connections with {contact_name} working at {company}. Processing.')

                for conn in connections:
                    conn_urn = conn['urn_id']
                    target_user.add_connection_urn(conn_urn)

                    # Save the name of the connection.
                    if conn_urn not in name_lookup:
                        name_lookup[conn_urn] = auth.get_name(_urn_id = conn_urn)
            connection_profiles.append(target_user)
            logger.info(f'Completed {contact_name} working at {company}')
        except Exception as e:
            # Best effort per entry: record the failure (with traceback) and
            # carry on with the next row.
            logger.exception(f'Unknown failure: {e}')
    logger.info('Completed Calls to LinkedIn. Now Saving.')

    # pd.concat raises on an empty list, so stop cleanly if nothing matched.
    if not connection_profiles:
        logger.error('No target users were resolved. Nothing to save.')
        logger.info('EOP')
        return

    connections_df = pd.concat([conn.to_df(name_lookup) for conn in connection_profiles])
    merge_src_and_connections = pitchbook_input_csv.merge(connections_df, on=['Primary Contact', 'Company Name'])
    # One output row per connection hyperlink.
    merged = merge_src_and_connections.explode('connection')
    try:
        merged.to_excel(out_file_path, index=False)
        logger.info(f'Successfully saved to {out_file_path}')
    except Exception as e:
        logger.error(f'Save Failed. {e}')
    finally:
        logger.info('EOP')


In [9]:
# Entry point of the notebook: run the whole pipeline against the config file.
if __name__ == '__main__':
    # NOTE(review): absolute, machine-specific path - this cell only runs on a
    # machine where this file exists; consider making it configurable.
    config_file = r'/Users/nathanoasis/Downloads/config.csv'
    main(config_file)

hi


  0%|          | 0/2 [00:00<?, ?it/s]