In [1]:
import numpy as np
import pandas as pd
import json
import gzip
import seaborn as sns
import os
import math
import string
%matplotlib inline

In [2]:
data_dir = '../Data/linkedin-crawled-profiles-dataset'
profiles_path = os.path.join(data_dir, 'linkedin.json')

num_profiles_loaded = 1000
profiles = []

for line in open(profiles_path, encoding = 'utf8'):
    if len(profiles) < num_profiles_loaded:
        line = json.loads(line)
        profiles.append(line)
    else:
        break

In [3]:
df = pd.DataFrame()
for i in profiles: 
    if "experience" and "education" and "skills" in i:
        df1 = pd.DataFrame.from_dict(i,orient='index')
        df1=df1.transpose()
        df=pd.concat([df,df1],axis = 0,ignore_index = True)
    

In [4]:
df.head()

Unnamed: 0,_id,also_view,education,events,experience,group,homepage,honors,industry,interests,interval,locality,name,overview_html,skills,specilities,summary,url
0,in-00000001,[{'url': 'http://www.linkedin.com/pub/krisa-dr...,,,,,,,Medical Devices,,,United States,"{'family_name': 'Mazalu MBA', 'given_name': 'D...",,"[Key Account Development, Strategic Planning, ...",,SALES MANAGEMENT / BUSINESS DEVELOPMENT / PROJ...,http://www.linkedin.com/in/00000001
1,in-00001,[{'url': 'http://www.linkedin.com/pub/peter-ki...,"[{'start': '2008', 'major': 'Economics', 'end'...","[{'from': 'Sahlgrenska University Hospital', '...","[{'org': 'Johnson and Johnson', 'title': 'Seni...","{'affilition': ['ASMALLWORLD.net', 'Biomarker ...",,,Pharmaceuticals,,20.0,"Antwerp Area, Belgium","{'family_name': 'Forslund', 'given_name': 'Ann'}","<dl id=""overview""><dt id=""overview-summary-cur...","[Molecular Biology, Biomarkers]","Biomarkers in Oncology, Cancer Genomics, Molec...",Ph.D. scientist with background in cancer rese...,http://be.linkedin.com/in/00001
2,in-00006,[{'url': 'http://www.linkedin.com/pub/george-c...,"[{'major': 'Biophysics', 'end': '2009', 'name'...",[{'from': 'Wyss Institute for Biologically Ins...,"[{'org': 'UCSF', 'title': 'Assistant Professor...",,"{'BIOMOD': ['http://biomod.net/'], 'Company We...",,Research,"personal genomics, nanotechnology",0.0,"San Francisco, California","{'family_name': 'Douglas', 'given_name': 'Shawn'}","<dl id=""overview""><dt id=""overview-summary-cur...","[DNA, Nanotechnology, Molecular Biology, Softw...",,I am interested in inventing new methods to co...,http://www.linkedin.com/in/00006
3,in-000vijaychauhan,[{'url': 'http://in.linkedin.com/in/sandeeprag...,"[{'start': '1988', 'end': '1989', 'name': 'Eco...",,,{'member': 'Member of Project Management Insti...,,,Aviation & Aerospace,"Literature, Philosophy, Music",,"Chennai Area, India","{'family_name': 'Chauhan, PMP', 'given_name': ...",,"[Program Management, French, Avionics, Embedde...",,"Experience in Avionics Systems, Embedded Syste...",http://in.linkedin.com/in/000vijaychauhan
4,in-001adambutler,[{'url': 'http://uk.linkedin.com/in/paulbeier'...,"[{'major': 'Product Design', 'end': '1994', 'n...","[{'from': 'Tigerprint', 'to': 'WHSmith', 'titl...","[{'org': 'Brand New Music', 'title': 'Founding...",,,,Marketing and Advertising,"travelling,the sea,trying new things, trying t...",16.0,"Brighton, United Kingdom","{'family_name': 'Adam', 'given_name': 'Butler,'}",,"[Digital Strategy, Integrated Marketing, Digit...","A passion for Brands, coupled with experience ...",Integrating creative design with commercial ma...,http://uk.linkedin.com/in/001adambutler


In [5]:
# Extract information about major, highest degree and institution from education column
def add_edu_info(df):
    df2 = df[df['education'].notnull()].copy()

    major = []
    institution = []
    degree = []
    
    for profile in df2['education']:
        personal_major = [] # majors for a particular profile
        personal_institution = []
        personal_degree = []
        for record in profile:
            if "major" in record: 
                personal_major.append(record['major'])
                         
            if "name" in record:
                personal_institution.append(record['name'])
            
            if "degree" in record:
                personal_degree.append(record['degree'])
                
        major.append(personal_major)
        institution.append(personal_institution)
        degree.append(personal_degree) 
        
    major = [float('nan') if x == [] else x for x in major] #replace empty list with NaN for consistency
    df2['major'] = major
    institution = [float('nan') if x == [] else x for x in institution]
    df2['institution'] = institution
    degree = [float('nan') if x == [] else x[0] for x in degree] # Assuming the most recent degree is the highest degree obtained
    df2['degree'] = degree
    
    df = pd.merge(df, df2[['_id', 'major', 'institution', 'degree']], on=['_id'], how = 'left')
    
    return df

In [6]:
print('Original df shape: ', df.shape)

df = add_edu_info(df)
print('Add majors and instituion', df.shape)

Original df shape:  (703, 18)
Add majors and instituion (703, 21)


In [7]:
df['degree'].value_counts().loc[lambda x : x>1] 

MBA                                           38
BA                                            17
Master                                        15
BS                                            13
MS                                            10
Bachelor of Science                            9
PhD                                            8
Bachelor's degree                              7
Bachelor                                       6
B.E.                                           5
B.A.                                           5
B.S.                                           5
Bachelors                                      4
Ph.D.                                          4
Certificate                                    4
Master of Business Administration (MBA)        4
MSc                                            3
Executive MBA                                  3
Bachelor of Science (BSc)                      3
Bachelor of Science (BS)                       3
Master's degree     

In [8]:
def remove_punctuation(s):
    translator = str.maketrans('', '', string.punctuation)
    
    return s.translate(translator)

In [9]:
def clean_degree(df):
    lst = []
    
    for degree in df['degree']:
        if type(degree) == float:
            lst.append(degree) # Keep NaN values without further processing
        else:
            degree = remove_punctuation(degree.upper())
            
            if 'PHD' in degree:
                degree = 'PHD'
                
            if "BUSINESS ADMINISTRATION" in degree or 'MBA' in degree:
                degree = 'MBA'
            
            if "BACHELOR OF SCIENCE" in degree or "BSC" in degree:
                degree = 'BS'
            
            if "MASTER OF SCIENCE" in degree or 'MS' in degree or 'MSC' in degree:
                degree = 'MS'
            
            if degree == "BACHELORS DEGREE" or degree == 'BACHELORS':
                degree = 'BACHELOR'
                
            if degree == "MASTERS DEGREE" or degree == 'MASTERS':
                degree = 'MASTER'
                
            lst.append(degree)
    
    df['degree'] = lst
    
    return df

In [14]:
clean_degree(df)['degree'].value_counts().loc[lambda x : x>1] 

MBA                   61
BS                    47
MS                    39
MASTER                23
BA                    23
PHD                   18
BACHELOR              18
BE                     8
MA                     7
CERTIFICATE            4
BA HONS                3
BBA                    3
PGDM                   3
BTECH                  2
BUSINESS               2
ASSOCIATE              2
NONE                   2
COMPUTER SCIENCE       2
MHRM                   2
PGCE                   2
PGDBM                  2
COMPUTER ANIMATION     2
BCS                    2
Name: degree, dtype: int64

In [15]:
df.head()

Unnamed: 0,_id,also_view,education,events,experience,group,homepage,honors,industry,interests,...,locality,name,overview_html,skills,specilities,summary,url,major,institution,degree
0,in-00000001,[{'url': 'http://www.linkedin.com/pub/krisa-dr...,,,,,,,Medical Devices,,...,United States,"{'family_name': 'Mazalu MBA', 'given_name': 'D...",,"[Key Account Development, Strategic Planning, ...",,SALES MANAGEMENT / BUSINESS DEVELOPMENT / PROJ...,http://www.linkedin.com/in/00000001,,,
1,in-00001,[{'url': 'http://www.linkedin.com/pub/peter-ki...,"[{'start': '2008', 'major': 'Economics', 'end'...","[{'from': 'Sahlgrenska University Hospital', '...","[{'org': 'Johnson and Johnson', 'title': 'Seni...","{'affilition': ['ASMALLWORLD.net', 'Biomarker ...",,,Pharmaceuticals,,...,"Antwerp Area, Belgium","{'family_name': 'Forslund', 'given_name': 'Ann'}","<dl id=""overview""><dt id=""overview-summary-cur...","[Molecular Biology, Biomarkers]","Biomarkers in Oncology, Cancer Genomics, Molec...",Ph.D. scientist with background in cancer rese...,http://be.linkedin.com/in/00001,"[Economics, Cancer genomics, Biology, Medicine...",[Columbia University - Columbia Business Schoo...,PHD
2,in-00006,[{'url': 'http://www.linkedin.com/pub/george-c...,"[{'major': 'Biophysics', 'end': '2009', 'name'...",[{'from': 'Wyss Institute for Biologically Ins...,"[{'org': 'UCSF', 'title': 'Assistant Professor...",,"{'BIOMOD': ['http://biomod.net/'], 'Company We...",,Research,"personal genomics, nanotechnology",...,"San Francisco, California","{'family_name': 'Douglas', 'given_name': 'Shawn'}","<dl id=""overview""><dt id=""overview-summary-cur...","[DNA, Nanotechnology, Molecular Biology, Softw...",,I am interested in inventing new methods to co...,http://www.linkedin.com/in/00006,"[Biophysics, Computer Science]","[Harvard University, Yale University]",PHD
3,in-000vijaychauhan,[{'url': 'http://in.linkedin.com/in/sandeeprag...,"[{'start': '1988', 'end': '1989', 'name': 'Eco...",,,{'member': 'Member of Project Management Insti...,,,Aviation & Aerospace,"Literature, Philosophy, Music",...,"Chennai Area, India","{'family_name': 'Chauhan, PMP', 'given_name': ...",,"[Program Management, French, Avionics, Embedde...",,"Experience in Avionics Systems, Embedded Syste...",http://in.linkedin.com/in/000vijaychauhan,,[Ecole nationale supérieure de l'Aéronautique ...,
4,in-001adambutler,[{'url': 'http://uk.linkedin.com/in/paulbeier'...,"[{'major': 'Product Design', 'end': '1994', 'n...","[{'from': 'Tigerprint', 'to': 'WHSmith', 'titl...","[{'org': 'Brand New Music', 'title': 'Founding...",,,,Marketing and Advertising,"travelling,the sea,trying new things, trying t...",...,"Brighton, United Kingdom","{'family_name': 'Adam', 'given_name': 'Butler,'}",,"[Digital Strategy, Integrated Marketing, Digit...","A passion for Brands, coupled with experience ...",Integrating creative design with commercial ma...,http://uk.linkedin.com/in/001adambutler,[Product Design],[Bournemouth University],BA HONS
