In [1]:
import numpy as np
import pandas as pd
import json
import gzip
import seaborn as sns
import os
%matplotlib inline

In [2]:
# Directory that holds the LinkedIn dump. The location can be overridden with
# the LINKEDIN_DATA_DIR environment variable so the notebook is not tied to a
# single machine; the default is the original hard-coded location.
data_dir = os.environ.get('LINKEDIN_DATA_DIR', '/Users/Christy/Desktop/242/Project/')
# One JSON document per line (see the parsing cell that reads this path).
profiles_path = os.path.join(data_dir, 'linkedin.json')
# Show what is available in the data directory.
print(os.listdir(data_dir))

['linkedin.edges', 'Lecture8-DataScience.pdf', 'secondary data.csv', 'linkedin.nodes', '.ipynb_checkpoints', 'Exploratory Data Analysis.ipynb', 'linkedin.json']


In [3]:
# Single pass over the profile dump, collecting every top-level key seen and
# the raw values of the attributes of interest (one list entry per occurrence,
# duplicates kept so they can be tallied afterwards).
fields = set()
localities = []
industries = []
skills = []
specialities = []
interests = []
occupations = []
companies = []
schools = []
majors = []

# `with` guarantees the handle is closed; the encoding is explicit so the
# result does not depend on the platform's locale default.
with open(profiles_path, encoding='utf-8') as profiles_file:
    for line in profiles_file:
        # A blank line (e.g. a trailing newline) is not a JSON document.
        if not line.strip():
            continue
        profile = json.loads(line)

        fields.update(profile.keys())

        # Missing locality/industry are recorded as '' so they stay countable.
        localities.append(profile.get('locality', ''))
        industries.append(profile.get('industry', ''))
        skills.extend(profile.get('skills', []))

        # 'specilities' (sic) is the key as spelled in the data set.
        if 'specilities' in profile:
            specialities.extend(s.strip() for s in profile['specilities'].split(','))
        if 'interests' in profile:
            interests.extend(s.strip() for s in profile['interests'].split(','))

        experience = profile.get('experience', [])
        occupations.extend(exp['title'] for exp in experience if 'title' in exp)
        companies.extend(exp['org'] for exp in experience if 'org' in exp)

        education = profile.get('education', [])
        schools.extend(edu['name'] for edu in education if 'name' in edu)
        majors.extend(edu['major'] for edu in education if 'major' in edu)

print('JSON fields', len(fields), fields)

JSON fields 18 {'skills', 'name', 'specilities', 'honors', 'summary', 'education', 'experience', 'locality', 'url', 'industry', 'overview_html', 'group', 'also_view', 'events', '_id', 'homepage', 'interval', 'interests'}


In [4]:
def _tally(values):
    """Return the occurrence count of each distinct entry in `values`."""
    return pd.Series(list(values)).value_counts()


# NOTE: each name is rebound from the raw collection to its tally, so running
# this cell a second time without re-running the parsing cell would tally the
# counts themselves.
skills = _tally(skills)
localities = _tally(localities)
industries = _tally(industries)
specialities = _tally(specialities)
interests = _tally(interests)
occupations = _tally(occupations)
companies = _tally(companies)
schools = _tally(schools)
majors = _tally(majors)

# Number of distinct values per attribute, one per line.
for tally in (skills, localities, industries, specialities, interests,
              occupations, companies, schools, majors):
    print(len(tally))

653628
24598
2646
861527
557665
3500534
3787170
864915
733611


In [31]:
# Load only the first `num_profiles_loaded` profiles (one JSON document per
# line) so they can be inspected without holding the whole dump in memory.
num_profiles_loaded = 1000
profiles = []

# `with` closes the handle even though the loop exits early.
with open(profiles_path, encoding='utf-8') as profiles_file:
    for line in profiles_file:
        if len(profiles) >= num_profiles_loaded:
            break
        profiles.append(json.loads(line))

print('Examples primary keys', [p['_id'] for p in profiles[:10]])

Examples primary keys ['in-00000001', 'in-00001', 'in-00006', 'in-000montgomery', 'in-000vijaychauhan', 'in-001adambutler', 'in-001monica', 'in-001neilpeacock', 'in-00666', 'in-00789123']


In [11]:
import csv

In [19]:
# Destination of the CSV export, kept next to the source data. Derived from
# `data_dir` (defined in the configuration cell) instead of repeating the
# absolute path.
csvfile = os.path.join(data_dir, '1000profiles.csv')

In [74]:
# Count the number of keys (columns) of every loaded profile, store it on the
# profile under 'numCol', and track the maximum.
# Recorded result: the maximum is 18.
maxNum = 0
for profile in profiles:
    # 'numCol' is bookkeeping added by this very cell; leaving it out of the
    # count keeps the result stable when the cell is run more than once.
    num = len([key for key in profile if key != 'numCol'])
    profile['numCol'] = num
    if num > maxNum:
        maxNum = num

In [79]:
# Pick the profile whose stored 'numCol' is largest and show its keys.
maxItem = max(profiles, key=lambda profile: profile['numCol'])
maxItem.keys()

dict_keys(['_id', 'interests', 'education', 'group', 'name', 'overview_html', 'locality', 'skills', 'industry', 'interval', 'experience', 'summary', 'honors', 'url', 'also_view', 'specilities', 'homepage', 'events', 'numCol'])

In [69]:
# Export the loaded profiles as one table: a single header row followed by one
# row per profile. Profiles do not all share the same keys, so the header is
# the union of all keys in first-seen order.
fieldnames = []
seen_keys = set()
for profile in profiles:
    for key in profile:
        if key not in seen_keys:
            seen_keys.add(key)
            fieldnames.append(key)

# newline='' is what the csv module expects for files it writes to; the
# explicit encoding keeps non-ASCII values independent of the locale default.
with open(csvfile, 'w', newline='', encoding='utf-8') as f:
    # restval fills the cells of keys a given profile does not have.
    w = csv.DictWriter(f, fieldnames=fieldnames, restval='')
    w.writeheader()
    w.writerows(profiles)

In [114]:
# Build a frame with one row per profile that has all of the required keys.
# Every key of the profile becomes a column; nested lists/dicts are stored as
# single cells.
# NOTE(review): the original note said "5 columns" but no column selection
# was implemented, so all keys are kept — confirm the intended subset.
required_keys = ('experience', 'education', 'skills')

# Each key is tested explicitly: `"a" and "b" and "c" in d` only tests "c".
complete_profiles = [
    profile for profile in profiles
    if all(key in profile for key in required_keys)
]
count = len(complete_profiles)

# Build the frame once from the list of records rather than appending row by
# row (the appended result was previously discarded, leaving `df` empty).
df = pd.DataFrame(complete_profiles)

In [104]:
# Frame built from the second loaded profile; its keys become the row index.
df2 = pd.DataFrame.from_dict(data=profiles[1], orient="index")

In [113]:
# Swap rows and columns so the profile keys become the column labels.
# NOTE: this rebinds df2, so running the cell again swaps the axes back.
df2 = df2.T
df2

Unnamed: 0,_id,education,group,name,overview_html,locality,skills,industry,interval,experience,summary,url,also_view,specilities,events,numCol
0,in-00001,"[{'start': '2008', 'major': 'Economics', 'end'...","{'affilition': ['ASMALLWORLD.net', 'Biomarker ...","{'family_name': 'Forslund', 'given_name': 'Ann'}","<dl id=""overview""><dt id=""overview-summary-cur...","Antwerp Area, Belgium","[Molecular Biology, Biomarkers]",Pharmaceuticals,20,"[{'org': 'Johnson and Johnson', 'title': 'Seni...",Ph.D. scientist with background in cancer rese...,http://be.linkedin.com/in/00001,[{'url': 'http://www.linkedin.com/pub/peter-ki...,"Biomarkers in Oncology, Cancer Genomics, Molec...","[{'from': 'Sahlgrenska University Hospital', '...",15


In [7]:
# Rank table for skills that occur at least 10 times.
# to_frame(name=...) labels the column regardless of the Series' own name;
# pd.DataFrame(series, columns=['Count']) silently yields an empty column
# when the Series is named something else (value_counts() names its result
# 'count' in pandas >= 2.0).
skills_count = skills[skills >= 10].to_frame(name='Count')
skills_count['Skill'] = skills_count.index
# `skills` comes from value_counts(), which sorts by descending count by
# default, so the row position is the rank.
skills_count['Rank'] = range(1, len(skills_count) + 1)
# Running share of all counted occurrences covered up to each rank.
skills_count['Cumulative Probability'] = skills_count['Count'].cumsum() / skills_count['Count'].sum()

print('Number of skills', len(skills_count))
skills_count

Number of skills 33489


Unnamed: 0,Count,Skill,Rank,Cumulative Probability
Microsoft Office,149914,Microsoft Office,1,0.009359
Strategic Planning,123673,Strategic Planning,2,0.017080
Project Management,115735,Project Management,3,0.024305
Business Strategy,104696,Business Strategy,4,0.030841
Marketing Strategy,96228,Marketing Strategy,5,0.036848
Negotiation,92935,Negotiation,6,0.042650
New Business Development,91805,New Business Development,7,0.048381
Change Management,91090,Change Management,8,0.054067
Sales Management,83066,Sales Management,9,0.059253
Social Media,81731,Social Media,10,0.064355


In [8]:
# Rank table for industries that occur at least 10 times. The empty string
# stands for profiles without an industry and is ranked like any other value.
# to_frame(name=...) labels the column regardless of the Series' own name;
# pd.DataFrame(series, columns=['Count']) silently yields an empty column
# when the Series is named something else (value_counts() names its result
# 'count' in pandas >= 2.0).
industries_count = industries[industries >= 10].to_frame(name='Count')
industries_count['Industry'] = industries_count.index
# `industries` comes from value_counts(), which sorts by descending count by
# default, so the row position is the rank.
industries_count['Rank'] = range(1, len(industries_count) + 1)
# Running share of all counted occurrences covered up to each rank.
industries_count['Cumulative Probability'] = industries_count['Count'].cumsum() / industries_count['Count'].sum()

print('Number of industries', len(industries_count))
industries_count

Number of industries 2317


Unnamed: 0,Count,Industry,Rank,Cumulative Probability
Information Technology and Services,167289,Information Technology and Services,1,0.056065
Computer Software,90346,Computer Software,2,0.086343
Financial Services,89304,Financial Services,3,0.116272
,81615,,4,0.143624
Marketing and Advertising,74079,Marketing and Advertising,5,0.168451
Oil & Energy,64818,Oil & Energy,6,0.190173
Telecommunications,54424,Telecommunications,7,0.208413
Banking,47001,Banking,8,0.224165
Human Resources,44230,Human Resources,9,0.238988
Pharmaceuticals,43666,Pharmaceuticals,10,0.253622


In [9]:
# Rank table for job titles that occur at least 10 times.
# to_frame(name=...) labels the column regardless of the Series' own name;
# pd.DataFrame(series, columns=['Count']) silently yields an empty column
# when the Series is named something else (value_counts() names its result
# 'count' in pandas >= 2.0).
occupations_count = occupations[occupations >= 10].to_frame(name='Count')
occupations_count['Occupations'] = occupations_count.index
# `occupations` comes from value_counts(), which sorts by descending count by
# default, so the row position is the rank.
occupations_count['Rank'] = range(1, len(occupations_count) + 1)
# Running share of all counted occurrences covered up to each rank.
occupations_count['Cumulative Probability'] = occupations_count['Count'].cumsum() / occupations_count['Count'].sum()

print('Number of occupations', len(occupations_count))
occupations_count

Number of occupations 56823


Unnamed: 0,Count,Occupations,Rank,Cumulative Probability
Owner,75722,Owner,1,0.014272
Project Manager,68503,Project Manager,2,0.027182
Consultant,64052,Consultant,3,0.039255
Director,59652,Director,4,0.050497
Intern,55371,Intern,5,0.060933
Account Manager,40761,Account Manager,6,0.068616
Software Engineer,39704,Software Engineer,7,0.076099
Manager,36578,Manager,8,0.082993
Managing Director,35297,Managing Director,9,0.089645
General Manager,31269,General Manager,10,0.095538


In [10]:
# Rank table for employers that occur at least 10 times.
# to_frame(name=...) labels the column regardless of the Series' own name;
# pd.DataFrame(series, columns=['Count']) silently yields an empty column
# when the Series is named something else (value_counts() names its result
# 'count' in pandas >= 2.0).
companies_count = companies[companies >= 10].to_frame(name='Count')
companies_count['Companies'] = companies_count.index
# `companies` comes from value_counts(), which sorts by descending count by
# default, so the row position is the rank.
companies_count['Rank'] = range(1, len(companies_count) + 1)
# Running share of all counted occurrences covered up to each rank.
companies_count['Cumulative Probability'] = companies_count['Count'].cumsum() / companies_count['Count'].sum()

print('Number of companies', len(companies_count))
companies_count

Number of companies 87979


Unnamed: 0,Count,Companies,Rank,Cumulative Probability
IBM,21927,IBM,1,0.004894
Ernst & Young,16914,Ernst & Young,2,0.008670
Accenture,15755,Accenture,3,0.012186
Microsoft,12929,Microsoft,4,0.015072
Hewlett-Packard,11680,Hewlett-Packard,5,0.017679
Deloitte,11557,Deloitte,6,0.020259
KPMG,10411,KPMG,7,0.022582
Procter & Gamble,10222,Procter & Gamble,8,0.024864
Unilever,10155,Unilever,9,0.027131
PricewaterhouseCoopers,9190,PricewaterhouseCoopers,10,0.029182


In [11]:
# Rank table for schools that occur at least 10 times.
# to_frame(name=...) labels the column regardless of the Series' own name;
# pd.DataFrame(series, columns=['Count']) silently yields an empty column
# when the Series is named something else (value_counts() names its result
# 'count' in pandas >= 2.0).
schools_count = schools[schools >= 10].to_frame(name='Count')
schools_count['Schools'] = schools_count.index
# `schools` comes from value_counts(), which sorts by descending count by
# default, so the row position is the rank.
schools_count['Rank'] = range(1, len(schools_count) + 1)
# Running share of all counted occurrences covered up to each rank.
schools_count['Cumulative Probability'] = schools_count['Count'].cumsum() / schools_count['Count'].sum()

print('Number of schools', len(schools_count))
schools_count

Number of schools 26071


Unnamed: 0,Count,Schools,Rank,Cumulative Probability
Fundação Getúlio Vargas,7808,Fundação Getúlio Vargas,1,0.002708
University of Mumbai,7615,University of Mumbai,2,0.005350
Universidad Complutense de Madrid,7543,Universidad Complutense de Madrid,3,0.007966
Universidad de Buenos Aires,6802,Universidad de Buenos Aires,4,0.010325
University of Toronto,6650,University of Toronto,5,0.012632
University of Phoenix,6191,University of Phoenix,6,0.014779
Delhi University,6065,Delhi University,7,0.016883
"University of California, Berkeley",6017,"University of California, Berkeley",8,0.018970
University of Michigan,5859,University of Michigan,9,0.021002
The University of Texas at Austin,5854,The University of Texas at Austin,10,0.023033


In [12]:
# Rank table for majors that occur at least 10 times.
# to_frame(name=...) labels the column regardless of the Series' own name;
# pd.DataFrame(series, columns=['Count']) silently yields an empty column
# when the Series is named something else (value_counts() names its result
# 'count' in pandas >= 2.0).
majors_count = majors[majors >= 10].to_frame(name='Count')
majors_count['Majors'] = majors_count.index
# `majors` comes from value_counts(), which sorts by descending count by
# default, so the row position is the rank.
majors_count['Rank'] = range(1, len(majors_count) + 1)
# Running share of all counted occurrences covered up to each rank.
majors_count['Cumulative Probability'] = majors_count['Count'].cumsum() / majors_count['Count'].sum()

print('Number of majors', len(majors_count))
majors_count

Number of majors 16376


Unnamed: 0,Count,Majors,Rank,Cumulative Probability
Computer Science,60186,Computer Science,1,0.032916
Marketing,49475,Marketing,2,0.059974
Economics,43383,Economics,3,0.083701
Finance,33985,Finance,4,0.102287
Mechanical Engineering,33410,Mechanical Engineering,5,0.120559
Business Administration,32054,Business Administration,6,0.138090
Accounting,31414,Accounting,7,0.155270
Law,27550,Law,8,0.170338
Business,21372,Business,9,0.182026
Psychology,19873,Psychology,10,0.192895
