In [144]:
import pandas as pd
import numpy as np

In [145]:
df = pd.read_csv('Software_Engineer_US.csv')
df = df.rename(columns={'Unnamed: 0': 'id'})
pd.set_option('display.max_columns', 500)

In [146]:
# Clean the job listing summaries
df.summary = df.summary.str.lower()
df['summary'].replace('\n',' ',regex=True,inplace=True)
df['summary'].replace("•",' ',regex=True,inplace=True)
df['summary'].replace(',',' ',regex=True,inplace=True)
df['summary'].replace(':',' ',regex=True,inplace=True) 
df['summary'].replace('-',' ',regex=True,inplace=True)

# need \ escape character for regex
df['summary'].replace('\?',' ',regex=True,inplace=True) 
df['summary'].replace('\/',' ',regex=True,inplace=True) 
df['summary'].replace('\*',' ',regex=True,inplace=True) 

df.head()

Unnamed: 0,id,job_title,company,location,summary
0,0,Full Stack Software Engineer,Nine Summer,Connecticut,this position is on the product team responsib...
1,1,Software Engineer - Intern 2018,Intel,"Folsom, CA 95630",job description client computing group ccg is ...
2,2,Software Engineer,General Electric,"West Melbourne, FL",about us ge is the world's digital industrial...
3,3,Senior Software Engineer,Rebric,"Denver, CO 80202",rebric is looking for a senior software engine...
4,4,Software Engineer (Junior),SkyKick,"Seattle, WA",about skykick sky...


In [147]:
df.shape

(4649, 5)

In [154]:
# This is the master dictionary that controls the counts. 
# If you want any instance of the string then simply put it in quotes
# If you want to make sure that the string is only counting instances of that word 
# and not instances where it is part of another word then make sure to put spaces at the ends of the string
# For example, adding "java" will also count instances of the "java" found in the word in "javascript"
# include spaces to " java " in order to get instances of that word by itself.
# Each instance of a term will be totaled under its corresponding key.

word_dict = {
    'javascript': ['javascript'],
    'jquery': ['jquery'],
    'es6': ['es6', 'es2015'],
    'react': [' react ', 'reactjs', 'react.js'],
    'angular': [' angular ', 'angularjs', 'angular.js'],
    'node': [' node ', 'nodejs', 'node.js'],
    'mongodb': [' mongo ', 'mongodb'],
    'computer science': ['computer science', ' cs '],
    'information systems': ['information systems'],
    'java': [' java '],
    'python': ['python'],
    'django': ['django'],
    'php': ['php'],
    'wordpress': ['wordpress'],
#     'twig': ['twig'],
#     'drupal': ['drupal'],
    'ruby': ['ruby'],
    'c#': ['c#'],
    'c++': ['c++'],
    '.net': ['.net'],
    'html': [' html ', 'html5'],
    'css': [' css ', 'css3'],
    'less': [' less '],
    'sass': ['sass'],
    'bootstrap': ['bootstrap'],
    'front end': ['front end'], # I removed dashes so this should cover "front-end" as well.
    'back end': ['back end'],
    'agile': ['agile'],
    'scrum': ['scrum'],
    'ajax': ['ajax'],
    'rest': [' rest ', 'restful'],
    'api': ['api'],
    'git': ['git'],
    'github': ['github'],
    'sql': [' sql '],
    'mysql': ['mysql'],
    'nosql': ['nosql'],
    'aws': ['aws', 'amazon web services'],
    'postgres': ['postgres'],
    'devops': ['devops'],
    'mvc': ['mvc'],
    'testing': ['testing', 'tdd'],
    'quality assurance': ['quality assurance', 'qa'],
    'responsive': ['responsive'],
    'linux': ['linux'],
    'ui_ux': ['ui ux'],
    'data_structure': ['data structure'],
    'algorithm': ['algorithm'],
    'object oriented': ['object oriented'],
    'bachelors': [" bachelor's ", ' bachelor ', 'bachelors']
}

In [155]:
# Calculate word counts for individual words and topics

def get_mentions(word_dict):
    for key in word_dict:
        for value in word_dict[key]:
            for i, row in df.iterrows():
                if value in row.summary: 
                    df.loc[i, value]=1
                    
get_mentions(word_dict)
df.head()

Unnamed: 0,id,job_title,company,location,summary,javascript,jquery,es6,es2015,react,reactjs,react.js,angular,angularjs,angular.js,node,nodejs,node.js,mongo,mongodb,computer science,cs,information systems,java,python,django,php,wordpress,ruby,c#,c++,.net,html,html5,css,css3,less,sass,bootstrap,front end,back end,agile,scrum,ajax,rest,restful,api,git,github,sql,mysql,nosql,aws,amazon web services,postgres,mvc,testing,tdd,quality assurance,qa,responsive,linux,ui ux,data structure,algorithm,object oriented,bachelor's,bachelor,bachelors,devops
1,1,Software Engineer - Intern 2018,Intel,"Folsom, CA 95630",job description client computing group ccg is ...,,,,,,,,,,,,,,,,1.0,,,1.0,1.0,,,,,,1.0,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,1.0,,,,,1.0,,,
3,3,Senior Software Engineer,Rebric,"Denver, CO 80202",rebric is looking for a senior software engine...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,1.0
4,4,Software Engineer (Junior),SkyKick,"Seattle, WA",about skykick sky...,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,1.0,,,,,,,,,,,,,,,1.0,,,1.0,,,,,,1.0,,,,
5,5,"Full Stack Software Engineer- Cape Canaveral, ...",SpaceX,"Cape Canaveral, FL",. spacex was founded under the belief that a f...,1.0,,,,,,,,1.0,,,,,,,1.0,,1.0,,,,,,,1.0,,1.0,1.0,,1.0,,,,,1.0,,1.0,1.0,,,,1.0,,,1.0,,,,,,,1.0,,,,,,,,,,1.0,,,
6,6,Full Stack Software Engineer,Adobe,"New York, NY",we are looking for a talented full stack softw...,1.0,,1.0,,1.0,,,1.0,,,,,,,,,,,,,,1.0,,,,,1.0,1.0,,1.0,,1.0,1.0,,,,,,,,,1.0,1.0,1.0,,1.0,,1.0,,,,1.0,,,1.0,1.0,1.0,,,,1.0,,,,


In [156]:
# Filter out a certain column - ' java ' in this instance

#df = df[df[' java '].notnull()] # Removes null values
df = df[df[' java '] != 1] 

df.shape

(2728, 70)

In [157]:
# New DataFrame to hold individual word counts
counts_headers = dict(word_dict)

column_header_keys = {}

for key in counts_headers:
    column_header_keys[key] = 0
topics = pd.DataFrame(column_header_keys, index=[0])

topics.head()

Unnamed: 0,.net,agile,ajax,algorithm,angular,api,aws,bachelors,back end,bootstrap,c#,c++,computer science,css,data_structure,devops,django,es6,front end,git,github,html,information systems,java,javascript,jquery,less,linux,mongodb,mvc,mysql,node,nosql,object oriented,php,postgres,python,quality assurance,react,responsive,rest,ruby,sass,scrum,sql,testing,ui_ux,wordpress
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [158]:
# New DataFrame to hold total word counts - topics
total_counts_headers = dict(word_dict) # makes a copy instead of passing by reference

column_header_values = {}

for key in total_counts_headers:
    for value in total_counts_headers[key]:
        column_header_values[value] = 0
words = pd.DataFrame(column_header_values, index=[0])

words.head()

Unnamed: 0,angular,bachelor,bachelor's,cs,css,html,java,less,mongo,node,react,rest,sql,.net,agile,ajax,algorithm,amazon web services,angular.js,angularjs,api,aws,bachelors,back end,bootstrap,c#,c++,computer science,css3,data structure,devops,django,es2015,es6,front end,git,github,html5,information systems,javascript,jquery,linux,mongodb,mvc,mysql,node.js,nodejs,nosql,object oriented,php,postgres,python,qa,quality assurance,react.js,reactjs,responsive,restful,ruby,sass,scrum,tdd,testing,ui ux,wordpress
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [159]:
# total value_counts for each column

# Lazy hack to get rid of get_loc error
df[' java '][1] = 1
# print(df[' java '])

def get_totals(word_dict):
    for key in word_dict:
        total = 0
        for value in word_dict[key]:
            words[value] = df[value].value_counts()[1]
            total = total + df[value].value_counts()[1]
        topics[key] = total
        
get_totals(word_dict)

In [160]:
topics.head()

Unnamed: 0,.net,agile,ajax,algorithm,angular,api,aws,bachelors,back end,bootstrap,c#,c++,computer science,css,data_structure,devops,django,es6,front end,git,github,html,information systems,java,javascript,jquery,less,linux,mongodb,mvc,mysql,node,nosql,object oriented,php,postgres,python,quality assurance,react,responsive,rest,ruby,sass,scrum,sql,testing,ui_ux,wordpress
0,253,705,10,281,77,1189,492,483,41,20,654,929,1462,100,34,28,225,24,79,511,217,112,14,1,358,22,212,113,23,38,244,253,613,272,33,54,1138,44,283,22,664,249,11,33,478,978,9,1


In [161]:
words.head()

Unnamed: 0,angular,bachelor,bachelor's,cs,css,html,java,less,mongo,node,react,rest,sql,.net,agile,ajax,algorithm,amazon web services,angular.js,angularjs,api,aws,bachelors,back end,bootstrap,c#,c++,computer science,css3,data structure,devops,django,es2015,es6,front end,git,github,html5,information systems,javascript,jquery,linux,mongodb,mvc,mysql,node.js,nodejs,nosql,object oriented,php,postgres,python,qa,quality assurance,react.js,reactjs,responsive,restful,ruby,sass,scrum,tdd,testing,ui ux,wordpress
0,41,18,47,26,79,78,1,212,2,13,264,246,478,253,705,10,281,10,4,32,1189,482,418,41,20,654,929,1436,21,34,28,225,3,21,79,511,217,34,14,358,22,113,21,38,244,233,7,613,272,33,54,1138,35,9,6,13,22,418,249,11,33,13,965,9,1


In [162]:
words.to_csv('US_SE_words.csv')
topics.to_csv('US_SE_topics.csv')