In [111]:
import pandas as pd
import numpy as np

In [112]:
df = pd.read_csv('US_webdev.csv')
df = df.rename(columns={'Unnamed: 0': 'id'})
pd.set_option('display.max_columns', 500)

In [113]:
# Clean the job listing summaries
df.summary = df.summary.str.lower()
df['summary'].replace('\n',' ',regex=True,inplace=True)
df['summary'].replace("•",' ',regex=True,inplace=True)
df['summary'].replace(',',' ',regex=True,inplace=True)
df['summary'].replace(':',' ',regex=True,inplace=True) 
df['summary'].replace('-',' ',regex=True,inplace=True)

# need \ escape character for regex
df['summary'].replace('\?',' ',regex=True,inplace=True) 
df['summary'].replace('\/',' ',regex=True,inplace=True) 
df['summary'].replace('\*',' ',regex=True,inplace=True) 

df.head()

Unnamed: 0,id,job_title,company,location,summary
0,0,Full Stack Web Developer,TRAINOR Associates,Connecticut,we’re always looking for a full stack web deve...
1,1,Front End Web Developer,Innovative Systems,"Pittsburgh, PA 15220",innovative systems is seeking an exceptional f...
2,2,Front-end Web Developer,Modern Message,Remote,at modern message our focus is on building gre...
3,3,WordPress Web Developer - Remote,TrustYou,Remote,imagine a workplace which encourages you to ta...
4,4,Web Developer - CSS/HTML,Phantom,"Palo Alto, CA",we are looking for an outstanding web develope...


In [114]:
df.shape

(2579, 5)

In [115]:
# This is the master dictionary that controls the counts. 
# If you want any instance of the string then simply put it in quotes
# If you want to make sure that the string is only counting instances of that word 
# and not instances where it is part of another word then make sure to put spaces at the ends of the string
# For example, adding "java" will also count instances of the "java" found in the word in "javascript"
# include spaces to " java " in order to get instances of that word by itself.
# Each instance of a term will be totaled under its corresponding key.

word_dict = {
    'javascript': ['javascript'],
    'jquery': ['jquery'],
    'es6': ['es6', 'es2015'],
    'react': [' react ', 'reactjs', 'react.js'],
    'angular': [' angular ', 'angularjs', 'angular.js'],
    'node': [' node ', 'nodejs', 'node.js'],
    'mongodb': [' mongo ', 'mongodb'],
    'computer science': ['computer science', ' cs '],
    'information systems': ['information systems'],
    'java': [' java '],
    'python': ['python'],
    'django': ['django'],
    'php': ['php'],
    'wordpress': ['wordpress'],
    'twig': ['twig'],
    'drupal': ['drupal'],
    'ruby': ['ruby'],
    'c#': ['c#'],
    'c++': ['c++'],
    '.net': ['.net'],
    'html': [' html ', 'html5'],
    'css': [' css ', 'css3'],
    'less': [' less '],
    'sass': ['sass'],
    'bootstrap': ['bootstrap'],
    'front end': ['front end'], # I removed dashes so this should cover "front-end" as well.
    'back end': ['back end'],
    'agile': ['agile'],
    'scrum': ['scrum'],
    'ajax': ['ajax'],
    'rest': [' rest ', 'restful'],
    'api': ['api'],
    'git': ['git'],
    'github': ['github'],
    'sql': [' sql '],
    'mysql': ['mysql'],
    'nosql': ['nosql'],
    'aws': ['aws', 'amazon web services'],
    'postgres': ['postgres'],
    'devops': ['devops', 'dev ops'],
    'mvc': ['mvc', 'model view controller'],
    'testing': ['testing', 'tdd'],
    'quality assurance': ['quality assurance', 'qa'],
    'responsive': ['responsive'],
    'linux': ['linux'],
    'ui_ux': ['ui ux'],
    'data_structure': ['data structure'],
    'algorithm': ['algorithm'],
    'object oriented': ['object oriented'],
    'bachelors': [" bachelor's ", ' bachelor ', 'bachelors']
}

In [116]:
# Calculate word counts for individual words and topics

def get_mentions(word_dict):
    for key in word_dict:
        for value in word_dict[key]:
            for i, row in df.iterrows():
                if value in row.summary: 
                    df.loc[i, value]=1
                    
get_mentions(word_dict)
df.head()

Unnamed: 0,id,job_title,company,location,summary,javascript,jquery,es6,es2015,react,reactjs,react.js,angular,angularjs,angular.js,node,nodejs,node.js,mongo,mongodb,computer science,cs,information systems,java,python,django,php,wordpress,twig,drupal,ruby,c#,c++,.net,html,html5,css,css3,less,sass,bootstrap,front end,back end,agile,scrum,ajax,rest,restful,api,git,github,sql,mysql,nosql,aws,amazon web services,postgres,devops,dev ops,mvc,model view controller,testing,tdd,quality assurance,qa,responsive,linux,ui ux,data structure,algorithm,object oriented,bachelor's,bachelor,bachelors
0,0,Full Stack Web Developer,TRAINOR Associates,Connecticut,we’re always looking for a full stack web deve...,,,,,1.0,,,1.0,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,1.0,1.0,1.0,,,,,,1.0,1.0,,,,,,,,,,1.0,,,,1.0,,1.0,,,,,,,,
1,1,Front End Web Developer,Innovative Systems,"Pittsburgh, PA 15220",innovative systems is seeking an exceptional f...,1.0,1.0,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,1.0,1.0,,,,,1.0,1.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,
2,2,Front-end Web Developer,Modern Message,Remote,at modern message our focus is on building gre...,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,1.0,,1.0,,,,,,,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,
3,3,WordPress Web Developer - Remote,TrustYou,Remote,imagine a workplace which encourages you to ta...,1.0,1.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,1.0,,1.0,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,Web Developer - CSS/HTML,Phantom,"Palo Alto, CA",we are looking for an outstanding web develope...,1.0,,,,,,,,,,,,,,,,,,,1.0,,,,,,1.0,,,,1.0,,1.0,,,,,,1.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,1.0,,,


In [117]:
# Filter out a certain column - ' java ' in this instance

#df = df[df[' java '].notnull()] # Removes null values
df = df[df[' java '] != 1] 

df.shape

(2158, 74)

In [104]:
# New DataFrame to hold individual word counts
counts_headers = dict(word_dict)

column_header_keys = {}

for key in counts_headers:
    column_header_keys[key] = 0
topics = pd.DataFrame(column_header_keys, index=[0])

topics.head()

Unnamed: 0,.net,agile,ajax,algorithm,angular,api,aws,bachelors,back end,bootstrap,c#,c++,computer science,css,data_structure,devops,django,drupal,es6,front end,git,github,html,information systems,java,javascript,jquery,less,linux,mongodb,mvc,mysql,node,nosql,object oriented,php,postgres,python,quality assurance,react,responsive,rest,ruby,sass,scrum,sql,testing,twig,ui_ux,wordpress
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [105]:
# New DataFrame to hold total word counts - topics
total_counts_headers = dict(word_dict) # makes a copy instead of passing by reference

column_header_values = {}

for key in total_counts_headers:
    for value in total_counts_headers[key]:
        column_header_values[value] = 0
words = pd.DataFrame(column_header_values, index=[0])

words.head()

Unnamed: 0,angular,bachelor,bachelor's,cs,css,html,java,less,mongo,node,react,rest,sql,.net,agile,ajax,algorithm,amazon web services,angular.js,angularjs,api,aws,bachelors,back end,bootstrap,c#,c++,computer science,css3,data structure,dev ops,devops,django,drupal,es2015,es6,front end,git,github,html5,information systems,javascript,jquery,linux,model view controller,mongodb,mvc,mysql,node.js,nodejs,nosql,object oriented,php,postgres,python,qa,quality assurance,react.js,reactjs,responsive,restful,ruby,sass,scrum,tdd,testing,twig,ui ux,wordpress
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [107]:
# total value_counts for each column

# Lazy hack to get rid of get_loc error
df[' java '][1] = 1
# print(df[' java '])

def get_totals(word_dict):
    for key in word_dict:
        total = 0
        for value in word_dict[key]:
            words[value] = df[value].value_counts()[1]
            total = total + df[value].value_counts()[1]
        topics[key] = total
        
get_totals(word_dict)

In [118]:
words.head()

Unnamed: 0,angular,bachelor,bachelor's,cs,css,html,java,less,mongo,node,react,rest,sql,.net,agile,ajax,algorithm,amazon web services,angular.js,angularjs,api,aws,bachelors,back end,bootstrap,c#,c++,computer science,css3,data structure,dev ops,devops,django,drupal,es2015,es6,front end,git,github,html5,information systems,javascript,jquery,linux,model view controller,mongodb,mvc,mysql,node.js,nodejs,nosql,object oriented,php,postgres,python,qa,quality assurance,react.js,reactjs,responsive,restful,ruby,sass,scrum,tdd,testing,twig,ui ux,wordpress
0,64,10,349,7,886,993,1,50,2,17,79,50,418,556,411,638,9,5,6,61,740,492,17,88,91,370,6,1210,558,10,1,15,14,52,1,22,348,496,30,600,30,1422,624,50,2,8,364,361,49,165,15,200,442,13,54,23,18,5,19,151,178,35,81,194,9,478,149,41,234


In [119]:
topics.head()

Unnamed: 0,.net,agile,ajax,algorithm,angular,api,aws,bachelors,back end,bootstrap,c#,c++,computer science,css,data_structure,devops,django,drupal,es6,front end,git,github,html,information systems,java,javascript,jquery,less,linux,mongodb,mvc,mysql,node,nosql,object oriented,php,postgres,python,quality assurance,react,responsive,rest,ruby,sass,scrum,sql,testing,twig,ui_ux,wordpress
0,556,411,638,9,131,740,497,376,88,91,370,6,1217,1444,10,16,14,52,23,348,496,30,1593,30,1,1422,624,50,50,10,366,361,231,15,200,442,13,54,41,103,151,228,35,81,194,418,487,149,41,234


In [120]:
words.to_csv('US_webdev_words.csv')
topics.to_csv('US_webdev_totals.csv')

In [64]:
# Number of job listings by company
df.company.value_counts()

Johnson Controls                       294
Camris International                   294
Amazon.com                             288
Convention Data Services               151
SERCO INC.                             151
Bigelow Aerospace, LLC                 151
AT&T                                   151
Balluun Inc                            145
Dixon-Schwabl Advertising Inc          143
New Jersey City University             143
Apple                                   10
Booz Allen Hamilton                      5
ICF                                      5
Adobe                                    4
Evans & Chambers Technology              4
University of Arizona                    4
University of Wyoming                    4
Jacobs                                   3
Octo Consulting Group                    3
DataSync Technologies, Inc               3
ASRC Federal Holding Company             3
iboss                                    3
OmniUpdate, Inc.                         3
Workstate  