In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
# Read the Dice.com job-listing sample into a DataFrame and preview its first rows.
DICE_JOBS_CSV = "/kaggle/input/us-technology-jobs-on-dicecom/dice_com-job_us_sample.csv"
dataset = pd.read_csv(DICE_JOBS_CSV)
dataset.head()

Questions I want to answer: 

1. Which languages are the most requested? 
2. Which job titles are the most listed? 
3. What are the top skills that are asked for?


Adding a state column using the joblocation_address. 

In [None]:
# Derive the state from the last ", "-separated piece of the address
# (e.g. "Atlanta, GA" -> "GA").
# The vectorised .str accessor replaces a per-row lambda that wrapped every value
# in its own pd.Series, which made apply() build a one-column DataFrame and was
# needlessly slow. rsplit(n=1) only splits off the final piece, which is all we need.
# Missing addresses stay missing instead of becoming the literal text 'nan'.
dataset['State'] = dataset['joblocation_address'].str.rsplit(", ", n=1).str[-1]
dataset.head()

What states have the most listings in this dataset?

In [None]:
# Show pandas' summary statistics for the dataset as a quick overview of its columns.
dataset.describe()

In [None]:
# Lookup table from two-letter postal abbreviation to the display name printed
# alongside the per-state results further down.
states = dict(
    AL="Alabama",
    AK="Alaska",
    AS="American Samoa",
    AZ="Arizona",
    AR="Arkansas",
    CA="California",
    CO="Colorado",
    CT="Connecticut",
    DE="Delaware",
    DC="District Of Columbia",
    FM="Federated States Of Micronesia",
    FL="Florida",
    GA="Georgia",
    GU="Guam",
    HI="Hawaii",
    ID="Idaho",
    IL="Illinois",
    IN="Indiana",
    IA="Iowa",
    KS="Kansas",
    KY="Kentucky",
    LA="Louisiana",
    ME="Maine",
    MH="Marshall Islands",
    MD="Maryland",
    MA="Massachusetts",
    MI="Michigan",
    MN="Minnesota",
    MS="Mississippi",
    MO="Missouri",
    MT="Montana",
    NE="Nebraska",
    NV="Nevada",
    NH="New Hampshire",
    NJ="New Jersey",
    NM="New Mexico",
    NY="New York",
    NC="North Carolina",
    ND="North Dakota",
    MP="Northern Mariana Islands",
    OH="Ohio",
    OK="Oklahoma",
    OR="Oregon",
    PW="Palau",
    PA="Pennsylvania",
    PR="Puerto Rico",
    RI="Rhode Island",
    SC="South Carolina",
    SD="South Dakota",
    TN="Tennessee",
    TX="Texas",
    UT="Utah",
    VT="Vermont",
    VI="Virgin Islands",
    VA="Virginia",
    WA="Washington",
    WV="West Virginia",
    WI="Wisconsin",
    WY="Wyoming",
)

In [None]:
# Count the listings (non-null job titles) for every known state.
# A single grouped pass replaces masking the whole DataFrame once per state.
jobTitleCountsByState = dataset.groupby('State')['jobtitle'].count()
# States with no listings are absent from the grouped result, so default them to 0.
# int() stores plain Python ints so the printed dictionary stays readable.
numberOfJobsPerState = {state: int(jobTitleCountsByState.get(state, 0)) for state in states}
print(numberOfJobsPerState)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Bar chart of listings per state. A wide figure and vertical tick labels keep the
# many state abbreviations legible; title and axis labels let the chart stand alone.
fig, ax = plt.subplots(figsize=(16, 5))
ax.bar(list(numberOfJobsPerState.keys()), list(numberOfJobsPerState.values()))
ax.set_xlabel("State")
ax.set_ylabel("Number of listings")
ax.set_title("Job listings per state")
ax.tick_params(axis="x", labelrotation=90)
plt.show()

Since many states don't have many job listings, we will drop the states with 200 or fewer listings.

In [None]:
# A state is kept only when its listing count is strictly greater than this threshold.
MIN_LISTINGS = 200
statesWithMoreThan200Listings = {
    state: listingCount
    for state, listingCount in numberOfJobsPerState.items()
    if listingCount > MIN_LISTINGS
}
print(statesWithMoreThan200Listings)

# Bar chart of the remaining states, labelled so it can be read on its own.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(list(statesWithMoreThan200Listings.keys()), list(statesWithMoreThan200Listings.values()))
ax.set_xlabel("State")
ax.set_ylabel("Number of listings")
ax.set_title("States with more than 200 job listings")
plt.show()

Let's look at California, because it has the most tech job listings in this dataset.

In [None]:
# California listings only, restricted to the text columns used below;
# rows missing any of those columns are dropped.
listingColumns = ['jobtitle', 'jobdescription', 'skills', 'State']
isCalifornia = dataset['State'] == 'CA'
ca_subset = dataset.loc[isCalifornia, listingColumns].dropna()
ca_subset.head()

In [None]:
import regex as re

In [None]:
# Languages to look for in each listing's skills text.
popularProgrammingLanguages = ['c', 'c#','c++', 'python', 'java', 'javascript', 'swift', 'objective-c','sql']
# One pre-compiled pattern per language; they are applied to lower-cased text.
regexPattern = {
    # Stand-alone "c": must not follow a word character or hyphen (so the tail of
    # "objective-c" is not counted) and must not be followed by a word character,
    # '+' or '#' (so "c++" and "c#" are not counted). Unlike a pattern that demands
    # a trailing separator, this also matches a "c" at the very end of the text.
    'c': re.compile(r'(?<![\w-])c(?![\w+#])'),
    'c#': re.compile(r'\bc#'),
    'python': re.compile(r'\bpython\b'),
    'c++': re.compile(r'\bc\+\+'),
    'java': re.compile(r'\bjava\b'),
    'javascript': re.compile(r'\bjavascript\b'),
    'swift': re.compile(r'\bswift\b'),
    'objective-c':re.compile(r'\bobjective-c\b'),
    'sql': re.compile(r'\bsql\b')
}

In [None]:
# Tally, per language, how many California listings mention it in their skills text.
# A listing can count towards several languages; listings matching none go under 'none'.
ld = dict.fromkeys(popularProgrammingLanguages, 0)
ld['none'] = 0

for skillsText in ca_subset['skills']:
    loweredSkills = skillsText.lower()
    matchedLanguages = [
        name for name, pattern in regexPattern.items() if pattern.search(loweredSkills)
    ]
    for name in matchedLanguages:
        ld[name] += 1
    if not matchedLanguages:
        ld['none'] += 1

print(ld)

In [None]:
# Count how often each lower-cased, whitespace-separated word occurs across the
# California job titles, then keep the ten most frequent (word, count) pairs.
wordDictionary = {}

for title in ca_subset['jobtitle']:
    for token in title.lower().split():
        wordDictionary[token] = wordDictionary.get(token, 0) + 1

byCountDescending = sorted(wordDictionary.items(), key=lambda pair: pair[1], reverse=True)
sortedWordDictionary = byCountDescending[:10]
print(sortedWordDictionary)

Here we can see that the most common words in job titles are 'engineer' and 'developer', followed by 'manager'.
We can create word clouds to visualize which words show up in the job title or skills section of a listing.

In [None]:
# The following code is taken from https://www.section.io/engineering-education/word-cloud/
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

class WordCloudGeneration:
    """Turn a collection of text snippets into a rendered word cloud."""

    def preprocessing(self, data):
        """Lower-case and join the snippets, then strip English stop words.

        data: iterable of strings.
        Returns a single space-separated string of the tokens that remain.
        """
        # lower-case every snippet and glue them together with spaces
        lowered_text = ' '.join(entry.lower() for entry in data)
        # English stop words as a set for fast membership checks
        english_stop_words = set(stopwords.words('english'))
        # tokenize the combined text and keep only the non-stop-word tokens
        kept_tokens = []
        for token in word_tokenize(lowered_text):
            if token in english_stop_words:
                continue
            kept_tokens.append(token)
        return ' '.join(kept_tokens)

    def create_word_cloud(self, final_data):
        """Generate a word cloud from the given text and display it with matplotlib."""
        # configure the cloud: canvas size, largest font and background colour
        cloud_builder = WordCloud(width=1600, height=800, max_font_size=200, background_color="black")
        cloud_image = cloud_builder.generate(final_data)
        # show the generated image without axes
        plt.figure(figsize=(12,10))
        plt.imshow(cloud_image)
        plt.axis("off")
        plt.show()

In [None]:
# Word cloud built from the California job titles.
print("This is a word cloud for job title")
wordcloud_generator = WordCloudGeneration()
input_text = ca_subset['jobtitle'].tolist()
clean_data = wordcloud_generator.preprocessing(input_text)
wordcloud_generator.create_word_cloud(clean_data)

In [None]:
# Word cloud built from the California skills text.
print("This is a word cloud for skills")
wordcloud_generator = WordCloudGeneration()
input_text = ca_subset['skills'].tolist()
clean_data = wordcloud_generator.preprocessing(input_text)
wordcloud_generator.create_word_cloud(clean_data)

You can see that Java and Python are skills that are widely requested in California.

In [None]:
# Word cloud built from the California job descriptions.
print("This is a word cloud for job description")
wordcloud_generator = WordCloudGeneration()
input_text = ca_subset['jobdescription'].tolist()
clean_data = wordcloud_generator.preprocessing(input_text)
wordcloud_generator.create_word_cloud(clean_data)

Now we compare the word clouds for each state to see if there is a difference.

In [None]:
# One cleaned subset per state that passed the listing-count filter, followed by
# a job-title word cloud for each of those states.
listingColumns = ['jobtitle', 'jobdescription', 'skills', 'State']
subsetsByState = {
    state: dataset.loc[dataset['State'] == state, listingColumns].dropna()
    for state in statesWithMoreThan200Listings
}
for state, subset in subsetsByState.items():
    print("This is a word cloud for job title for", states[state])
    wordcloud_generator = WordCloudGeneration()
    input_text = subset['jobtitle'].tolist()
    clean_data = wordcloud_generator.preprocessing(input_text)
    wordcloud_generator.create_word_cloud(clean_data)
    

We can do the same for the skills section. 

In [None]:
# One cleaned subset per state that passed the listing-count filter, followed by
# a skills word cloud for each of those states.
listingColumns = ['jobtitle', 'jobdescription', 'skills', 'State']
subsetsByState = {
    state: dataset.loc[dataset['State'] == state, listingColumns].dropna()
    for state in statesWithMoreThan200Listings
}
for state, subset in subsetsByState.items():
    print("This is a word cloud for skills for", states[state])
    wordcloud_generator = WordCloudGeneration()
    input_text = subset['skills'].tolist()
    clean_data = wordcloud_generator.preprocessing(input_text)
    wordcloud_generator.create_word_cloud(clean_data)
    

You can gain a lot of insight just from looking at and comparing these word clouds. For example you can see that management is a skill that is popular in many different states. Java is also popular across the board. Interestingly, security seems to be a priority in D.C.