# Importing libraries and data frame

In [1]:
# Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# create data frame

df=pd.read_csv('alldata.csv')

In [3]:
df

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"
...,...,...,...,...,...
6959,Data Developer / Machine Learning Analyst,NetApp,Are you data-driven? We at NetApp believe in t...,574.0,"Sunnyvale, CA"
6960,Scientist I,"Pharmacyclics, an Abbvie Company",Pharmacyclics is committed to the development ...,26.0,"Sunnyvale, CA"
6961,Intern Scientist,Oath Inc,"Oath, a subsidiary of Verizon, is a values-led...",5.0,"Sunnyvale, CA"
6962,Senior Data & Applied Scientist,Microsoft,We are the Bing Core Relevance team responsibl...,4618.0,"Sunnyvale, CA"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6964 entries, 0 to 6963
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   position     6953 non-null   object 
 1   company      6953 non-null   object 
 2   description  6953 non-null   object 
 3   reviews      5326 non-null   float64
 4   location     6953 non-null   object 
dtypes: float64(1), object(4)
memory usage: 272.2+ KB


# Cleaning the dataframe

- deal with nulls (drop reviews column and clean null rows)
- split location column in 3 columns
- group job positions in 3 roles: data scientist, data analyst and data engineer
- create new columns for skills (boolean)
- create new columns for tech (boolean)
- create new columns for degrees and other trainings (boolean)


## Deal with nulls

In [5]:
# All columns have some nulls, but the 'reviews' one is the one with the highest number
# We drop off the reviews column as we understand that it brings no value to our analysis
# We see that the rest of nulls are all grouped on the same rows, that have all values equal to null
# We delete those rows
 
df.drop(['reviews'], axis = 1, inplace = True)
df[df.isnull().any(axis=1)]

Unnamed: 0,position,company,description,location
302,,,,
331,,,,
466,,,,
482,,,,
3337,,,,
5014,,,,
5059,,,,
5103,,,,
5114,,,,
6093,,,,


In [6]:
df.drop([302,331,466,482,3337,5014,5059,5103,5114,6093,6168], axis = 0, inplace = True)

In [7]:
df.columns

Index(['position', 'company', 'description', 'location'], dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6953 entries, 0 to 6963
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   position     6953 non-null   object
 1   company      6953 non-null   object
 2   description  6953 non-null   object
 3   location     6953 non-null   object
dtypes: object(4)
memory usage: 271.6+ KB


## Check if there are duplicated rows

In [9]:
print(df.duplicated())

0       False
1       False
2       False
3       False
4       False
        ...  
6959    False
6960    False
6961    False
6962    False
6963    False
Length: 6953, dtype: bool


In [10]:
print(df.duplicated().value_counts())  # There are no duplicates

False    6953
dtype: int64


## Split "Location" column 

In [11]:
# We split "location column" to be able to use its data properly and we drop the original column
# We also drop the postcode column as we have a lot of nulls and it is not interesting for our analysis

df[['city', 'state_postcode']] = df['location'].str.split(',', 1, expand=True)
df['state'] = df['state_postcode'].str.replace('\d+','', regex=True) #regex=True means the string is a regular expression, not a common string
# also works: df['state'] = df['state_postcode'].str.extract('([a-zA-Z ]+)', expand=False).str.strip()
df.drop(['location'], axis = 1, inplace = True)
df.drop(['state_postcode'], axis = 1, inplace = True)


In [12]:
df

Unnamed: 0,position,company,description,city,state
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,Atlanta,GA
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",Atlanta,GA
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",Atlanta,GA
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,Atlanta,GA
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,Atlanta,GA
...,...,...,...,...,...
6959,Data Developer / Machine Learning Analyst,NetApp,Are you data-driven? We at NetApp believe in t...,Sunnyvale,CA
6960,Scientist I,"Pharmacyclics, an Abbvie Company",Pharmacyclics is committed to the development ...,Sunnyvale,CA
6961,Intern Scientist,Oath Inc,"Oath, a subsidiary of Verizon, is a values-led...",Sunnyvale,CA
6962,Senior Data & Applied Scientist,Microsoft,We are the Bing Core Relevance team responsibl...,Sunnyvale,CA


## Group job positions in three roles

In [13]:
# The aim is to homogenize the job titles and be able to understand each of the 3 roles (data scientist, analyst and engineer)

df['position'] = df['position'].str.lower()

In [14]:
def roles(position):
    if 'scient' in position:
        return 'scientist'
    elif 'analy' in position:
        return 'analyst'
    elif 'engine' in position:
        return 'engineer'
    else:
        return 'other'

In [15]:
df['role'] = df['position'].apply(roles)
df

Unnamed: 0,position,company,description,city,state,role
0,development director,ALS TDI,Development Director\nALS Therapy Development ...,Atlanta,GA,other
1,an ostentatiously-excitable principal research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",Atlanta,GA,scientist
2,data scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",Atlanta,GA,scientist
3,data analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,Atlanta,GA,analyst
4,assistant professor -tt - signal processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,Atlanta,GA,other
...,...,...,...,...,...,...
6959,data developer / machine learning analyst,NetApp,Are you data-driven? We at NetApp believe in t...,Sunnyvale,CA,analyst
6960,scientist i,"Pharmacyclics, an Abbvie Company",Pharmacyclics is committed to the development ...,Sunnyvale,CA,scientist
6961,intern scientist,Oath Inc,"Oath, a subsidiary of Verizon, is a values-led...",Sunnyvale,CA,scientist
6962,senior data & applied scientist,Microsoft,We are the Bing Core Relevance team responsibl...,Sunnyvale,CA,scientist


# Detect tools needed for each role

List of tools:
- Tableau
- Looker
- Power BI
- Qlik
- Hadoop
- Oracle
- Data Studio
- Excel
- Google Cloud Platform
- Amazon Web Services
- Microsoft Azure

In [16]:
# we have created a list of tools that we want to look up in the description field

df['description'] = df['description'].str.lower() #lower case to avoid missing values

In [29]:
# One function per tool we want to detect and a command to add a new boolean column:
# We have found a more efficient and accurate way to do this, but we didn't have time to change all the manual work:
# e.g.: df['Deep_learning']=df['description'].str.contains('deep|learn', flags=re.IGNORECASE, regex=True)
# This way we put all code in one line and we are able to look for 2 or more words at the same time

def tableau(description):
    if 'tableau' in description:
        return True
    else:
         return False
df['tableau']=df['description'].apply(tableau)
def looker(description):
    if 'looker' in description:
        return True
    else:
         return False
df['looker']=df['description'].apply(looker)
def power(description):
    if 'power' in description:
        return True
    else:
         return False
df['power']=df['description'].apply(power)
def qlik(description):
    if 'qlik' in description:
        return True
    else:
         return False
df['qlik']=df['description'].apply(qlik)
def hadoop(description):
    if 'hadoop' in description:
        return True
    else:
         return False
df['hadoop']=df['description'].apply(hadoop)
def oracle(description):
    if 'oracle' in description:
        return True
    else:
         return False
df['oracle']=df['description'].apply(oracle)
def studio(description):
    if 'studio' in description:
        return True
    else:
         return False
df['studio']=df['description'].apply(studio)
def excel(description):
    if 'excel' in description:
        return True
    else:
         return False
df['excel']=df['description'].apply(excel)
def amazon(description):
    if 'amazon' in description:
        return True
    else:
         return False
df['amazon']=df['description'].apply(amazon)
def platform(description):
    if 'platform' in description:
        return True
    else:
         return False
df['platform']=df['description'].apply(platform)
def azure(description):
    if 'azure' in description:
        return True
    else:
         return False
df['azure']=df['description'].apply(azure)

df

Unnamed: 0,position,company,description,city,state,role,tableau,looker,power,qlik,hadoop,oracle,studio,excel,amazon,platform,azure
0,development director,ALS TDI,development director\nals therapy development ...,Atlanta,GA,other,False,False,False,False,False,False,False,True,False,False,False
1,an ostentatiously-excitable principal research...,The Hexagon Lavish,"job description\n\n""the road that leads to acc...",Atlanta,GA,scientist,False,False,False,False,False,False,False,True,False,False,False
2,data scientist,Xpert Staffing,"growing company located in the atlanta, ga are...",Atlanta,GA,scientist,True,False,False,True,True,True,False,False,False,False,False
3,data analyst,Operation HOPE,department: program operationsposition locatio...,Atlanta,GA,analyst,True,False,True,False,False,False,False,False,False,False,False
4,assistant professor -tt - signal processing & ...,Emory University,description\nthe emory university department o...,Atlanta,GA,other,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6959,data developer / machine learning analyst,NetApp,are you data-driven? we at netapp believe in t...,Sunnyvale,CA,analyst,True,True,True,False,True,True,False,True,False,False,False
6960,scientist i,"Pharmacyclics, an Abbvie Company",pharmacyclics is committed to the development ...,Sunnyvale,CA,scientist,False,False,True,False,False,False,False,True,False,False,False
6961,intern scientist,Oath Inc,"oath, a subsidiary of verizon, is a values-led...",Sunnyvale,CA,scientist,False,False,False,False,True,True,False,False,False,True,False
6962,senior data & applied scientist,Microsoft,we are the bing core relevance team responsibl...,Sunnyvale,CA,scientist,False,False,False,False,True,False,False,False,False,False,False


# Detect skills needed for each role

List of skills:
- SQL
- Python
- R
- Java
- Predictive Analytics
- Statistical Modeling
- Machine learning
- Deep learning
- NLP
- Scala

In [30]:
# We are now going to do the same as before but for our list of skills:

def sql(description):
    if 'sql' in description:
        return True
    else:
         return False
df['sql']=df['description'].apply(sql)
def python(description):
    if 'python' in description:
        return True
    else:
         return False
df['python']=df['description'].apply(python)
def r(description):
    if ' r ' in description:
        return True
    else:
         return False
df['r']=df['description'].apply(r)
def java(description):
    if 'java' in description:
        return True
    else:
         return False
df['java']=df['description'].apply(java)
def predictive(description):
    if 'predictive' in description:
        return True
    else:
         return False
df['predictive']=df['description'].apply(predictive)
def statistical(description):
    if 'statistical' in description:
        return True
    else:
         return False
df['statistical']=df['description'].apply(statistical)
def machine(description):
    if 'machine' in description:
        return True
    else:
         return False
df['machine']=df['description'].apply(machine)
def deep(description):
    if 'deep' in description:
        return True
    else:
         return False
df['deep']=df['description'].apply(deep)
def nlp(description):
    if 'nlp' in description:
        return True
    else:
         return False
df['nlp']=df['description'].apply(nlp)
def scala(description):
    if 'scala' in description:
        return True
    else:
         return False
df['scala']=df['description'].apply(scala)

df

Unnamed: 0,position,company,description,city,state,role,tableau,looker,power,qlik,...,sql,python,r,java,predictive,statistical,machine,deep,nlp,scala
0,development director,ALS TDI,development director\nals therapy development ...,Atlanta,GA,other,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,an ostentatiously-excitable principal research...,The Hexagon Lavish,"job description\n\n""the road that leads to acc...",Atlanta,GA,scientist,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
2,data scientist,Xpert Staffing,"growing company located in the atlanta, ga are...",Atlanta,GA,scientist,True,False,False,True,...,True,True,False,True,True,True,True,False,False,False
3,data analyst,Operation HOPE,department: program operationsposition locatio...,Atlanta,GA,analyst,True,False,True,False,...,True,True,False,False,False,True,False,False,False,False
4,assistant professor -tt - signal processing & ...,Emory University,description\nthe emory university department o...,Atlanta,GA,other,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6959,data developer / machine learning analyst,NetApp,are you data-driven? we at netapp believe in t...,Sunnyvale,CA,analyst,True,True,True,False,...,True,True,False,True,False,False,True,False,False,True
6960,scientist i,"Pharmacyclics, an Abbvie Company",pharmacyclics is committed to the development ...,Sunnyvale,CA,scientist,False,False,True,False,...,False,False,True,False,False,False,False,False,False,False
6961,intern scientist,Oath Inc,"oath, a subsidiary of verizon, is a values-led...",Sunnyvale,CA,scientist,False,False,False,False,...,True,True,False,True,False,False,True,True,False,False
6962,senior data & applied scientist,Microsoft,we are the bing core relevance team responsibl...,Sunnyvale,CA,scientist,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False


# Detect degrees required for each role

List of degrees:
- Business Analysis
- Computer Science
- Economics
- Data Science   
- Mathematics
- Engineering
- Statistics
- Bioinformatics

In [31]:
# And finally, same process to identify the required education:

def business(description):
    if 'business' in description:
        return True
    else:
         return False
df['business']=df['description'].apply(business)
def computer(description):
    if 'computer' in description:
        return True
    else:
         return False
df['computer']=df['description'].apply(computer)
def economics(description):
    if 'economics' in description:
        return True
    else:
         return False
df['economics']=df['description'].apply(economics)
def science(description):
    if 'science' in description:
        return True
    else:
         return False
df['science']=df['description'].apply(science)
def math(description):
    if 'math' in description:
        return True
    else:
         return False
df['math']=df['description'].apply(math)
def engineering(description):
    if 'engineering' in description:
        return True
    else:
         return False
df['engineering']=df['description'].apply(engineering)
def statistics(description):
    if 'statistics' in description:
        return True
    else:
         return False
df['statistics']=df['description'].apply(statistics)
def bioinformatics(description):
    if 'bioinformatics' in description:
        return True
    else:
         return False
df['bioinformatics']=df['description'].apply(bioinformatics)

df

Unnamed: 0,position,company,description,city,state,role,tableau,looker,power,qlik,...,nlp,scala,business,computer,economics,science,math,engineering,statistics,bioinformatics
0,development director,ALS TDI,development director\nals therapy development ...,Atlanta,GA,other,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False
1,an ostentatiously-excitable principal research...,The Hexagon Lavish,"job description\n\n""the road that leads to acc...",Atlanta,GA,scientist,False,False,False,False,...,False,True,False,True,False,True,True,True,True,False
2,data scientist,Xpert Staffing,"growing company located in the atlanta, ga are...",Atlanta,GA,scientist,True,False,False,True,...,False,False,True,True,True,True,False,False,False,False
3,data analyst,Operation HOPE,department: program operationsposition locatio...,Atlanta,GA,analyst,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,assistant professor -tt - signal processing & ...,Emory University,description\nthe emory university department o...,Atlanta,GA,other,False,False,False,False,...,False,False,False,True,False,True,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6959,data developer / machine learning analyst,NetApp,are you data-driven? we at netapp believe in t...,Sunnyvale,CA,analyst,True,True,True,False,...,False,True,True,False,False,True,False,True,False,False
6960,scientist i,"Pharmacyclics, an Abbvie Company",pharmacyclics is committed to the development ...,Sunnyvale,CA,scientist,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
6961,intern scientist,Oath Inc,"oath, a subsidiary of verizon, is a values-led...",Sunnyvale,CA,scientist,False,False,False,False,...,False,False,False,True,False,True,True,True,True,False
6962,senior data & applied scientist,Microsoft,we are the bing core relevance team responsibl...,Sunnyvale,CA,scientist,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [36]:
# Now that we have identified everything with booleans and we have prepared our data frame, it's time to export it and bring it to Tableau to gain more insights:

df.to_excel('/Users/rogerserret/Documents/GitHub/IronRoger/week2_classexs\df_jobs.xlsx', index = False)