# Connect to Google Drive

In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE: `google.colab` is provided by the Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Switch the working directory to the project folder on the mounted Drive,
# so that relative file names used later resolve inside it.
%cd /content/gdrive/MyDrive/Data Science

/content/gdrive/MyDrive/Data Science


# Import Packages

In [3]:
import pandas as pd
import numpy as np
import re

# Import Data

In [4]:
# Load the job postings; the file name is relative to the current working directory.
df = pd.read_csv('Cleaned_DS_jobs.csv')

In [5]:
# Display the loaded frame (the notebook renders a truncated first/last-rows view).
df

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,data scientist,154.0,Description\n\nThe Senior Data Scientist is re...,3.100000,Healthfirst,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,1,data scientist,154.0,"Secure our Nation, Ignite your Future\n\nJoin ...",4.200000,ManTech,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,2,data scientist,154.0,Overview\n\n\nAnalysis Group is one of the lar...,3.800000,Analysis Group,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,3,data scientist,154.0,JOB DESCRIPTION:\n\nDo you have a passion for ...,3.500000,INFICON,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,4,data scientist,154.0,Data Scientist\nAffinity Solutions / Marketing...,2.900000,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,667,data scientist,136.0,Summary\n\nWe’re looking for a data scientist ...,3.600000,TRANZACT,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,-1
668,668,data scientist,136.0,Job Description\nBecome a thought leader withi...,3.881833,JKGT,"San Francisco, CA",-1,-1,-1,-1,-1,-1,-1,-1
669,669,data scientist,136.0,Join a thriving company that is changing the w...,3.881833,AccessHope,"Irwindale, CA",-1,-1,-1,-1,-1,-1,-1,-1
670,670,data scientist,136.0,100 Remote Opportunity As an AINLP Data Scient...,5.000000,ChaTeck Incorporated,"San Francisco, CA","Santa Clara, CA",1 to 50 employees,-1,Company - Private,Advertising & Marketing,Business Services,$1 to $5 million (USD),-1


# Answers

## 1) The `Location` column is actually not clean yet, in the sense that some samples in this column have a different format from the rest. Please check this column and clean it! (Note: write a function that can automatically check and clean this column. You can use information from Google to help build your function.)

The hypothesis is that all values in the `Location` column have the format:

 `(city), (state with 2 uppercase characters)`

 For instance: `Chantilly, VA`.

 Now we want to check whether all values in this column follow this format. If some do not, we have to replace the values that deviate from it.

In [6]:
def check_location(x):
    """Return True if `x` follows the ``(city), (state)`` location format.

    The value is split on ``', '``. It is accepted only when at least one
    separator is present and the part right after the first separator is
    exactly two uppercase letters A-Z (e.g. ``'Chantilly, VA'``).

    Parameters
    ----------
    x : object
        A single value taken from the `Location` column.

    Returns
    -------
    bool
        True when the value matches the format, False otherwise. Non-string
        values (for example a NaN standing in for a missing location) return
        False instead of raising.
    """
    if not isinstance(x, str):
        # Non-strings have no .split(); treat them as "not in the format".
        return False

    parts = x.split(', ')
    if len(parts) == 1:
        # No ', ' separator at all, e.g. 'Remote' or 'Utah'.
        return False

    # Require two letters A-Z. Comparing against .upper() alone would also
    # accept digits and punctuation, because '12'.upper() == '12'.
    return re.fullmatch(r'[A-Z]{2}', parts[1]) is not None

In [7]:
# Boolean flag per row: True when the Location value matches the expected format.
location_check = df['Location'].map(check_location)

In [8]:
# Distinct Location values that failed the format check, in order of first appearance.
uncleaned_location = df.loc[~location_check, 'Location'].unique().tolist()
uncleaned_location

['Remote',
 'United States',
 'Utah',
 'New Jersey',
 'Texas',
 'Patuxent, Anne Arundel, MD',
 'California']

Yay! We found 7 distinct values that do not follow the format. Now we have to replace those values with 'others'.

In [9]:
# Overwrite every Location value listed in `uncleaned_location` with the placeholder 'others'.
df['Location'] = df['Location'].mask(df['Location'].isin(uncleaned_location), 'others')

To make sure, we can use the same `check_location` function to check whether the `Location` column is now clean. The only value that should still fail the check is the `'others'` placeholder.

In [10]:
# Re-run the format check on the cleaned column.
location_check_v2 = df['Location'].map(check_location)

In [11]:
# Distinct values that still fail the format check after cleaning.
df.loc[~location_check_v2, 'Location'].unique().tolist()

['others']

## 2) Notice that we haven’t worked with the `Job Description` column. Do you think we can get some information from this column? If yes, please list your ideas for cleaning this column and ways to extract meaningful information.


Yes, we can! The `Job Description` column contains useful information.

Here are several ideas to extract meaningful information from this column:
1. Compute the length of the job description and treat it as a new variable called `workload` (a rough proxy only)
2. Check the minimum education requirement using string matching, e.g. Master, Bachelor, PhD
3. Check whether Python skills are required, using string matching
4. And many more...

Here are several ideas to clean this column:
1. Remove non-alphanumeric characters
2. Lowercase all characters