In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install langdetect
!pip install spacy
!pip install nltk
#python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import numpy as np
import spacy
import nltk
import string
import re

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from langdetect import detect

import warnings

# Fetch the NLTK resources this notebook relies on: the stopword list, the punkt
# sentence tokenizer models and the WordNet data used by the lemmatizer.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Load the job-description CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/WEB SCRAPING 101/job_description_only.csv')

In [None]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,job_description
0,Background Are you looking to build a career i...
1,nullResponsibilities SIRCLO Internship Program...
2,Gambaran Pekerjaan Melakukan analisa data terk...
3,Flash Coffee is one of Asia's fastest-growing ...
4,nullResponsibilities Complete ad hoc data requ...


In [None]:
#data.sample(2500).to_csv('/content/drive/MyDrive/WEB SCRAPING 101/job_description_only_sample_2500.csv', index=False)

In [None]:
#data_new.to_csv('/content/drive/MyDrive/WEB SCRAPING 101/job_description_only.csv', index=False)

### Remove Non-English Rows

In [None]:
def _is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Any failure inside ``detect`` (for example on empty or non-string input)
    is treated as "not English", so the row is dropped — the same best-effort
    behaviour as before, but without a bare ``except`` that would also swallow
    KeyboardInterrupt during this long-running step.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        return False


# Build a boolean mask in one pass and select the matching rows, instead of
# growing a DataFrame row-by-row with DataFrame.append (quadratic, and removed
# in pandas 2.0). The original index labels are kept, exactly as the
# append-based loop did.
# NOTE(review): langdetect documents its results as non-deterministic unless
# DetectorFactory.seed is set — consider seeding for reproducible filtering.
english_mask = data['job_description'].apply(_is_english)

# .copy() so that columns added later are written to an independent frame
# rather than a view of `data`.
job_desc = data[english_mask].copy()

In [None]:
# Renumber the surviving rows 0..n-1, discarding the index labels inherited from `data`.
job_desc = job_desc.reset_index(drop=True)

In [None]:
# Summarise row count, columns, dtypes and non-null counts after the language filter.
job_desc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18041 entries, 0 to 18040
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_description  18041 non-null  object
dtypes: object(1)
memory usage: 141.1+ KB


### Cleaning Text

In [None]:
def clean_text(raw):
    """Normalise one raw job-description string.

    Steps, in order:
      1. drop carriage returns and newlines,
      2. drop square and round brackets,
      3. drop every occurrence of the literal text ``null``,
      4. replace HTML tags with a single space,
      5. collapse runs of spaces and strip leading/trailing whitespace,
      6. tidy `` , `` into ``, `` and ``.,`` into ``.``.

    Parameters
    ----------
    raw : str or any
        The text to clean. Non-string values (e.g. NaN from a missing cell)
        are returned unchanged.

    Returns
    -------
    str or any
        The cleaned string, or ``raw`` itself when it is not a string.
    """
    # Explicit guard instead of a bare ``except``: missing values pass through,
    # while genuine programming errors are no longer silently hidden.
    if not isinstance(raw, str):
        return raw

    # NOTE(review): line breaks are removed without inserting a space, so words
    # on either side of a newline are joined — confirm this is intended.
    text = raw.replace('\r', '').replace('\n', '')

    # Brackets carry no information for the downstream tokenisation.
    for bracket in '[]()':
        text = text.replace(bracket, '')

    # Removes "null" wherever it occurs, including inside longer words.
    text = text.replace('null', '')

    # Non-greedy match so each tag is replaced individually by one space.
    text = re.sub(r'<.*?>', ' ', text)

    # Collapse repeated spaces, then trim both ends.
    text = re.sub(' +', ' ', text).strip()

    # Remove the stray space before a comma and the extra comma after a period.
    text = re.sub(' , ', ', ', text)
    text = text.replace('.,', '.')

    return text

In [None]:
# Apply clean_text to each description string. Calling DataFrame.apply(clean_text)
# instead hands the function whole columns (Series), which it cannot clean — the
# text then comes back unchanged (e.g. the "null" prefixes are still present).
job_desc = job_desc.assign(
    job_description=job_desc['job_description'].apply(clean_text)
)

In [None]:
# Inspect the frame after the cleaning step.
job_desc

Unnamed: 0,job_description
0,Background Are you looking to build a career i...
1,nullResponsibilities SIRCLO Internship Program...
2,Flash Coffee is one of Asia's fastest-growing ...
3,nullResponsibilities Complete ad hoc data requ...
4,This newly created opportunity to be our Inter...
...,...
18036,We are seeking an experienced software enginee...
18037,About The Company This is an organisation wher...
18038,Join The Team Redefining How The World Experie...
18039,Note: Partly's headquarters are in Christchurc...


### Tokenization

In [None]:
# Keep a lowercased copy of every description; word tokens are taken from it.
job_desc['lower_description'] = job_desc['job_description'].str.lower()

# The pattern \w+ keeps only runs of word characters, so punctuation never
# ends up in the word tokens.
tokenizer = RegexpTokenizer(r"\w+")
job_desc['word_tokenized'] = job_desc['lower_description'].apply(tokenizer.tokenize)

# Sentences are split from the original-case text, not the lowercased copy.
job_desc['sentence_tokenized'] = job_desc['job_description'].apply(sent_tokenize)

In [None]:
# Inspect the frame with the new lowercase, word-token and sentence-token columns.
job_desc

Unnamed: 0,job_description,lower_description,word_tokenized,sentence_tokenized
0,Background Are you looking to build a career i...,background are you looking to build a career i...,"[background, are, you, looking, to, build, a, ...",[Background Are you looking to build a career ...
1,nullResponsibilities SIRCLO Internship Program...,nullresponsibilities sirclo internship program...,"[nullresponsibilities, sirclo, internship, pro...",[nullResponsibilities SIRCLO Internship Progra...
2,Flash Coffee is one of Asia's fastest-growing ...,flash coffee is one of asia's fastest-growing ...,"[flash, coffee, is, one, of, asia, s, fastest,...",[Flash Coffee is one of Asia's fastest-growing...
3,nullResponsibilities Complete ad hoc data requ...,nullresponsibilities complete ad hoc data requ...,"[nullresponsibilities, complete, ad, hoc, data...",[nullResponsibilities Complete ad hoc data req...
4,This newly created opportunity to be our Inter...,this newly created opportunity to be our inter...,"[this, newly, created, opportunity, to, be, ou...",[This newly created opportunity to be our Inte...
...,...,...,...,...
18036,We are seeking an experienced software enginee...,we are seeking an experienced software enginee...,"[we, are, seeking, an, experienced, software, ...",[We are seeking an experienced software engine...
18037,About The Company This is an organisation wher...,about the company this is an organisation wher...,"[about, the, company, this, is, an, organisati...",[About The Company This is an organisation whe...
18038,Join The Team Redefining How The World Experie...,join the team redefining how the world experie...,"[join, the, team, redefining, how, the, world,...",[Join The Team Redefining How The World Experi...
18039,Note: Partly's headquarters are in Christchurc...,note: partly's headquarters are in christchurc...,"[note, partly, s, headquarters, are, in, chris...",[Note: Partly's headquarters are in Christchur...


### Remove Stopwords

In [None]:
# Lazily filled cache so the stopword corpus is read once, not once per row.
_STOP_WORDS_CACHE = {}


def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter. Membership is an exact string comparison, so tokens
        should already be normalised the same way as the stopword list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    stop_words = _STOP_WORDS_CACHE.get('english')
    if stop_words is None:
        # First call only: load the list and freeze it for O(1) lookups.
        stop_words = frozenset(stopwords.words('english'))
        _STOP_WORDS_CACHE['english'] = stop_words
    return [word for word in text if word not in stop_words]

In [None]:
# Filter each row's word-token list through remove_stopwords and store the result.
job_desc['clean_stopword'] = job_desc['word_tokenized'].apply(remove_stopwords)

### Lemmatization

In [None]:
def lemming(text):
    """Lemmatize every token in ``text`` with NLTK's WordNetLemmatizer.

    Each token is passed to ``lemmatize`` on its own, with no part-of-speech
    argument, so the lemmatizer's default setting applies.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    result = []
    for token in text:
        result.append(to_lemma(token))
    return result

In [None]:
# Lemmatize each row's stopword-filtered token list and store the result.
job_desc['clean_lemmed'] = job_desc.clean_stopword.apply(lemming)

In [None]:
# Preview the first rows with all derived columns.
job_desc.head()

Unnamed: 0,job_description,lower_description,word_tokenized,sentence_tokenized,clean_stopword,clean_lemmed
0,Background Are you looking to build a career i...,background are you looking to build a career i...,"[background, are, you, looking, to, build, a, ...",[Background Are you looking to build a career ...,"[background, looking, build, career, data, sci...","[background, looking, build, career, data, sci..."
1,nullResponsibilities SIRCLO Internship Program...,nullresponsibilities sirclo internship program...,"[nullresponsibilities, sirclo, internship, pro...",[nullResponsibilities SIRCLO Internship Progra...,"[nullresponsibilities, sirclo, internship, pro...","[nullresponsibilities, sirclo, internship, pro..."
2,Flash Coffee is one of Asia's fastest-growing ...,flash coffee is one of asia's fastest-growing ...,"[flash, coffee, is, one, of, asia, s, fastest,...",[Flash Coffee is one of Asia's fastest-growing...,"[flash, coffee, one, asia, fastest, growing, t...","[flash, coffee, one, asia, fastest, growing, t..."
3,nullResponsibilities Complete ad hoc data requ...,nullresponsibilities complete ad hoc data requ...,"[nullresponsibilities, complete, ad, hoc, data...",[nullResponsibilities Complete ad hoc data req...,"[nullresponsibilities, complete, ad, hoc, data...","[nullresponsibilities, complete, ad, hoc, data..."
4,This newly created opportunity to be our Inter...,this newly created opportunity to be our inter...,"[this, newly, created, opportunity, to, be, ou...",[This newly created opportunity to be our Inte...,"[newly, created, opportunity, intern, based, j...","[newly, created, opportunity, intern, based, j..."


In [None]:
# Persist the processed frame to Drive without the index column.
# NOTE(review): the token columns hold Python lists, which CSV stores as their string
# representation — they will need parsing when read back; confirm this is intended.
job_desc.to_csv('/content/drive/MyDrive/WEB SCRAPING 101/job_description_only_after_cleaning.csv', index=False)