In [10]:
# Colab-only setup: install logzero into the runtime before it is imported below.
!pip install logzero
import logzero         
from logzero import logger
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
# Download the NLTK 'punkt' package (the cell output shows it is skipped when already up to date).
nltk.download('punkt')
import re
from bs4 import BeautifulSoup
import gensim 
from gensim.models import Word2Vec  
from nltk.tokenize import sent_tokenize, word_tokenize
from google.colab import drive
# Mount Google Drive; the raw dataset is read from /content/drive in the next cell.
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Load the raw Indeed job postings from the mounted Drive folder.
JobData = pd.read_csv(
    "/content/drive/My Drive/JobDescriptionPrediction/Resources/Data/raw/indeed_job_dataset.csv",
    index_col=None,
)
# Preview the first two rows.
JobData.head(2)

Unnamed: 0.1,Unnamed: 0,Job_Title,Link,Queried_Salary,Job_Type,Skill,No_of_Skills,Company,No_of_Reviews,No_of_Stars,Date_Since_Posted,Description,Location,Company_Revenue,Company_Employees,Company_Industry,python,sql,machine learning,r,hadoop,tableau,sas,spark,java,Others,CA,NY,VA,TX,MA,IL,WA,MD,DC,NC,Other_states,Consulting and Business Services,Internet and Software,Banks and Financial Services,Health Care,Insurance,Other_industries
0,0,Data Scientist,https://www.indeed.com/rc/clk?jk=6a105f495c36a...,<80000,data_scientist,"['SAP', 'SQL']",2,Express Scripts,3301.0,3.3,1.0,"[<p><b>POSITION SUMMARY</b></p>, <p>\r\r\nThe ...",MO,More than $10B (USD),"10,000+",Health Care,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1,1,Data Scientist,https://www.indeed.com/rc/clk?jk=86afd561ea8c6...,<80000,data_scientist,"['Machine Learning', 'R', 'SAS', 'SQL', 'Python']",5,Money Mart Financial Services,,,15.0,"[<p><b>What do we need?</b></p>, <ul><li>\r\r\...",TX,,,,1,1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Turn the first column (a 0-based row counter in the raw file) into a 1-based JobID.
# Guarded so that re-running this cell does not shift the IDs a second time.
if 'JobID' not in JobData.columns:
  JobData = JobData.rename(columns={JobData.columns[0]: 'JobID'})
  JobData['JobID'] = JobData['JobID'] + 1

In [13]:
# Checking for missing values: number of nulls in each column.
JobData.isnull().sum()

JobID                                  0
Job_Title                              0
Link                                   0
Queried_Salary                         0
Job_Type                               0
Skill                                232
No_of_Skills                           0
Company                              104
No_of_Reviews                        962
No_of_Stars                          962
Date_Since_Posted                    104
Description                          302
Location                             252
Company_Revenue                     3698
Company_Employees                   2516
Company_Industry                    1889
python                                 0
sql                                    0
machine learning                       0
r                                      0
hadoop                                 0
tableau                                0
sas                                    0
spark                                  0
java            

# Column selection and text cleaning

In [14]:
# Keep a copy of the frame as it stands before any columns are dropped below.
JobDataCopy = JobData.copy()
# Deleting redundant columns

def delete_redundancy(col, df=None):
  """Return a copy of a frame with the given column(s) removed.

  Args:
    col: A single column label or a list of labels to drop.
    df: Frame to drop from. When omitted, the notebook-level ``JobData``
      frame is used, which keeps existing one-argument calls working.

  Returns:
    A new DataFrame without the listed columns; the input frame is not modified.

  Raises:
    KeyError: If any label in ``col`` is not a column of the frame.
  """
  if df is None:
    df = JobData  # fall back to the notebook global for backward compatibility
  return df.drop(columns=col)

# Drop columns that are not needed downstream. Company_Revenue and Company_Employees
# are removed because of their large number of missing values.
JobData = delete_redundancy(['Link', 'No_of_Skills', 'No_of_Reviews', 'No_of_Stars', 'Company_Revenue', 'Company_Employees'])

# Exclude the last columns: keep only the first 10 columns. The explicit copy makes the
# result an independent frame, so the column assignments in later cells do not act on a
# slice of the previous one.
JobData = JobData[JobData.columns[:10]].copy()

In [15]:
JobData.nunique()    # number of distinct values in each remaining column

JobID                5715
Job_Title            2314
Queried_Salary          6
Job_Type                3
Skill                4024
Company              2231
Date_Since_Posted      30
Description          4802
Location               51
Company_Industry       33
dtype: int64

In [16]:

# Cleaning the description text
def clean_description_text(text):
    """Convert one raw HTML job description into plain lower-case text.

    The steps run in a fixed order, and the order matters: newlines are turned
    into spaces before the regex runs (its ``.`` would not match a newline),
    while carriage returns are replaced only afterwards.

    1. Parse with BeautifulSoup (lxml) and keep only the text content.
    2. Replace forward slashes and newlines with spaces.
    3. Delete every match of ``x.[0-9]`` (an ``x``, any one character, a digit).
    4. Replace carriage returns with spaces and lower-case the result.

    NOTE(review): step 3 is labelled "special characters" in the original; it
    presumably targets leftover escapes such as ``xa0`` but also removes
    ordinary text like ``x 1`` -- confirm the intended pattern.

    Args:
        text: Description as a string (HTML markup allowed).

    Returns:
        The cleaned, lower-cased description string.
    """
    plain = BeautifulSoup(text, "lxml").get_text()
    plain = plain.replace('/', ' ').replace('\n', ' ')
    plain = re.sub(r'(x.[0-9])', '', plain)
    return plain.replace('\r', ' ').lower()

logger.info("-----Cleaning Job Description-----------")
# Clean the Description column only, instead of casting the whole frame to str and
# iterating row by row. Missing descriptions become an empty string rather than the
# literal text 'nan' that astype(str) would produce.
JobData['Description'] = JobData['Description'].fillna('').astype(str).map(clean_description_text)
# The format string needs an argument; without one a literal "%s" is logged.
logger.info("%s Cleaned Job Description ", len(JobData))

[I 210411 05:51:51 <ipython-input-16-cea299bc85a8>:13] -----Cleaning Job Description-----------
[I 210411 05:51:57 <ipython-input-16-cea299bc85a8>:15] %s Cleaned Job Description 


### Handling input features

In [17]:
#categorizing less frequent values into 'Other'
def bin_companyIndustry(df, column, threshold_pct=1, other_label='Other'):
  """Collapse rare categories of ``df[column]`` into a single catch-all label.

  A category is rare when its share of the non-null values is strictly below
  ``threshold_pct`` percent. Missing values are also set to ``other_label``.

  Args:
    df: Frame holding the column; the column is overwritten in place.
    column: Name of the categorical column to bin.
    threshold_pct: Percentage below which a category counts as rare (default 1).
    other_label: Replacement label for rare and missing values (default 'Other').

  Returns:
    The updated ``df[column]`` Series.
  """
  counts = df[column].value_counts()   # NaN is excluded from the counts
  share_pct = counts / counts.sum() * 100
  rare_categories = share_pct[share_pct.lt(threshold_pct)].index
  binned = df[column].mask(df[column].isin(rare_categories), other_label)
  df[column] = binned.fillna(other_label)
  return df[column]

# Fold industries that make up less than 1% of postings (and missing values) into 'Other'.
JobData['Company_Industry'] = bin_companyIndustry(JobData, 'Company_Industry')

In [18]:
def clean_cols(df):
  """Normalise the text columns of the job frame in place.

  - ``Job_Title``: drop parenthesised parts, expand the standalone abbreviation
    "Sr" / "Sr." to "Senior", lower-case, and replace every run of characters
    other than a-z / 0-9 with a single space.
  - ``Job_Type``: replace every run of characters other than a-z, 0-9 and
    comma with a single space.
  - ``Skill``: strip the list brackets and quotes from the stringified list and
    lower-case; missing values become an empty string instead of 'nan'.
  - ``Location``: fill missing values with the most frequent location (left
    untouched when the column has no non-null value).
  - ``Company_Industry``: lower-case.

  Every pattern is passed with an explicit ``regex=True`` so the result does
  not depend on the pandas default for ``Series.str.replace``.

  Args:
    df: Frame containing the five columns above; it is modified in place.

  Returns:
    The same ``df`` object, for convenience.
  """
  df['Job_Title'] = (
      df['Job_Title']
      .str.replace(r"\(.*\)", "", regex=True)
      # Word boundaries and an escaped dot: only the standalone abbreviation is
      # expanded, and the character after it (e.g. a space) is kept.
      .str.replace(r"\bSr\b\.?", "Senior", regex=True)
      .str.lower()
      .str.replace(r"[^a-z0-9]+", " ", regex=True)
  )
  df['Job_Type'] = df['Job_Type'].str.replace(r"[^a-z0-9,]+", " ", regex=True)
  df['Skill'] = (
      df['Skill']
      .fillna('')
      .astype(str)
      .str.replace(r"[\[\]']", "", regex=True)   # convert the list repr into plain text
      .str.lower()
  )
  location_modes = df['Location'].mode()
  if not location_modes.empty:   # mode() is empty when every value is missing
    df['Location'] = df['Location'].fillna(location_modes.iloc[0])

  df['Company_Industry'] = df['Company_Industry'].str.lower()
  return df

# Attach the log file before logging, so the message below is written to it as well.
# NOTE(review): logzero's backupCount is left at its default; confirm that maxBytes
# has any effect without it.
logzero.logfile("logdatacleaning.log", maxBytes=1_000_000)
clean_cols(JobData)
logger.info("Cleaned Job_Title, Job_Type, Skill, Location and Company_Industry columns")
# Persist the cleaned frame in both Excel and pickle form.
JobData.to_excel("cleaned_indeed_job_dataset.xlsx", index=False)
JobData.to_pickle('cleaned_indeed_job_dataset.pkl')

[I 210411 05:51:57 <ipython-input-18-3b6b6d53e7a8>:17] %s Cleaned Job Description 
