<a href="https://colab.research.google.com/github/ArmandDS/jobs_recommendations/blob/master/job_analysis_content_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
# Raise pandas' display limits so frames of up to 1000 rows / 1000 columns
# are rendered in full; the width value is kept exactly as originally configured.
_DISPLAY_OPTIONS = {
    'display.max_rows': 1000,
    'display.max_columns': 1000,
    'display.width': -1,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')
import datetime

In [2]:
import os
import sys
# Put the parent of the current working directory on the import path
# (needed before the `import clean_text` that follows can resolve a module there).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
import clean_text


## Load and preprocess the data

In [7]:
class preprocess_data():
    """Load the combined job-postings CSV and build a cleaned text column from it.

    Intended call order: ``load_data`` -> ``select_columns`` ->
    ``clean_na_values`` -> ``combine_text`` -> ``clean_data`` ->
    ``save_df_to_csv``.

    Every transform returns a *new* DataFrame and leaves its argument
    untouched, so re-running a notebook cell always produces the same result.

    Parameters
    ----------
    root_path : str
        Directory that contains the ``data/`` folder read and written by
        ``load_data`` / ``save_df_to_csv``.
    text_column_to_clean : str
        Name of the column that ``combine_text`` creates and ``clean_data``
        normalises.
    """

    # Raw CSV column -> column name used by the rest of the pipeline
    # (insertion order is the output column order).
    _COLUMN_RENAMES = {
        'Job.ID': 'Job.ID',
        'Title': 'Title',
        'Position': 'Position',
        'Company': 'Company',
        'City': 'City',
        'Employment.Type': 'Empl_type',
        'Job.Description': 'Job_Description',
    }

    # Fallback location used when a posting of the given company has no City.
    # The CBS entry appears twice on purpose: with and without a trailing space.
    _CITY_BY_COMPANY = {
        'CHI Payment Systems': 'Illinois',
        'Academic Year In America': 'Stamford',
        'CBS Healthcare Services and Staffing ': 'Urbandale',
        'Driveline Retail': 'Coppell',
        'Educational Testing Services': 'New Jersey',
        'Genesis Health System': 'Davenport',
        'Home Instead Senior Care': 'Nebraska',
        'St. Francis Hospital': 'New York',
        'Volvo Group': 'Washington',
        'CBS Healthcare Services and Staffing': 'Urbandale',
    }

    # Columns concatenated (in this order, space separated) by combine_text.
    _TEXT_SOURCE_COLUMNS = (
        'Position', 'Company', 'City', 'Empl_type', 'Job_Description', 'Title',
    )

    def __init__(self, root_path, text_column_to_clean):
        self.root_path = root_path
        self.text_column_to_clean = text_column_to_clean

    def load_data(self):
        """Read ``<root_path>/data/Combined_Jobs_Final.csv`` into a DataFrame."""
        df = pd.read_csv(f"{self.root_path}/data/Combined_Jobs_Final.csv")
        return df

    def select_columns(self, df):
        """Return a new frame with only the pipeline's columns, renamed.

        Raises ``KeyError`` if any expected raw column is missing from ``df``.
        """
        # Indexing with a list yields a slice of the caller's frame; rename()
        # hands back an independent frame so later steps never write into it.
        return df[list(self._COLUMN_RENAMES)].rename(columns=self._COLUMN_RENAMES)

    def clean_na_values(self, df):
        """Return a copy of ``df`` with missing values filled in.

        * The company spelling 'Genesis Health Systems' is normalised to
          'Genesis Health System'.
        * A missing ``City`` is filled from the per-company fallback table;
          cities that are already present are kept as they are.
        * A missing ``Empl_type`` becomes 'Full-Time/Part-Time'.
        * Every remaining missing value becomes a single space.
        """
        df = df.copy()
        df['Company'] = df['Company'].replace(['Genesis Health Systems'], 'Genesis Health System')

        # Only fill the gaps: overwriting every row of a company would discard
        # valid city values for postings that already have one.
        fallback_city = df['Company'].map(self._CITY_BY_COMPANY)
        df['City'] = df['City'].fillna(fallback_city)

        df['Empl_type'] = df['Empl_type'].fillna('Full-Time/Part-Time')

        df = df.fillna(" ")
        return df

    def combine_text(self, df):
        """Return a copy of ``df`` with the combined text column added.

        The column named by ``text_column_to_clean`` holds Position, Company,
        City, Empl_type, Job_Description and Title joined by single spaces.
        Missing values contribute a single space and non-string values are
        stringified, so one bad cell cannot turn the whole text into NaN.
        """
        pieces = [df[col].fillna(" ").astype(str) for col in self._TEXT_SOURCE_COLUMNS]
        combined = pieces[0]
        for piece in pieces[1:]:
            combined = combined + " " + piece

        out = df.copy()
        out[self.text_column_to_clean] = combined
        return out

    def clean_data(self, df):
        """Return a copy of ``df`` with ``clean_text.clean_txt`` applied to the text column."""
        print('cleaning data')
        out = df.copy()
        out[self.text_column_to_clean] = out[self.text_column_to_clean].apply(clean_text.clean_txt)
        return out

    def save_df_to_csv(self, df):
        """Write ``df`` to ``<root_path>/data/df_jobs.csv``.

        The index is written as the first column (pandas' ``to_csv`` default).
        """
        df.to_csv(f'{self.root_path}/data/df_jobs.csv')

In [4]:
# Directory that contains the `data/` folder used by the preprocessing class.
# Set the JOBS_RECOMMENDATIONS_ROOT environment variable to run the notebook on
# another machine; the original local path remains the default.
root_path = os.environ.get(
    "JOBS_RECOMMENDATIONS_ROOT",
    "/Users/snelson/Dropbox/Pearl/job_matching_algorithm/jobs_recommendations-master",
)

In [8]:
# Run the preprocessing steps in order, then persist the resulting frame.
jobs = preprocess_data(text_column_to_clean='text', root_path=root_path)
df_jobs = (
    jobs.load_data()
    .pipe(jobs.select_columns)
    .pipe(jobs.clean_na_values)
    .pipe(jobs.combine_text)
    .pipe(jobs.clean_data)
)

jobs.save_df_to_csv(df_jobs)

cleaning data


In [9]:
# Preview the processed frame; for a frame this long the rendered output
# shows only the leading and trailing rows.
df_jobs

Unnamed: 0,Job.ID,Title,Position,Company,City,Empl_type,Job_Description,text
0,111,Server @ Tacolicious,Server,Tacolicious,Palo Alto,Part-Time,Tacolicious' first Palo Alto store just opened...,server tacolicious palo alto part time tacolic...
1,113,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,Part-Time,\r\n\r\nNew French Brasserie in S.F. Financia...,kitchen staff chef claude lane san francisco p...
2,117,Bartender @ Machka Restaurants Corp.,Bartender,Machka Restaurants Corp.,San Francisco,Part-Time,We are a popular Mediterranean wine bar and re...,bartender machka restaurants corp san francisc...
3,121,Server @ Teriyaki House,Server,Teriyaki House,Brisbane,Part-Time,● Serve food/drinks to customers in a profess...,server teriyaki house brisbane part time serve...
4,127,Kitchen Staff/Chef @ Rosa Mexicano - Sunset,Kitchen Staff/Chef,Rosa Mexicano - Sunset,Los Angeles,Part-Time,"Located at the heart of Hollywood, we are one ...",kitchen staff chef rosa mexicano sunset los an...
...,...,...,...,...,...,...,...,...
84085,82,Book Keeper @ National Japanese American Histo...,Book Keeper,National Japanese American Historical Society,San Francisco,Part-Time,NJAHS stands for National Japanese American Hi...,book keeper national japanese american histori...
84086,83,Kitchen Staff/Chef @ Emporio Rulli,Kitchen Staff/Chef,Emporio Rulli,Larkspur,Part-Time,Weekend Brunch Line Cook \r\n● Other shifts ma...,kitchen staff chef emporio rulli larkspur part...
84087,84,Driver @ Onigilly,Driver,Onigilly,San Francisco,Part-Time,ONIGILLY (Japanese rice ball wraps) seeks outg...,driver onigilly san francisco part time onigil...
84088,88,Line Cook @ Machka Restaurants Corp.,Line Cook,Machka Restaurants Corp.,San Francisco,Part-Time,We are a popular Mediterranean restaurant in F...,line cook machka restaurants corp san francisc...
