<a href="https://colab.research.google.com/github/ArmandDS/jobs_recommendations/blob/master/job_analysis_content_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import datetime
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably so the `clean_text` import that follows can be resolved --
# confirm where that module actually lives).
nb_dir = os.path.dirname(os.getcwd())
already_importable = nb_dir in sys.path
if not already_importable:
    sys.path.append(nb_dir)
import clean_text

## Load the data

In [24]:
class preprocess_data():
    """Turn the scraped applicant CSV into a cleaned applicants table.

    The methods are independent steps that each take and return a
    DataFrame: load, add derived features, keep a subset of columns,
    concatenate the free-text columns, clean that text, and save.
    """

    # Columns concatenated by combine_text, in this order, separated by
    # single spaces.
    _TEXT_SOURCE_COLUMNS = (
        'job_jobTitle1',
        'job_location1',
        'job_description1',
        'job_jobTitle2',
        'job_description2',
        'allSkills',
        'school_degree1',
        'school_degreeSpec1',
    )

    def __init__(self, root_path, text_column_to_clean):
        """Store the configuration used by the other methods.

        Args:
            root_path: Directory that contains the ``data`` folder read by
                load_data and written by save_df_to_csv.
            text_column_to_clean: Name of the column that combine_text
                creates and clean_data cleans.
        """
        self.root_path = root_path
        self.text_column_to_clean = text_column_to_clean

    def load_data(self):
        """Read ``data/scrape_all_columns.csv`` under root_path.

        Returns:
            The CSV as a DataFrame with every missing value replaced by
            an empty string.
        """
        df = pd.read_csv(f"{self.root_path}/data/scrape_all_columns.csv")
        df = df.fillna('')
        return df

    @staticmethod
    def _graduation_year_text(date_range):
        """Return the stripped text after the first en dash, or '' if none."""
        _, dash, tail = str(date_range).partition('–')
        return tail.strip() if dash else ''

    @staticmethod
    def _years_since(year_text, current_year):
        """Return whole years from year_text to current_year, floored at 0.

        Returns None when year_text is not an integer (e.g. '' or 'Present').
        """
        try:
            year = int(year_text)
        except ValueError:
            return None
        return max(current_year - year, 0)

    def add_features(self, df):
        """Add ``graduation_year`` and ``years_of_experience`` columns.

        Years of experience is approximated as the number of years since
        graduation, where the graduation year is taken to be the text
        after the first en dash in ``school_dateRange1``.

        Each row gets its own value. (Assigning a scalar to the column
        inside a row loop would overwrite the whole column on every
        iteration and leave all rows with the last row's value.)

        Args:
            df: DataFrame with a ``school_dateRange1`` column. It is
                modified in place.

        Returns:
            The same DataFrame with two added columns:
            ``graduation_year`` holds the stripped text after the dash,
            or '' when the range has no dash; ``years_of_experience`` is
            a nullable integer that is 0 for graduation years in the
            future and missing when no integer year could be parsed.
        """
        current_year = datetime.datetime.now().year

        graduation_years = [
            self._graduation_year_text(value)
            for value in df['school_dateRange1']
        ]
        df['graduation_year'] = graduation_years

        # Nullable Int64 keeps real years as integers while still allowing
        # "unknown" for rows without a usable graduation year.
        df['years_of_experience'] = pd.array(
            [self._years_since(text, current_year)
             for text in graduation_years],
            dtype='Int64',
        )
        return df

    def select_columns(self, df, columns_to_keep: list):
        """Return a copy of df restricted to the requested columns.

        Names in columns_to_keep that are not present in df are ignored,
        and the DataFrame's own column order is preserved.

        Args:
            df: Source DataFrame (left unchanged).
            columns_to_keep: Column names to retain.

        Returns:
            A new, independent DataFrame. It is an explicit copy so that
            later column assignments (e.g. in combine_text) operate on
            their own data rather than on a selection of df.
        """
        wanted = set(columns_to_keep)
        kept = [column for column in df.columns if column in wanted]
        return df[kept].copy()

    def combine_text(self, df):
        """Concatenate the free-text columns into text_column_to_clean.

        Args:
            df: DataFrame containing every column listed in
                ``_TEXT_SOURCE_COLUMNS``. It is modified in place.

        Returns:
            The same DataFrame with the combined column added.
        """
        first, *rest = self._TEXT_SOURCE_COLUMNS
        combined = df[first]
        for column in rest:
            combined = combined + ' ' + df[column]
        df[self.text_column_to_clean] = combined
        return df

    def clean_data(self, df):
        """Apply ``clean_text.clean_txt`` to each combined-text value.

        Prints a progress message before starting.

        Args:
            df: DataFrame containing text_column_to_clean. It is
                modified in place.

        Returns:
            The same DataFrame with the cleaned column.
        """
        print('cleaning data')
        df[self.text_column_to_clean] = df[self.text_column_to_clean].apply(clean_text.clean_txt)
        return df

    def save_df_to_csv(self, df):
        """Write df to ``data/df_applicants.csv`` under root_path, without the index."""
        df.to_csv(f'{self.root_path}/data/df_applicants.csv', index=False)

In [14]:
# Project directory handed to preprocess_data; its data/ folder holds the
# input and output CSVs. NOTE(review): machine-specific absolute path.
root_path = "/Users/snelson/Dropbox/Pearl/job_matching_algorithm/jobs_recommendations-master"

# Columns passed to select_columns in the pipeline cell below.
columns_to_keep = [
    'fullName',
    'job_companyName1',
    'job_jobTitle1',
    'job_location1',
    'job_description1',
    'job_jobTitle2',
    'job_description2',
    'allSkills',
    'years_of_experience',
    'school_degree1',
    'school_degreeSpec1',
    'graduation_year',
]

In [25]:
# Run the whole preprocessing pipeline: load -> derive features -> keep the
# configured columns -> build the combined text column -> clean it -> save.
applicants = preprocess_data(
    text_column_to_clean='description_combined_cleaned',
    root_path=root_path,
)
applicants_df = applicants.add_features(applicants.load_data())
applicants_df = applicants.select_columns(applicants_df, columns_to_keep)
applicants_df = applicants.clean_data(applicants.combine_text(applicants_df))
applicants.save_df_to_csv(applicants_df)

cleaning data
