In [38]:
import os

# The directory for top level folder.
# Defaults to the original location; set the RESUME_BUILDER_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
dir_ = os.environ.get(
    "RESUME_BUILDER_DATA_DIR",
    "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/data/",
)

# The sub-folder paths below are built by plain concatenation, so the
# top-level folder must end with a separator.
if not dir_.endswith("/"):
    dir_ += "/"

raw_data_dir = dir_ + 'raw/'
processed_data_dir = dir_ + 'Processed/'

In [39]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [40]:
####################### LOAD DATA
##################################################################
def load_data(url: str) -> pd.DataFrame:
    """
    Reads a dataset from disk and returns it as a dataframe

    Args:
      url (string) - location of the file; a path ending in "csv" is parsed
                     as CSV, any other path is parsed as line-delimited JSON

    Returns:
      df (pandas dataframe) - the loaded data; for CSV input every column
                              whose name contains "Unnamed" is removed
    """

    print("⏳ Loading Data")

    is_csv = url.endswith("csv")
    if not is_csv:
        # Anything that is not a CSV is read as JSON lines
        df = pd.read_json(url, lines=True)
    else:
        df = pd.read_csv(url)
        # Columns named "Unnamed..." are leftovers (e.g. a saved index column)
        leftover = [name for name in df.columns if "Unnamed" in name]
        df = df.drop(columns=leftover)

    print("✅ Done Loading")
    return df


# Full path of the processed CSV used as the notebook's input
df_dir = f"{processed_data_dir}final.csv"

df = load_data(df_dir)

⏳ Loading Data
✅ Done Loading


In [41]:
# Preview the first rows of the loaded dataframe (head() shows 5 by default)
df.head()

Unnamed: 0,jobtitle,company,job_role,joblocation,education,experience,skills,industry,payrate
0,Walkin Data Entry Operator (night Shift),MM Media Pvt Ltd,Fresher Keyskills English Typing Part Time Dat...,Bangalore,Postgraduate,0 - 1 yrs,ITES,"Media , Entertainment , Internet","1,50,000 - 2,25,000 PA"
1,Work Based Onhome Based Part Time,find live infotech,"Search Engine Marketing,SEM Specialist Keyskil...",Lucknow,MTech,0 - 0 yrs,Marketing,"Advertising , PR , MR , Event Management","1,50,000 - 2,50,000 PA 20000"
2,"Pl,sql Developer - SQL",Softtech Career Infosystem Pvt Ltd,Software Developer Keyskills SQL Loader PL SQL...,Hyderabad,MSc(Science)-Computers,4 - 8 yrs,IT Software - Application Programming,"IT-Software , Software Services",Not Disclosed by Recruiter
3,"Manager,ad,partner - Indirect Tax - CA",Onboard HRServices LLP,Taxation(Indirect) Manager Keyskills Service T...,Chennai,"MS,MSc(Science)",11 - 15 yrs,Accounts,"Banking , Financial Services , Broking",Not Disclosed by Recruiter
4,JAVA Technical Lead (6-8 yrs) -,Spire Technologies and Solutions Pvt Ltd,"Team Lead,Technical Lead Keyskills Javascript ...",Pune,"Diploma-Electronics,Telecommunication",6 - 8 yrs,IT Software - Application Programming,"IT-Software , Software Services",Not Disclosed by Recruiter


In [42]:
def remove_garbage(x):
    """
    Normalises one cell value into a clean, delimiter-safe string

    Args:
      x (any) - raw cell value; it is converted with str() before cleaning,
                so missing values become the text "nan" / "None"

    Returns:
      text (string) - the cleaned value
    """
    text = str(x)
    text = text.replace("/", ",")    # Slashes become commas
    text = text.replace("\xa0", "")  # Drop non-breaking spaces
    text = text.replace("_", " ")    # No "_" is left in a value, so it cannot collide with "XX_" style flags
    # Remove literal pipes. The previous regex pattern "|" was an empty
    # alternation that only matched empty strings, so pipes were never removed.
    text = text.replace("|", "")
    text = text.replace(".", "")     # Drop full stops
    return text
    
# Clean every column value by value; remove_garbage returns strings,
# so all columns hold text afterwards
for column_name in list(df.columns):
    cleaned_values = df[column_name].map(remove_garbage)
    df[column_name] = cleaned_values

In [43]:
# Build one context string per row: every column value is prefixed with a short
# flag (e.g. "JT_" for the job title) and the fields are joined with commas, so
# the start of each field can be located again by searching for its flag
flagged_columns = [
    ("JT_", "jobtitle"),
    ("CO_", "company"),
    ("JR_", "job_role"),
    ("JL_", "joblocation"),
    ("ED_", "education"),
    ("EX_", "experience"),
    ("SK_", "skills"),
    ("IN_", "industry"),
    ("PR_", "payrate"),
]

leading_flag, leading_column = flagged_columns[0]
df_final = leading_flag + df[leading_column]

# Every later field is appended as ",<flag><value>"
for flag, column in flagged_columns[1:]:
    df_final = df_final + ("," + flag) + df[column]

In [44]:
def find_index(row: int,df: pd.DataFrame):
    """
    Locates where each flagged field starts inside one context string

    Args:
      row (int) - key of the entry to inspect
      df (Dataframe) - collection holding the context strings; it is indexed
                       as df[row] and the result must be a string

    Returns:
      indexes (tuple(int,int,....)) - for each flag, in the order
      JT_, CO_, JR_, JL_, ED_, EX_, SK_, IN_, PR_, the position just after the
      first occurrence of that flag. A flag that is absent yields 2, because
      str.find returns -1 and the flag length (3) is still added.

    """
    flags = ("JT_", "CO_", "JR_", "JL_", "ED_", "EX_", "SK_", "IN_", "PR_")
    flag_length = 3  # Every flag is three characters long

    text = df[row]
    return tuple(text.find(flag) + flag_length for flag in flags)

In [47]:
# ################### Lookup dataframe of field start positions
### For every row of df_final it stores the position just after each flag,
### i.e. where the text of that field begins inside the context string

cols = [
    "title_index",
    "company_index",
    "job_role_index",
    "job_location_index",
    "education_index",
    "experience_index",
    "skills_index",
    "industry_index",
    "payrate_index",
]

# One list of nine positions per context, in the same order as cols
val_list = [
    list(find_index(row_number, df_final))
    for row_number in range(len(df_final))
]

index_df = pd.DataFrame(val_list, columns=cols)

index_df.head()

Unnamed: 0,title_index,company_index,job_role_index,job_location_index,education_index,experience_index,skills_index,industry_index,payrate_index
0,3,47,67,169,182,198,211,219,255
1,3,40,62,168,179,188,201,214,258
2,3,29,67,227,240,266,279,320,355
3,3,45,71,765,776,795,810,822,864
4,3,38,82,216,224,265,278,319,354


In [49]:
################ Build the question-answering records in the format supported by Simple Transformers
######### Each context (one job posting) gets 5 questions with a similar structure


def _qa(qid, question, answer_text, answer_start):
    """Build one question entry holding a single answer span of the context."""
    return {
        "id": str(qid),
        "is_impossible": False,
        "question": question,
        "answers": [
            {
                "text": answer_text,
                # Character offset of answer_text inside the context, as an integer
                "answer_start": int(answer_start),
            }
        ],
    }


json_objects = []  # One {"context", "qas"} record per job posting

step = 5  # Number of questions generated for each context

# Unique id of each question. Sized from the data (rows * 5) instead of a
# hard-coded 110000, so a dataset with more than 22,000 rows no longer runs
# out of ids.
question_id = list(range(1, len(df_final) * step + 1))

# A field ends where the next one's ",XX_" separator begins. The stored indexes
# point just AFTER a flag, so the separator starts 4 characters before them.
separator_length = 4

for position, (index, value) in enumerate(df_final.items()):

    counter = question_id[position * step:(position + 1) * step]  # The 5 ids of this context

    starts = index_df.loc[index]  # Start position of every field in this context
    title_start = starts["title_index"]
    company_start = starts["company_index"]
    role_start = starts["job_role_index"]
    location_start = starts["job_location_index"]
    experience_start = starts["experience_index"]
    skills_start = starts["skills_index"]
    industry_start = starts["industry_index"]
    payrate_start = starts["payrate_index"]

    # Each field runs from its own start up to the separator of the field that
    # directly follows it in the context string
    post_name = value[title_start:company_start - separator_length]
    company_name = value[company_start:role_start - separator_length]
    role = value[role_start:location_start - separator_length]
    experience_required = value[experience_start:skills_start - separator_length]
    # Skills are followed by the industry field, not by the pay rate
    skills_required = value[skills_start:industry_start - separator_length]
    pay_rate = value[payrate_start:]  # Last field: runs to the end of the context

    json_objects.append(
        {
            "context": value,
            "qas": [
                # The answer is the job title, so the offset is the title start
                _qa(
                    counter[0],
                    f"What is job provided by {company_name} company?",
                    post_name,
                    title_start,
                ),
                _qa(
                    counter[1],
                    f"What is salary provided by {company_name} company for {post_name} post?",
                    pay_rate,
                    payrate_start,
                ),
                _qa(
                    counter[2],
                    f"What are skills required for {post_name} post in {company_name} company?",
                    skills_required,
                    skills_start,
                ),
                _qa(
                    counter[3],
                    f"What is experience required for {post_name} post in {company_name} company?",
                    experience_required,
                    experience_start,
                ),
                _qa(
                    counter[4],
                    f"What is role for {post_name} post in {company_name} company?",
                    role,
                    role_start,
                ),
            ],
        },
    )

In [50]:
# One row per context record, with "context" and "qas" columns
df_main = pd.DataFrame(json_objects)

# Shuffle all rows (frac=1) with a fixed seed and renumber the index from 0
df_main = df_main.sample(frac=1, random_state=20, ignore_index=True)

# Positional boundaries of the Training / Validation / Testing splits
train_end, valid_end = 18000, 20000

train_df = df_main.iloc[:train_end]
valid_df = df_main.iloc[train_end:valid_end]
test_df = df_main.iloc[valid_end:]

In [55]:
# Serialise each split as a JSON array with one object per row
train_json, valid_json, test_json = (
    split.to_json(orient='records') for split in (train_df, valid_df, test_df)
)

# Output files live next to the processed input data
train_path = processed_data_dir + "train_json.json"
valid_path = processed_data_dir + "valid_json.json"
test_path = processed_data_dir + "test_json.json"

# Write every JSON string to its own file
for payload, target_path in (
    (train_json, train_path),
    (valid_json, valid_path),
    (test_json, test_path),
):
    with open(target_path, 'w') as json_file:
        json_file.write(payload)