In [24]:
import os  # imported here because this cell runs before the notebook's import cell

# Top-level data folder. Defaults to the original local path; set the
# RESUME_BUILDER_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
dir_ = os.environ.get(
    "RESUME_BUILDER_DATA_DIR",
    "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/data/",
)
# Sub-folder paths are built by plain string concatenation (dir_ + 'raw/'),
# so the value must end with a separator.
if not dir_.endswith("/"):
    dir_ += "/"

In [25]:
# Sub-folders of the top-level data directory: raw input files and generated output files.
raw_data_dir = f"{dir_}raw/"
processed_data_dir = f"{dir_}Processed/"

In [26]:
import os
import re

import numpy as np
import pandas as pd

In [27]:
####################### LOAD DATA
##################################################################
def load_data(url: str) -> pd.DataFrame:
    """
    Loads a CSV or JSON-lines file into a dataframe.

    Paths ending in "csv" (case-insensitive) are read with `pd.read_csv`, and
    every column whose name contains "Unnamed" is dropped. Any other path is
    read with `pd.read_json(..., lines=True)`.

    Args:
      url (string or path-like) - the path to the file

    Returns:
      df (pandas dataframe)
    """
    print("⏳ Loading Data")

    # str() lets pathlib.Path objects through; lower() so that "data.CSV" is
    # not mistaken for a JSON-lines file.
    if str(url).lower().endswith("csv"):
        df = pd.read_csv(url)

        # Remove the garbage "Unnamed: N" columns in a single call instead of
        # mutating the frame once per column.
        garbage_columns = [col for col in df.columns if "Unnamed" in str(col)]
        df = df.drop(columns=garbage_columns)
    else:
        df = pd.read_json(url, lines=True)

    print("✅ Done Loading")
    return df


# Paths of the two raw CSV files.
ds_dir = f"{raw_data_dir}data_scientist_united_states_job_postings_jobspikr.csv"
software_developers_dir = f"{raw_data_dir}software_developer_united_states_1971_20191023_1(1).csv"

# Read both files (evaluated left to right); each call prints its own progress messages.
ds, software_developers = load_data(ds_dir), load_data(software_developers_dir)

⏳ Loading Data
✅ Done Loading
⏳ Loading Data
✅ Done Loading


In [28]:
# Columns removed from both frames; "html_job_description" is additionally
# removed from the data-scientist frame.
shared_unused_columns = ["crawl_timestamp", "uniq_id", "contact_phone_number", "cursor"]

ds.drop(columns=shared_unused_columns + ["html_job_description"], inplace=True)
software_developers.drop(columns=shared_unused_columns, inplace=True)

In [29]:
# Stack both frames into a single dataframe. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each frame keeps its own row labels,
# so labels repeat and label-based lookups can match more than one row.
main_df = pd.concat([ds, software_developers], ignore_index=True)

# Drop the salary, location, job-board and contact-email columns.
main_df.drop(
    columns=[
        "salary_offered",
        "geo",
        "job_board",
        "city",
        "state",
        "inferred_city",
        "inferred_state",
        "inferred_country",
        "contact_email",
    ],
    inplace=True,
)

# The two source frames are now part of main_df; release them.
del ds, software_developers

In [33]:
### Replace missing values in these columns with the "Unknown" placeholder
for column in ("country", "company_name", "category"):
    main_df[column] = main_df[column].fillna("Unknown")

In [34]:
### Copy of the first 1000 rows, set aside for building the question-answering dataset
data_scientist_1000 = main_df.head(1000).copy()

In [35]:
# Preview the first five rows of the subset
data_scientist_1000.head(5)

Unnamed: 0,url,job_title,category,company_name,country,post_date,job_description,job_type
0,https://www.indeed.com/viewjob?jk=fd83355c2b23...,Enterprise Data Scientist I,Accounting/Finance,Farmers Insurance Group,Usa,2019-02-06,Read what people are saying about working here...,Undefined
1,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist,Unknown,Luxoft USA Inc,Usa,2019-02-05,We have an immediate opening for a Sharp Data ...,Undefined
2,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist,Unknown,Cincinnati Bell Technology Solutions,Usa,2019-02-05,Candidates should have the following backgroun...,Full Time
3,https://www.indeed.com/viewjob?jk=841edd86ead2...,"Data Scientist, Aladdin Wealth Tech, Associate...",Accounting/Finance,BlackRock,Usa,2019-02-06,Read what people are saying about working here...,Undefined
4,https://job-openings.monster.com/senior-data-s...,Senior Data Scientist,biotech,CyberCoders,Usa,2019-02-05,We are seeking an extraordinary Data Scientist...,Full Time


In [36]:
# Flatten every posting into one tagged string:
#   T_<title>,C_<category>,CN_<company>,CO_<country>,PD_<date>,JD_<description>,JT_<type>,URL_<url>
# Concatenating a string with a missing value (NaN) yields NaN for the whole
# row, so missing fields are replaced with "Unknown" first. Rows without
# missing values produce exactly the same string as before.
tagged_fields = [
    ("T_", "job_title"),
    (",C_", "category"),
    (",CN_", "company_name"),
    (",CO_", "country"),
    (",PD_", "post_date"),
    (",JD_", "job_description"),
    (",JT_", "job_type"),
    (",URL_", "url"),
]

data_scientist_1000_final = pd.Series("", index=data_scientist_1000.index, dtype=object)
for tag, column in tagged_fields:
    data_scientist_1000_final = (
        data_scientist_1000_final + tag + data_scientist_1000[column].fillna("Unknown")
    )

In [38]:
def remove_garbage(x):
    """
    Strips formatting noise from one text record.

    Args:
      x - the text to clean. Non-string values (for example the NaN pandas
          uses for missing text) are returned unchanged instead of raising
          a TypeError.

    Returns:
      The cleaned string (or x itself when it is not a string).

    Note: every "/" is replaced with a space, so URLs contained in the text
    lose their slashes.
    """
    if not isinstance(x, str):
        return x

    x = re.sub(r"\n", "", x)       # Delete newlines (nothing is inserted in their place)
    x = re.sub(r"\\u", "", x)      # Delete a literal backslash followed by "u"
    x = re.sub(r"\/", " ", x)      # Replace each forward slash with a space
    x = re.sub(r'\s{2,}', ' ', x)  # Collapse runs of 2+ whitespace characters into one space

    return x
    

# Clean every record element-wise
data_scientist_1000_final = data_scientist_1000_final.apply(remove_garbage)

In [64]:
def find_index(row, df):
    """
    Prints and returns where each tagged field starts inside one record.

    The record is expected to look like
    "T_<title>,C_<category>,CN_...,CO_...,PD_...,JD_...,JT_...,URL_...".
    Markers are searched with their leading comma and in field order, each
    search starting where the previous field began, so marker-like text
    inside an earlier field (a title containing "C_", say) is not mistaken
    for the real marker.

    Args:
      row - key of the record inside df
      df  - indexable collection of record strings (e.g. a pandas Series)

    Returns:
      dict mapping "<field>_index" to the position of the first character
      after that field's marker, or -1 when the marker is not found.
    """
    # (printed label, marker) in the order the fields are concatenated
    markers = (
        ("title_index", "T_"),
        ("category_index", ",C_"),
        ("company_name_index", ",CN_"),
        ("country_index", ",CO_"),
        ("post_date_index", ",PD_"),
        ("job_desc_index", ",JD_"),
        ("job_type_index", ",JT_"),
        ("url_index", ",URL_"),
    )

    text = df[row]
    indices = {}
    search_from = 0
    for label, marker in markers:
        position = text.find(marker, search_from)
        if position == -1:
            # Report a missing marker explicitly rather than as a bogus offset
            indices[label] = -1
        else:
            indices[label] = position + len(marker)
            search_from = indices[label]
        print(f"{label} {indices[label]}")

    return indices

In [68]:
# Print where each field starts in record 10
find_index(row=10, df=data_scientist_1000_final);

title_index 2
category_index 19
company_name_index 40
country_index 60
post_date_index 67
job_desc_index 81
job_type_index 3511
url_index 3525


In [70]:
# Slice the job-description field out of record 10. The bounds are looked up
# in the record itself instead of being hard-coded (81 and 3511 were the
# job_desc_index / job_type_index values find_index printed for this row), so
# the cell stays correct if the data changes.
record_10 = data_scientist_1000_final[10]
desc_start = record_10.find(",JD_") + 4  # first character after the ",JD_" marker
desc_end = record_10.find(",JT_") + 4    # first character after the ",JT_" marker, as before
record_10[desc_start:desc_end]

"Read what people are saying about working here. At Northrop Grumman, innovation isn't just an idea-it's a way of life. Our employees have incredible opportunities to work on revolutionary systems in air and space that impact lives around the world today, and for generations to come. Our work preserves freedom and democracy, and advances human discovery and our understanding of the universe. We look for people who have bold new ideas, courage and a pioneering spirit to join forces to invent the future.Northrop Grumman is seeking a Data Scientist to become part of the organization supporting the Army Analytics Group's Research Facilitation Laboratory located in Monterey, California. The team conducts a variety of research studies and supports data analytics efforts for clients across the Army, Office of the Secretary of Defense and greater DoD, conducting high-quality research with practical real-world implications. The team works primarily within a data environment that houses a wealth

In [55]:
# find_index keeps job_type_index and url_index as local variables, so they
# are not available here unless some other cell defines them. Compute them for
# record 0 with the same meaning find_index gives them: the position of the
# first character after the marker.
record_0 = data_scientist_1000_final[0]
job_type_index = record_0.find(",JT_") + 4
url_index = record_0.find(",URL_") + 5
record_0[job_type_index:url_index]

'Undefined,URL_'

In [18]:
# Wrap each record in a {"context": ...} object
json_objects = [{"context": text} for text in data_scientist_1000_final]

In [19]:
# Table built from the list of {"context": ...} objects
df2 = pd.DataFrame(data=json_objects)

In [20]:
# Create the output folder if it does not exist yet, so the write below does
# not fail with FileNotFoundError.
os.makedirs(processed_data_dir, exist_ok=True)

# Convert the DataFrame to a JSON array with one object per row
json_data = df2.to_json(orient='records')

file_path = processed_data_dir+"first_1000.json"
# Write the JSON data to the file; the encoding is explicit so the result
# does not depend on the platform default.
with open(file_path, 'w', encoding="utf-8") as json_file:
    json_file.write(json_data)