In [1]:
# The directory for the top level data folder.
# The path can be overridden with the RESUME_BUILDER_DATA_DIR environment
# variable so the notebook also runs on machines that do not have this exact
# home directory; when the variable is unset the original location is used.
import os

dir_ = os.environ.get(
    "RESUME_BUILDER_DATA_DIR",
    "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/data/",
)
# Later cells build sub-paths with plain string concatenation, so make sure the
# value ends with a path separator (a no-op for the default, which already does).
dir_ = os.path.join(dir_, "")

In [2]:
# Sub-folders of the top level data directory: raw inputs and processed outputs.
raw_data_dir = f"{dir_}raw/"
processed_data_dir = f"{dir_}Processed/"

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
####################### LOAD DATA
##################################################################
def load_data(url: str) ->pd.DataFrame:
    """
    Loads a data file into memory as a dataframe.

    Paths ending in "csv" (in any letter case) are read with ``pd.read_csv``
    and every column whose name contains "Unnamed" is removed. Any other path
    is read as line-delimited JSON with ``pd.read_json(..., lines=True)``.

    Args:
      url (string or path-like) - the path to the data file

    Returns:
      df (pandas dataframe)
    """

    print("⏳ Loading Data")

    # Normalise to text so pathlib.Path objects are accepted as well.
    path = str(url)

    # Compare the extension case-insensitively so "data.CSV" is not sent to the
    # JSON reader by mistake.
    if path.lower().endswith("csv"):
        df = pd.read_csv(path)

        # read_csv labels header-less columns "Unnamed: N"; treat them as garbage.
        # str(col) keeps the membership test safe for non-string column labels.
        garbage_columns = [col for col in df.columns if "Unnamed" in str(col)]
        df = df.drop(columns=garbage_columns)
    else:
        df = pd.read_json(path, lines=True)

    print("✅ Done Loading")
    return df


# Locations of the two raw CSV exports inside the raw data folder.
ds_dir = f"{raw_data_dir}data_scientist_united_states_job_postings_jobspikr.csv"
software_developers_dir = f"{raw_data_dir}software_developer_united_states_1971_20191023_1(1).csv"

# Load them one after the other (data scientist file first).
ds, software_developers = (
    load_data(csv_path) for csv_path in (ds_dir, software_developers_dir)
)

⏳ Loading Data
✅ Done Loading
⏳ Loading Data
✅ Done Loading


In [5]:
# Remove the columns that are not needed from each dataframe.
# Both frames lose the same four columns; `ds` additionally loses its HTML
# copy of the job description.
common_unused_columns = ["crawl_timestamp", "uniq_id", "contact_phone_number", "cursor"]

ds.drop(columns=common_unused_columns + ["html_job_description"], inplace=True)
software_developers.drop(columns=common_unused_columns, inplace=True)

In [6]:
# Joining everything into a single dataframe and deleting the rest.
#
# ignore_index=True renumbers the rows 0..n-1. Without it each source frame
# keeps the default 0-based labels it got from read_csv, so the combined frame
# would carry every label twice and a label lookup such as main_df.loc[0]
# would return one row per source instead of a single row.
main_df = pd.concat([ds, software_developers], ignore_index=True)

# Drop the columns that are not used in the rest of the notebook. A new frame
# is bound to main_df instead of mutating in place.
main_df = main_df.drop(
    columns=[
        "salary_offered",
        "geo",
        "job_board",
        "city",
        "state",
        "inferred_city",
        "inferred_state",
        "inferred_country",
        "contact_email",
    ]
)

# The source frames are no longer needed once merged; release them.
del ds, software_developers

In [7]:
# Preview the first rows of the combined dataframe.
main_df.head()

Unnamed: 0,url,job_title,category,company_name,country,post_date,job_description,job_type
0,https://www.indeed.com/viewjob?jk=fd83355c2b23...,Enterprise Data Scientist I,Accounting/Finance,Farmers Insurance Group,Usa,2019-02-06,Read what people are saying about working here...,Undefined
1,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist,,Luxoft USA Inc,Usa,2019-02-05,We have an immediate opening for a Sharp Data ...,Undefined
2,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist,,Cincinnati Bell Technology Solutions,Usa,2019-02-05,Candidates should have the following backgroun...,Full Time
3,https://www.indeed.com/viewjob?jk=841edd86ead2...,"Data Scientist, Aladdin Wealth Tech, Associate...",Accounting/Finance,BlackRock,Usa,2019-02-06,Read what people are saying about working here...,Undefined
4,https://job-openings.monster.com/senior-data-s...,Senior Data Scientist,biotech,CyberCoders,Usa,2019-02-05,We are seeking an extraordinary Data Scientist...,Full Time


In [8]:
# Missing country / company / category values become the literal "Unknown".
for column_name in ("country", "company_name", "category"):
    main_df[column_name] = main_df[column_name].fillna("Unknown")

In [9]:
# (rows, columns) of the combined dataframe.
main_df.shape

(20000, 8)

In [10]:
# Independent copy of the first 1000 rows, so later edits do not touch main_df.
data_scientist_1000 = main_df.head(1000).copy()

In [11]:
# Preview the first rows of the 1000-row subset.
data_scientist_1000.head()

Unnamed: 0,url,job_title,category,company_name,country,post_date,job_description,job_type
0,https://www.indeed.com/viewjob?jk=fd83355c2b23...,Enterprise Data Scientist I,Accounting/Finance,Farmers Insurance Group,Usa,2019-02-06,Read what people are saying about working here...,Undefined
1,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist,Unknown,Luxoft USA Inc,Usa,2019-02-05,We have an immediate opening for a Sharp Data ...,Undefined
2,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist,Unknown,Cincinnati Bell Technology Solutions,Usa,2019-02-05,Candidates should have the following backgroun...,Full Time
3,https://www.indeed.com/viewjob?jk=841edd86ead2...,"Data Scientist, Aladdin Wealth Tech, Associate...",Accounting/Finance,BlackRock,Usa,2019-02-06,Read what people are saying about working here...,Undefined
4,https://job-openings.monster.com/senior-data-s...,Senior Data Scientist,biotech,CyberCoders,Usa,2019-02-05,We are seeking an extraordinary Data Scientist...,Full Time


What is the job description of the Data Scientist position at Farmers Insurance Group?
What are the jobs 

In [12]:
# Build one tagged, comma-separated string per row:
#   T_<title>,C_<category>,CN_<company>,CO_<country>,PD_<date>,JD_<description>,JT<type>
# Note: the job-type tag is ",JT" with no trailing underscore, unlike the other
# tags; it is kept exactly as is.
tagged_columns = (
    ("T_", "job_title"),
    (",C_", "category"),
    (",CN_", "company_name"),
    (",CO_", "country"),
    (",PD_", "post_date"),
    (",JD_", "job_description"),
    (",JT", "job_type"),
)

tagged_parts = [tag + data_scientist_1000[column] for tag, column in tagged_columns]

# Concatenate the tagged pieces element-wise, left to right.
data_scientist_1000_final = tagged_parts[0]
for part in tagged_parts[1:]:
    data_scientist_1000_final = data_scientist_1000_final + part

In [13]:
# Offsets of the first character of each field's value inside the first record.
#
# Each offset is "position of the tag + length of the tag", so the three-letter
# tags (CN_, CO_, PD_, JD_) are skipped in full; a fixed "+ 2" would leave the
# index pointing at the underscore. Apart from the leading title tag, the
# search includes the comma that precedes each tag, which avoids matching the
# same letters inside an earlier field's text.
first_record = data_scientist_1000_final[0]

title_index = first_record.find("T_") + len("T_")
category_index = first_record.find(",C_") + len(",C_")
company_name_index = first_record.find(",CN_") + len(",CN_")
country_index = first_record.find(",CO_") + len(",CO_")
post_date_index = first_record.find(",PD_") + len(",PD_")
job_desc_index = first_record.find(",JD_") + len(",JD_")
# The job type is the last field and its tag has no underscore (",JT"); search
# from the right so a ",JT" inside the job description is not picked up.
job_type_index = first_record.rfind(",JT") + len(",JT")

In [14]:
# Slice the title out of the first record: from just after "T_" up to the comma
# before "C_" (category_index sits 2 past the start of "C_", hence the -3).
data_scientist_1000_final[0][title_index:category_index-3]

'Enterprise Data Scientist I'

In [15]:
# Display the series of tagged record strings.
data_scientist_1000_final

0      T_Enterprise Data Scientist I,C_Accounting/Fin...
1      T_Data Scientist,C_Unknown,CN_Luxoft USA Inc,C...
2      T_Data Scientist,C_Unknown,CN_Cincinnati Bell ...
3      T_Data Scientist, Aladdin Wealth Tech, Associa...
4      T_Senior Data Scientist,C_biotech,CN_CyberCode...
                             ...                        
995    T_Senior Data Scientist, Advanced Marketing An...
996    T_Senior Data Scientist,C_Computer/Internet,CN...
997    T_Data Scientist, People Engineering,C_Compute...
998    T_Data Scientist,C_business and financial oper...
999    T_Associate Data Scientist,C_Computer/Internet...
Length: 1000, dtype: object

In [17]:
def remove_garbage(x):
    """
    Clean up one text record.

    Steps, applied in order:
      1. every newline becomes a space, so the words on either side of a line
         break stay separated instead of being fused together
      2. every literal backslash-u sequence (the two characters ``\\u``) is removed
      3. every forward slash becomes a space
      4. every run of two or more whitespace characters collapses to one space

    Args:
      x (string) - the text to clean

    Returns:
      x (string) - the cleaned text
    """
    x = re.sub(r"\n", " ", x)  # Newline -> space (keeps neighbouring words apart)
    x = re.sub(r"\\u", "", x)  # Drop literal "\u" sequences
    x = re.sub(r"\/", " ", x)  # Replace / with a space
    x = re.sub(r"\s{2,}", " ", x)  # Collapse 2 or more whitespace chars into 1 space

    return x
    

# Clean every record string. The result is bound to the same name, so
# re-running this cell runs the cleanup again on already-cleaned text.
data_scientist_1000_final = data_scientist_1000_final.map(remove_garbage)

In [18]:
# One {"context": <record string>} dict per record, in series order.
json_objects = [{"context": record} for record in data_scientist_1000_final]

In [19]:
# Turn the list of {"context": ...} dicts into a dataframe with one "context" column.
df2 = pd.DataFrame(json_objects)

In [20]:
# Destination of the processed file inside the processed-data folder.
file_path = f"{processed_data_dir}first_1000.json"

# Serialise the dataframe as a JSON array with one object per row.
json_data = df2.to_json(orient="records")

# Save the JSON text to disk, replacing any existing file.
with open(file_path, mode="w") as out_file:
    out_file.write(json_data)