In [11]:
import pandas as pd
import os

In [20]:
def create_vrf_training_data(vrf_df, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    required_columns = ['Job Title', 'Job Title Generic Description', 'Request Name', 'Request Name Description', 'Skills/Keywords']
    columns = required_columns + ['Add\'l Skills', 'Languages']
    vrf_df = vrf_df.dropna(subset=required_columns)
    vrf_df = vrf_df[columns].fillna("NA")
    vrf_df = vrf_df.apply(lambda x: x.replace("\n", " "))
    vrf_df["Request Id"] = vrf_df["Request Name"].str.split(" - ", n=1).str[1]
    # Make generic training data
    vrf_generic_df = vrf_df[["Job Title", "Job Title Generic Description"]].drop_duplicates(subset=["Job Title", "Job Title Generic Description"])
    vrf_generic_df["summary"] = "The job titled: \"" + vrf_generic_df["Job Title"] + "\" has a description: " + vrf_generic_df["Job Title Generic Description"] + "."
    vrf_generic_df.to_csv(os.path.join(output_dir, "vrf_generic_train_data.csv"), index=False)
    # Make specific training data
    vrf_specific_df = vrf_df[["Request Id"]]
    vrf_specific_df["summary"] = "A specific job request for the job titled: \"" + vrf_df["Job Title"] + "\" has a request id [" + \
        vrf_df["Request Id"] + "] with a specific request description: " + vrf_df["Request Name Description"] + \
        ", and requests the following skills: " + vrf_df["Skills/Keywords"] + ", and " + vrf_df["Add'l Skills"] + \
        " and know the languages: " + vrf_df["Languages"] + "."
    vrf_specific_df.to_csv(os.path.join(output_dir, "vrf_specific_train_data.csv"), index=False)

vrf_df = pd.read_csv('data/vrf_data_new.csv')
create_vrf_training_data(vrf_df, 'data/generated_training_data/')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vrf_specific_df["summary"] = "A specific job request for the job titled: \"" + vrf_df["Job Title"] + "\" has a request id [" + \


In [None]:
def vrf_single_txt_corpus(specific_train_data_file, generic_train_data_file):
    specific_df = pd.read_csv(specific_train_data_file, header=0)
    generic_df = pd.read_csv(generic_train_data_file, header=0)
    return "\n".join(specific_df["summary"].tolist() + generic_df["summary"].tolist())

corpus = vrf_single_txt_corpus('data/generated_training_data/vrf_specific_train_data.csv', 'data/generated_training_data/vrf_generic_train_data.csv')
print(corpus)

In [30]:
def create_vrf_single_df(specific_train_data_file, generic_train_data_file):
    specific_df = pd.read_csv(specific_train_data_file, header=0)
    specific_df = specific_df.rename(columns={'Request Id': 'Job Title'})
    generic_df = pd.read_csv(generic_train_data_file, header=0)
    target_columns = ['Job Title', 'summary']
    return pd.concat([specific_df[target_columns], generic_df[target_columns]]).reset_index(drop=True)

df = create_vrf_single_df('data/generated_training_data/vrf_specific_train_data.csv', 'data/generated_training_data/vrf_generic_train_data.csv')
df

Unnamed: 0,Job Title,summary
0,Access Control - 2514,"A specific job request for the job titled: ""Ac..."
1,Access Control - 2515,"A specific job request for the job titled: ""Ac..."
2,Access Control - 2794,"A specific job request for the job titled: ""Ac..."
3,Access Control - 2838,"A specific job request for the job titled: ""Ac..."
4,Account Auditor - 2317,"A specific job request for the job titled: ""Ac..."
...,...,...
757,Video calls - Interviews,"The job titled: ""Video calls - Interviews"" has..."
758,Video Editor,"The job titled: ""Video Editor"" has a descripti..."
759,Vietnamese Translator,"The job titled: ""Vietnamese Translator"" has a ..."
760,Violin Teacher,"The job titled: ""Violin Teacher"" has a descrip..."


In [33]:
for i, sentence in enumerate(df["summary"]):
    print(i, sentence)

0 A specific job request for the job titled: "Access Control" has a request id [Access Control - 2514] with a specific request description: Coordinate Ashram Program check-in at different places and give participants & volunteers program wristband, and requests the following skills: Basic Computer Skills / Basic Computer (MS Office and Email) Skills, and NA and know the languages: NA.
1 A specific job request for the job titled: "Access Control" has a request id [Access Control - 2515] with a specific request description: Program volunteering coordination
To handle front office and enroll volunteers for program through calls & email, and requests the following skills: Basic Computer Skills / Basic Computer (MS Office and Email) Skills, and NA and know the languages: NA.
2 A specific job request for the job titled: "Access Control" has a request id [Access Control - 2794] with a specific request description: ashram security management, and requests the following skills: Office Assistant

In [34]:
df['Job Title'].to_list()

['Access Control - 2514',
 'Access Control - 2515',
 'Access Control - 2794',
 'Access Control - 2838',
 'Account Auditor - 2317',
 'Account Auditor - 2506',
 'Account Auditor - 2887',
 'Accountant - 2276',
 'Accountant - 2315',
 'Accountant - 2316',
 'Accountant - 2318',
 'Accountant - 2328',
 'Accountant - 2335',
 'Accountant - 2382',
 'Accountant - 2390',
 'Accountant - 2590',
 'Accountant - 2593',
 'LSP - Accountant - 2803',
 'Accountant - 2828',
 'Ad Creative Design Expert - 2551',
 'Administrative Activities (Back Office) - 2254',
 'Administrative Activities (Back Office) - 2290',
 'Administrative Activities (Back Office) - 2323',
 'Administrative Activities (Back Office) - 2331',
 'Administrative Activities (Back Office) - 2336',
 'Administrative Activities (Back Office) - 2366',
 'Administrative Activities (Back Office) - 2367',
 'Administrative Activities (Back Office) - 2375',
 'Administrative Activities (Back Office) - 2413',
 'Administrative Activities (Back Office) - 2438'