In [2]:
import pandas as pd
import json
import os

# 技能部分结果映射

## 完成 llm 调用后输出结果的映射

In [6]:
def mapping_files(batch_path, part_path): 
    
    with open(batch_path, 'r', encoding = 'utf-8') as file1, open(part_path, 'r', encoding = 'utf-8') as file2:
        data1 = [json.loads(line) for line in file1]
        data2 = [json.loads(line) for line in file2]

    df1 = pd.DataFrame(data1)
    df2 = pd.DataFrame(data2)

    print("Batch columns:", df1.columns.tolist())
    print("Part columns:", df2.columns.tolist())

    merged_df = pd.merge(df1, df2, on='custom_id', how='inner')
    
    if 'body' in merged_df.columns:
        merged_df['parsed_body'] = merged_df['body'].apply(json.dumps) 

    return merged_df

In [5]:
def combine_merged_dataframes(dataframes):
    if not dataframes:
        raise ValueError("The list of dataframes is empty!")
    
    # 合并所有的 DataFrame
    final_combined_df = pd.concat(dataframes, ignore_index=True)
    
    return final_combined_df

In [None]:
# 文件路径列表
batch_files = [
    "..\\data\\cutwords\\req_to_skills_result\\batch_part1_output.jsonl",
    "..\\data\\cutwords\\req_to_skills_result\\batch_part2_output.jsonl",
    "..\\data\\cutwords\\req_to_skills_result\\batch_part3_output.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\batch_part4_output.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\batch_part5_output.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\batch_part6_output.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\batch_part7_output.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\batch_part8_output.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\batch_part9_output.jsonl"
]

part_files = [
    "..\\data\\cutwords\\req_to_skills_result\\part1.jsonl",
    "..\\data\\cutwords\\req_to_skills_result\\part2.jsonl",
    "..\\data\\cutwords\\req_to_skills_result\\part3.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\part4.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\part5.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\part6.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\part7.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\part8.jsonl", 
    "..\\data\\cutwords\\req_to_skills_result\\part9.jsonl"
]

if len(batch_files) != len(part_files):
    raise ValueError("The number of batch files and part files must match!")

merged_dfs = [mapping_files(batch, part) for batch, part in zip(batch_files, part_files)]
final_merged_df = combine_merged_dataframes(merged_dfs)

final_merged_df.to_json('..\\data\\cutwords\\req_to_skills_result\\final_merged.jsonl', orient='records', lines=True)

Batch columns: ['id', 'custom_id', 'response', 'error']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response', 'error']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response', 'error']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response', 'error']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response', 'error']
Part columns: ['custom_id', 'method', 'url', 'body']


## 完成结果与招聘信息 id 的映射

In [None]:
final_merged_file = "..\\data\\cutwords\\req_to_skills_result\\final_merged.jsonl"
jobdata_with_id = "..\\data\\jobdata_preprocessed_with_id.csv"

In [63]:
# 处理 jsonl
content_data = []

with open(final_merged_file, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, 1):
        try:
            record = json.loads(line)
            
            # 检查 'custom_id' 和 'response' 是否存在
            if 'custom_id' in record and 'response' in record:
                response_body = record['response'].get('body', {})
                choices = response_body.get('choices', [])
                
                if choices:  # 确保 choices 存在且非空
                    message = choices[0].get('message', {})
                    content = message.get('content', '')

                    if content:  # 确保 content 存在
                        try:
                            decoded_content = json.loads(content) 
                            content_data.append({
                                'custom_id': record['custom_id'],
                                'hard_skills': decoded_content.get('hard_skills', []),
                                'soft_skills': decoded_content.get('soft_skills', [])
                            })
                        except json.JSONDecodeError as e:
                            print(f"JSONDecodeError at line {line_number}: {e}")
                    else:
                        print(f"No content found in message for custom_id: {record['custom_id']} (line {line_number})")
                else:
                    print(f"No choices found in body for custom_id: {record['custom_id']} (line {line_number})")
            else:
                print(f"Missing 'custom_id' or 'response' at line {line_number}")
        except Exception as e:
            print(f"Error at line {line_number}: {e}")

final_content_df = pd.DataFrame(content_data)

In [64]:
print(final_content_df.head())

               custom_id               hard_skills         soft_skills
0  Lcer-s-mfC1_DTn4Xhi8B  [媒体运营, 视觉设计, 数据处理, 内容策略]        [组织技能, 交际能力]
1  j2iMOLV8nyaPmi7N4i7vW              [媒体运营, 数据处理]  [个人素养, 交际能力, 市场管理]
2  avZPqlcVKpPDNHyTPUeQd              [媒体运营, 数据处理]        [组织技能, 交际能力]
3  GsSA5fS_1ur0eWLbv0Pgl        [数据处理, 内容策略, 媒体运营]              [市场管理]
4  MWp8dcg3q2v49IP6jqWlM              [媒体运营, 数据处理]  [个人素养, 抗压能力, 交际能力]


In [66]:
jobdata_df = pd.read_csv(jobdata_with_id)

# 确保 ID 对应一致
if 'id' in jobdata_df.columns:
    jobdata_df.rename(columns={'id': 'custom_id'}, inplace=True)

# 添加空列
jobdata_df['hard_skills'] = None
jobdata_df['soft_skills'] = None

# 合并数据
merged_df = jobdata_df.merge(final_content_df, on='custom_id', how='inner', suffixes=('_jobdata', '_final'))

print("Merged DataFrame columns:", merged_df.columns)


Merged DataFrame columns: Index(['岗位编号', '岗位名称', '工作类型', '工作经验', '城市', '行政区域', '街道区域', '企业名称', '企业标签',
       '企业人数规模', '学历', '融资阶段', '工作简介', '企业财产类型', '招聘信息更新时间', '招募人数', '岗位标签',
       '岗位分类', 'salary_min', 'salary_max', 'salary_type', 'custom_id',
       'hard_skills_jobdata', 'soft_skills_jobdata', 'hard_skills_final',
       'soft_skills_final'],
      dtype='object')


In [67]:
# 发现 `hard_skills` 和 `soft_skills` 被重命名，故修正列名
if 'hard_skills_final' in merged_df.columns:
    merged_df.rename(columns={
        'hard_skills_final': 'hard_skills',
        'soft_skills_final': 'soft_skills'
    }, inplace=True)

merged_df.drop(columns=['hard_skills_jobdata', 'soft_skills_jobdata'], inplace=True, errors='ignore')
print("Columns after renaming and cleaning:", merged_df.columns)

Columns after renaming and cleaning: Index(['岗位编号', '岗位名称', '工作类型', '工作经验', '城市', '行政区域', '街道区域', '企业名称', '企业标签',
       '企业人数规模', '学历', '融资阶段', '工作简介', '企业财产类型', '招聘信息更新时间', '招募人数', '岗位标签',
       '岗位分类', 'salary_min', 'salary_max', 'salary_type', 'custom_id',
       'hard_skills', 'soft_skills'],
      dtype='object')


In [68]:
processed_df = merged_df.dropna(subset=['hard_skills', 'soft_skills'])
print(processed_df.head())

                      岗位编号         岗位名称 工作类型   工作经验  城市 行政区域 街道区域  \
0  CC120994460J40664011302       天猫运营店长   全职   3-5年  北京   西城   椿树   
1  CC447718930J40602358006    天猫运营经理/店长   全职   3-5年  北京   朝阳  麦子店   
2  CC130850440J40657947102       天猫运营助理   全职   经验不限  北京   海淀  西北旺   
3  CC513268080J40826412204       天猫运营店长   全职   1-3年  北京   海淀   东升   
4  CC447718930J40602828706  天猫运营经理/天猫店长   全职  5-10年  北京   朝阳  麦子店   

             企业名称 企业标签    企业人数规模  ...             招聘信息更新时间 招募人数  \
0  北京市美顺雅鞋业有限责任公司  NaN  500-999人  ...  2024-11-15 10:26:26    1   
1         克拉斯国际家居  NaN  500-999人  ...  2024-11-15 00:19:43    1   
2    北京绿伞科技股份有限公司  NaN  500-999人  ...  2024-11-14 16:44:33    1   
3  北京意间文化艺术发展有限公司  NaN    20-99人  ...  2024-11-13 16:34:52    1   
4         克拉斯国际家居  NaN  500-999人  ...  2024-11-15 00:05:03    1   

                       岗位标签     岗位分类 salary_min  salary_max salary_type  \
0                 天猫;淘宝;服装;  淘宝/天猫运营      96000      144000           M   
1     天猫;淘宝;家具;店长;经理;新媒体运营;  淘宝/

In [None]:
processed_df.to_csv('..\\data\\task3\\jobdata_get_skills.csv', index=False, encoding='utf-8')

# 态度 -> 观点部分映射处理

In [None]:
# 文件路径列表
batch_files = [
    "..\\data\\task4\\batch_part1_output.jsonl",
    "..\\data\\task4\\batch_part1_output.jsonl"
]

part_files = [
    "..\\data\\task4\\part1.jsonl",
    "..\\data\\task4\\part2.jsonl"
]

if len(batch_files) != len(part_files):
    raise ValueError("The number of batch files and part files must match!")

merged_dfs = [mapping_files(batch, part) for batch, part in zip(batch_files, part_files)]
final_merged_df = combine_merged_dataframes(merged_dfs)

final_merged_df.to_json('..\\data\\task4\\merged_opinions.jsonl', orient='records', lines=True)

Batch columns: ['id', 'custom_id', 'response', 'error']
Part columns: ['custom_id', 'method', 'url', 'body']
Batch columns: ['id', 'custom_id', 'response', 'error']
Part columns: ['custom_id', 'method', 'url', 'body']


In [None]:
final_merged_file = "..\\data\\task4\\merged_opinions.jsonl"
zhihudata_with_id = "..\\data\\attitudes_zhihu_with_ids.csv"

In [10]:
content_data = []

with open(final_merged_file, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, 1):
        try:
            record = json.loads(line)
            
            # 检查 'custom_id' 和 'response' 是否存在
            if 'custom_id' in record and 'response' in record:
                response_body = record['response'].get('body', {})
                choices = response_body.get('choices', [])
        
                if choices:  # 确保 choices 存在且非空
                    message = choices[0].get('message', {})
                    content = message.get('content', '')

                    if content:  # 确保 content 存在
                        try:
                            decoded_content = json.loads(content) 
                            content_data.append({
                                'custom_id': record['custom_id'],
                                'agree': decoded_content.get('agree', []),
                                'disagree': decoded_content.get('disagree', []), 
                                'neutral': decoded_content.get('neutral', [])
                            })
                        except json.JSONDecodeError as e:
                            print(f"JSONDecodeError at line {line_number}: {e}")
                    else:
                        print(f"No content found in message for custom_id: {record['custom_id']} (line {line_number})")
                else:
                    print(f"No choices found in body for custom_id: {record['custom_id']} (line {line_number})")
            else:
                print(f"Missing 'custom_id' or 'response' at line {line_number}")
        except Exception as e:
            print(f"Error at line {line_number}: {e}")

final_content_df = pd.DataFrame(content_data)

final_content_df.head()

Unnamed: 0,custom_id,agree,disagree,neutral
0,AkDAFULrD7GTJdWvSE8TN,[],[],"[journalism-favors-new-media-jobs, tier-1-citi..."
1,sViYNRTOSjTcxXPEL2ykH,"[journalism-favors-new-media-jobs, experience-...",[],"[tier-1-cities-offer-more-jobs, education-leve..."
2,aD9e51cniLKXotH1AyMZ2,[],"[journalism-favors-new-media-jobs, tier-1-citi...",[]
3,3S51njyrremig0DMNLY-S,[journalism-favors-new-media-jobs],[],"[tier-1-cities-offer-more-jobs, experience-mat..."
4,r4ro9pzQ9xvmrlj0wIuMn,[],[],"[journalism-favors-new-media-jobs, tier-1-citi..."


In [14]:
zhihudata_df = pd.read_csv(zhihudata_with_id)

# 确保 ID 对应一致
if 'id' in zhihudata_df.columns:
    zhihudata_df.rename(columns={'id': 'custom_id'}, inplace=True)

# 添加空列
zhihudata_df['agree'] = None
zhihudata_df['disagree'] = None
zhihudata_df['neutral'] = None

# 合并数据
merged_df = zhihudata_df.merge(final_content_df, on='custom_id', how='inner', suffixes=('_zhihudata', '_final'))

if 'agree_final' in merged_df.columns:
    merged_df.rename(columns={
        'agree_final': 'agree',
        'disagree_final': 'disagree',
        'neutral_final' : 'neutral'
    }, inplace=True)

merged_df.drop(columns=['agree_zhihudata', 'disagree_zhihudata', 'neutral_zhihudata'], inplace=True, errors='ignore')
print("Columns after renaming and cleaning:", merged_df.columns)

Columns after renaming and cleaning: Index(['custom_id', 'content', 'agree', 'disagree', 'neutral'], dtype='object')


In [15]:
processed_df = merged_df.dropna(subset=['agree', 'disagree', 'neutral'])
processed_df.head()

Unnamed: 0,custom_id,content,agree,disagree,neutral
0,AkDAFULrD7GTJdWvSE8TN,报纸 科技媒体 互联网,[],[],"[journalism-favors-new-media-jobs, tier-1-citi..."
1,sViYNRTOSjTcxXPEL2ykH,知乎的神奇之处在于，会帮你记录来时的路时间来到了2024，距离第二次回答也已经3年，真弹指一...,"[journalism-favors-new-media-jobs, experience-...",[],"[tier-1-cities-offer-more-jobs, education-leve..."
2,aD9e51cniLKXotH1AyMZ2,21年毕业的，这几天面临失业危机了，想找几个同学问问他们的现状，了解一下新闻学的就业行情。当...,[],"[journalism-favors-new-media-jobs, tier-1-citi...",[]
3,3S51njyrremig0DMNLY-S,目前刚被录取，还没入学军训我知道社会对新闻学的评价哈哈哈哈哈被调剂来的，是命运让我来到这里～...,[journalism-favors-new-media-jobs],[],"[tier-1-cities-offer-more-jobs, experience-mat..."
4,r4ro9pzQ9xvmrlj0wIuMn,啊啊啊啊，我也蹲一个。四年后毕业。,[],[],"[journalism-favors-new-media-jobs, tier-1-citi..."


In [None]:
processed_df.to_csv("..\\data\\task4\\zhihu_get_opinion.csv", index=False, encoding='utf-8')