In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Read the pre-cleaned loan data from disk, then work on a copy so the frame
# as loaded (`acc_cleaned`) stays untouched by later steps.
acc_cleaned = pd.read_csv('acc_cleaned.csv')
acc_df_cleaned = acc_cleaned.copy()

In [8]:
# Preview the first five rows of the working copy.
acc_df_cleaned.head(5)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,...,revol_util,total_acc,loan_status,pub_rec,pub_rec_bankruptcies,mort_acc,inq_last_6mths,inq_last_12m,num_bc_tl,num_actv_bc_tl
0,3600.0,36 months,13.99,123.03,C,leadman,10+ years,MORTGAGE,55000.0,Not Verified,...,29.7,13.0,0,0.0,0.0,1.0,1.0,4.0,5.0,2.0
1,24700.0,36 months,11.99,820.28,C,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,...,19.2,38.0,0,0.0,0.0,4.0,4.0,6.0,17.0,5.0
2,20000.0,60 months,10.78,432.66,B,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,...,56.2,18.0,0,0.0,0.0,5.0,0.0,1.0,4.0,2.0
3,10400.0,60 months,22.45,289.91,F,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,...,64.5,35.0,0,0.0,0.0,6.0,3.0,3.0,9.0,4.0
4,11950.0,36 months,13.44,405.18,C,Veterinary Tecnician,4 years,RENT,34000.0,Source Verified,...,68.4,6.0,0,0.0,0.0,0.0,0.0,0.0,2.0,2.0


# 범주형 컬럼 전처리

In [None]:
# Categorical feature columns to preprocess.
cat_feat = [
    'term',
    'grade',
    'emp_title',
    'emp_length',
    'home_ownership',
    'verification_status',
    'purpose',
]

# Pull the categorical columns into their own frame and convert every column
# from object to category dtype for lower memory use and faster processing.
# `astype` returns a new frame, so `acc_df_cleaned` itself is not modified.
df_cat = acc_df_cleaned[cat_feat].astype('category')

# Missing-value count per column.
missing_cnt_cat = df_cat.isnull().sum()

# Missing-value percentage per column. The denominator is df_cat's own row
# count; the previous code divided by len(acc_df), a name that is never
# defined in this notebook and therefore raised NameError on a fresh kernel.
missing_per_cat = missing_cnt_cat / len(df_cat) * 100

print(missing_per_cat)

In [None]:
# Drop rows whose job title is missing.
df_cat = df_cat.dropna(subset=['emp_title'])

# Lowercase + strip the titles and rebuild the categorical in ONE plain column
# assignment. The previous `.loc[:, 'emp_title'] = ...` tried to write the new
# strings into the existing Categorical in place; the lowercased values are not
# among its categories, which warns or raises depending on the pandas version.
df_cat['emp_title'] = (
    df_cat['emp_title']
    .astype(str)
    .str.lower()
    .str.strip()
    .astype('category')
)

# Align the full frame to the surviving rows (dropping NaN titles changed the index).
acc_df_cleaned_aligned = acc_df_cleaned.loc[df_cat.index].copy()
print(f"\nacc_df_cleaned_aligned 생성 완료. 형태: {acc_df_cleaned_aligned.shape}")
print(f"인덱스 일치 여부 (acc_df_cleaned_aligned == df_cat): {acc_df_cleaned_aligned.index.equals(df_cat.index)}")

# BERT 생성

In [None]:
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch import nn
from sklearn.utils.class_weight import compute_class_weight

# --- 0. BERT 학습용 데이터 준비: Lending Club emp_title -> SOC 대직업군 라벨 생성 ---

# 0-1. classify_lc_title_to_soc_major_group 함수 정의 (핵심 분류 규칙 직접 주입)
# Ordered (compiled pattern, SOC major group) rules. The FIRST matching rule
# wins, so order is significant. Compiled once at definition time instead of
# rebuilding the list on every call.
_SOC_MAJOR_GROUP_RULES = tuple(
    (re.compile(pattern), major_group_name)
    for pattern, major_group_name in (
        # 29-0000 Healthcare Occupations - specific rules are placed first
        (r'\b(?:nurse|rn|physician|doctor|therapist|pharmacist|dentist|veterinarian|emt|paramedic|optometrist|clinician|medical technician|dental assistant)\b', '29-0000 Healthcare Occupations'),

        # 11-0000 Management Occupations
        (r'\b(?:ceo|cfo|cto|chief|director|manager|supervisor|president|executive|owner|partner|vice president|vp|lead|head of|operations manager|project manager|hr manager)\b', '11-0000 Management Occupations'),

        # 15-0000 Computer and Mathematical Occupations
        (r'\b(?:software engineer|data scientist|web developer|mobile developer|devops|cloud engineer|cybersecurity|programmer|it specialist|analyst programmer|network engineer)\b', '15-0000 Computer and Mathematical Occupations'),

        # 17-0000 Architecture and Engineering Occupations
        (r'\b(?:architect|designer engineer|cad designer|structural engineer|civil engineer|mechanical engineer)\b', '17-0000 Architecture and Engineering Occupations'),

        # 13-0000 Business and Financial Operations Occupations
        # NOTE: 'marketing manager' can never match here because 'manager' in
        # the Management rule above always matches first.
        (r'\b(?:financial analyst|accountant|auditor|loan officer|underwriter|credit analyst|recruiter|business analyst|pr specialist|business development|marketing manager)\b', '13-0000 Business and Financial Operations Occupations'),

        # 25-0000 Education, Training, and Library Occupations
        (r'\b(?:teacher|professor|librarian|counselor|instructor|trainer|tutor)\b', '25-0000 Education, Training, and Library Occupations'),

        # 49-0000 Installation, Maintenance, and Repair Occupations
        (r'\b(?:electrician|welder|hvac|installer|maintenance technician|repairman|plumber|mechanic)\b', '49-0000 Installation, Maintenance, and Repair Occupations'),

        # 47-0000 Construction and Extraction Occupations
        (r'\b(?:construction|carpenter|roofer|foreman|laborer)\b', '47-0000 Construction and Extraction Occupations'),

        # 53-0000 Transportation and Material Moving Occupations
        (r'\b(?:driver|truck driver|delivery|logistics|mover|pilot|freight mover)\b', '53-0000 Transportation and Material Moving Occupations'),

        # 23-0000 Legal Occupations
        # Must come BEFORE the Office/Administrative rule: that rule matches the
        # bare word 'assistant', which previously made 'legal assistant'
        # unreachable and labelled such titles as office support.
        (r'\b(?:attorney|lawyer|paralegal|legal assistant)\b', '23-0000 Legal Occupations'),

        # 43-0000 Office and Administrative Support Occupations
        (r'\b(?:assistant|clerk|secretary|admin|receptionist|bookkeeper|coordinator|dispatcher|office support)\b', '43-0000 Office and Administrative Support Occupations'),

        # 41-0000 Sales and Related Occupations
        (r'\b(?:sales|retail|representative|merchandiser|telemarketer|broker|sales associate)\b', '41-0000 Sales and Related Occupations'),

        # 51-0000 Production Occupations
        (r'\b(?:operator|assembler|packer|manufacturing|production worker|machine operator|qa tester|quality control)\b', '51-0000 Production Occupations'),

        # 33-0000 Protective Service Occupations
        (r'\b(?:police|firefighter|guard|security|corrections officer)\b', '33-0000 Protective Service Occupations'),

        # 27-0000 Arts, Design, Entertainment, Sports, and Media Occupations
        (r'\b(?:designer|artist|writer|musician|actor|photographer|journalist|media specialist|graphic designer)\b', '27-0000 Arts, Design, Entertainment, Sports, and Media Occupations'),

        # 35-0000 Food Preparation and Serving Related Occupations
        (r'\b(?:chef|cook|server|bartender|food preparer|kitchen staff)\b', '35-0000 Food Preparation and Serving Related Occupations'),

        # 37-0000 Building and Grounds Cleaning and Maintenance Occupations
        (r'\b(?:janitor|cleaner|custodian|maintenance worker|grounds keeper)\b', '37-0000 Building and Grounds Cleaning and Maintenance Occupations'),

        # 39-0000 Personal Care and Service Occupations
        (r'\b(?:hairstylist|barber|cosmetologist|personal trainer|childcare|nanny)\b', '39-0000 Personal Care and Service Occupations'),

        # 21-0000 Community and Social Service Occupations
        # NOTE: 'counselor' is already claimed by the Education rule above.
        (r'\b(?:social worker|psychologist|counselor|caseworker|community service)\b', '21-0000 Community and Social Service Occupations'),

        # 45-0000 Farming, Fishing, and Forestry Occupations
        (r'\b(?:farmer|agricultural|fish|forest|rancher)\b', '45-0000 Farming, Fishing, and Forestry Occupations'),

        # Generic job words (substring match, deliberately placed near the end)
        (r'consultant', '13-0000 Business and Financial Operations Occupations'),
        (r'analyst', '15-0000 Computer and Mathematical Occupations'),
        (r'specialist', '13-0000 Business and Financial Operations Occupations'),
        (r'engineer', '17-0000 Architecture and Engineering Occupations'),
        (r'technician', '49-0000 Installation, Maintenance, and Repair Occupations'),

        # Other Non-Occupational / Default (always last)
        (r'\b(?:retired|student|intern|unemployed|disabled|homemaker|military|self-employed|volunteer)\b', '99-0000 Other Non-Occupational'),
        (r'\b(?:not provided|unknown|unclassified|none)\b', '99-0000 Unclassified Occupations'),
    )
)


def classify_lc_title_to_soc_major_group(emp_title_str):
    """Map a free-text job title to a SOC major-group label using keyword rules.

    The title is lowercased and stripped, then tested against
    ``_SOC_MAJOR_GROUP_RULES`` in order; the label of the first matching rule
    is returned.

    Args:
        emp_title_str: Raw job title. Anything that is not a non-blank string
            (e.g. ``None``, ``NaN``, ``""``) is treated as unclassifiable.

    Returns:
        str: A label such as ``'29-0000 Healthcare Occupations'``, or
        ``'99-0000 Unclassified Occupations'`` when the input is not a usable
        string or no rule matches.
    """
    if not isinstance(emp_title_str, str) or not emp_title_str.strip():
        return '99-0000 Unclassified Occupations'

    normalized_title = emp_title_str.lower().strip()

    for pattern, major_group_name in _SOC_MAJOR_GROUP_RULES:
        if pattern.search(normalized_title):
            return major_group_name

    return '99-0000 Unclassified Occupations'


# acc_df_cleaned_aligned is expected to exist from the preprocessing cell.
# Only the name lookup is guarded: previously the whole extraction sat inside
# the try, so a NameError raised anywhere in it (for example inside the
# classifier) was misreported as a missing DataFrame and the real frame was
# overwritten with the toy sample below.
try:
    acc_df_cleaned_aligned
except NameError:
    print("오류: 'acc_df_cleaned_aligned' DataFrame이 정의되지 않았습니다. 데이터를 로드하고 전처리해주세요.")
    # Sample data so the cell can still run (remove this fallback for real runs).
    data = {'emp_title': ['registered nurse', 'software developer', 'project manager', 'salesperson', 'retired']}
    acc_df_cleaned_aligned = pd.DataFrame(data)

# Training texts are the job titles; training labels are the rule-based SOC
# major groups derived from those same titles.
texts_for_bert_training = acc_df_cleaned_aligned['emp_title'].tolist()
labels_for_bert_training_text = acc_df_cleaned_aligned['emp_title'].apply(
    classify_lc_title_to_soc_major_group
).tolist()


# 라벨 인코딩 (문자열 라벨 -> 숫자 ID)
label_encoder_bert = LabelEncoder()
labels_for_bert_training = label_encoder_bert.fit_transform(labels_for_bert_training_text)
num_labels_bert = len(label_encoder_bert.classes_)

print(f"\nBERT 학습 데이터셋 준비 완료.")
print(f"총 {len(texts_for_bert_training)}개 직업명. 분류할 SOC 대직업군 개수: {num_labels_bert}")
print(f"생성된 SOC 대직업군 라벨: {label_encoder_bert.classes_}")

# --- PyTorch Dataset 및 DataLoader 정의 ---
class SOCDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping of field name (e.g. ``input_ids``) to a per-sample
            indexable container. Values may be nested Python lists or tensors
            (as produced by a tokenizer called with ``return_tensors='pt'``).
        labels: Indexable sequence of class ids, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning,
        so tensors are copied with ``clone().detach()`` and everything else
        goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a ``labels`` entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, defined by the label sequence."""
        return len(self.labels)

# --- BERT model setup ---
# Load the tokenizer and a sequence-classification head sized to the number of
# encoded SOC labels, then move the model to the GPU when one is available.
model_name = 'bert-base-uncased' # pretrained BERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels_bert)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_bert.to(device)


# --- Fold rare classes into the 'Unclassified Occupations' label ---
# A class with fewer than 2 samples is relabelled with the id of
# '99-0000 Unclassified Occupations'.
label_counts = pd.Series(labels_for_bert_training).value_counts()
low_count_labels = label_counts[label_counts < 2].index.tolist()
unclassified_label_id = label_encoder_bert.transform(['99-0000 Unclassified Occupations'])[0]

# Work on a copy so the original encoded labels stay intact.
labels_for_bert_training_merged = np.array(labels_for_bert_training)
rare_mask = np.isin(labels_for_bert_training_merged, low_count_labels)
labels_for_bert_training_merged[rare_mask] = unclassified_label_id

print("--- 라벨 통합 후 분포 ---")
merged_label_counts = pd.Series(labels_for_bert_training_merged).value_counts()
print(merged_label_counts)


# --- Class weights ---
# Balanced weights can only be computed for classes that still occur after the
# rare-label merge, but the loss needs exactly one weight per model output
# (num_labels_bert). Build a full-length vector: present classes get their
# balanced weight, merged-away classes keep a neutral 1.0 (they never appear
# as a target, so the value is not used). Previously the vector was shorter
# than num_labels_bert whenever a class had been merged, which makes
# CrossEntropyLoss fail with a weight-size mismatch.
present_class_ids = np.unique(labels_for_bert_training_merged)
present_class_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_class_ids,
    y=labels_for_bert_training_merged
)
class_weights = np.ones(num_labels_bert, dtype=np.float64)
class_weights[present_class_ids] = present_class_weights

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print("\n계산된 클래스 가중치:", class_weights_tensor)


# --- Train/validation split and dataset construction ---
# 90/10 stratified split on the merged labels, fixed seed for repeatability.
split_parts = train_test_split(
    texts_for_bert_training,
    labels_for_bert_training_merged,
    test_size=0.1,
    random_state=42,
    stratify=labels_for_bert_training_merged,
)
train_texts, val_texts, train_labels, val_labels = split_parts

print("\n데이터 분할 완료!")
print(f"훈련 데이터셋 크기: {len(train_texts)}")
print(f"검증 데이터셋 크기: {len(val_texts)}")

# Same tokenisation settings for both splits.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=64)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

train_dataset = SOCDataset(train_encodings, train_labels)
val_dataset = SOCDataset(val_encodings, val_labels)


# --- compute_metrics 함수 정의 (Trainer보다 먼저 정의되어야 함) ---
def compute_metrics(eval_pred):
    """Compute accuracy for a (logits, labels) evaluation pair.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``.

    Returns:
        dict: ``{"accuracy": <score>}``, using the arg-max over the last
        logits axis as the predicted class.
    """
    scores, gold_labels = eval_pred
    predicted_ids = np.asarray(scores).argmax(axis=-1)
    accuracy = accuracy_score(gold_labels, predicted_ids)
    return {"accuracy": accuracy}


# --- TrainingArguments definition ---
# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the best "accuracy" (as returned by compute_metrics) is reloaded at the end.
training_args = TrainingArguments(
    output_dir='./bert_lc_soc_classifier_results', # where checkpoints/results are written (trained on Lending Club data)
    num_train_epochs=3,                              # number of training epochs
    per_device_train_batch_size=16,                  # training batch size
    per_device_eval_batch_size=16,                   # evaluation batch size
    warmup_steps=500,                                # number of warm-up steps
    weight_decay=0.01,                               # weight decay
    logging_dir='./bert_lc_soc_classifier_logs',     # log directory
    logging_steps=100,                               # logging interval (steps)
    eval_strategy="epoch",                           # evaluate every epoch
    save_strategy="epoch",                           # save the model every epoch
    load_best_model_at_end=True,                     # load the best model when training finishes
    metric_for_best_model="accuracy",                # metric used to pick the best model
    fp16=torch.cuda.is_available(),                  # use float16 when a GPU is available
    report_to="none"
)


# --- WeightedTrainer 클래스 정의 및 Trainer 객체 생성 ---
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights."""

    def __init__(self, *args, class_weights=None, **kwargs):
        """Forward all arguments to ``Trainer`` and remember ``class_weights``.

        Args:
            class_weights: Weight tensor passed to ``nn.CrossEntropyLoss``;
                ``None`` means unweighted.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and return the class-weighted cross-entropy loss.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        model_out = model(**inputs)
        scores = model_out.get("logits")

        criterion = nn.CrossEntropyLoss(weight=self.class_weights)
        n_classes = self.model.config.num_labels
        weighted_loss = criterion(scores.view(-1, n_classes), targets.view(-1))

        if return_outputs:
            return weighted_loss, model_out
        return weighted_loss

# Build the weighted trainer from the model, arguments, datasets and metric
# function defined above, then fine-tune.
trainer = WeightedTrainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor # pass the computed class weights
)

print("\n--- BERT 모델 학습 시작 (Lending Club 데이터 사용) ---")
trainer.train() # run training
print("--- BERT 모델 학습 완료 ---")


# --- D. Apply the fine-tuned BERT model to the Lending Club titles ---
# The model now predicts a SOC major group for every job title.

lc_emp_titles_for_prediction = acc_df_cleaned_aligned['emp_title'].tolist()

predicted_soc_major_groups_lc = []
model_bert.eval()

# Dataset / DataLoader for inference; the labels are placeholders only.
lc_encodings_pred = tokenizer(
    lc_emp_titles_for_prediction,
    truncation=True,
    padding=True,
    max_length=64,
    return_tensors='pt',
)
dummy_labels = [0] * len(lc_emp_titles_for_prediction)
lc_dataset_pred = SOCDataset(lc_encodings_pred, labels=dummy_labels)
lc_loader_pred = DataLoader(lc_dataset_pred, batch_size=64, shuffle=False)

with torch.no_grad():
    for batch in lc_loader_pred:
        # Feed everything except the placeholder 'labels' entry to the model.
        inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != 'labels'}
        batch_label_ids = model_bert(**inputs).logits.argmax(dim=-1).cpu().numpy()
        predicted_soc_major_groups_lc.extend(label_encoder_bert.inverse_transform(batch_label_ids))

acc_df_cleaned_aligned['Predicted_SOC_Major_Group'] = predicted_soc_major_groups_lc

print("\n--- Lending Club 직업명에 대한 SOC Major Group 예측 완료 ---")
print(acc_df_cleaned_aligned[['emp_title', 'Predicted_SOC_Major_Group']].head())


# --- Detailed classification report ---
from sklearn.metrics import classification_report

# Ground truth is the merged rule-based labels for the FULL dataset, so this
# report also covers the rows the model was trained on.
true_labels_lc = labels_for_bert_training_merged
predicted_labels_lc = label_encoder_bert.transform(predicted_soc_major_groups_lc)

# Pass the complete id range explicitly. With `target_names` alone,
# classification_report raises ValueError as soon as any encoded class is
# absent from both y_true and y_pred (e.g. a class merged away as rare).
all_label_ids = np.arange(len(label_encoder_bert.classes_))

print("\n--- 전체 데이터셋에 대한 상세 분류 보고서 ---")
print(classification_report(
    true_labels_lc,
    predicted_labels_lc,
    labels=all_label_ids,
    target_names=label_encoder_bert.classes_,
    zero_division=0,  # classes with no samples/predictions report 0 instead of warning
))


# --- E. Map the predicted SOC major group to a 'job stability' level ---
soc_major_group_to_stability_map = {
    '11-0000 Management Occupations': 'Relatively Stable',
    '13-0000 Business and Financial Operations Occupations': 'Relatively Stable',
    '15-0000 Computer and Mathematical Occupations': 'Relatively Stable',
    '17-0000 Architecture and Engineering Occupations': 'Relatively Stable',
    '21-0000 Community and Social Service Occupations': 'Relatively Stable',
    '23-0000 Legal Occupations': 'Relatively Stable',
    '25-0000 Education, Training, and Library Occupations': 'Very Stable',
    '27-0000 Arts, Design, Entertainment, Sports, and Media Occupations': 'Variable Stability',
    '29-0000 Healthcare Occupations': 'Very Stable',
    '33-0000 Protective Service Occupations': 'Very Stable',
    '35-0000 Food Preparation and Serving Related Occupations': 'Variable Stability',
    '37-0000 Building and Grounds Cleaning and Maintenance Occupations': 'Variable Stability',
    '39-0000 Personal Care and Service Occupations': 'Variable Stability',
    '41-0000 Sales and Related Occupations': 'Variable Stability',
    '43-0000 Office and Administrative Support Occupations': 'Variable Stability',
    '45-0000 Farming, Fishing, and Forestry Occupations': 'Variable Stability',
    '47-0000 Construction and Extraction Occupations': 'Variable Stability',
    '49-0000 Installation, Maintenance, and Repair Occupations': 'Very Stable',
    '51-0000 Production Occupations': 'Variable Stability',
    '53-0000 Transportation and Material Moving Occupations': 'Variable Stability',
    '99-0000 Other Non-Occupational': 'Low Stability',
    '99-0000 Unclassified Occupations': 'Low Stability',
}

# Map and fill in one expression, then assign the result back. The previous
# `df[col].fillna(..., inplace=True)` mutated a temporary column selection,
# which pandas flags as chained assignment and which has no effect under
# Copy-on-Write.
acc_df_cleaned_aligned['Stability_Level'] = (
    acc_df_cleaned_aligned['Predicted_SOC_Major_Group']
    .map(soc_major_group_to_stability_map)
    .fillna('Unknown Stability (Mapping Needed)')
)

# Ordinal encoding of the stability level; -1 marks groups missing from the map.
stability_string_to_numeric_map = {
    'Very Stable': 4, 'Relatively Stable': 3,
    'Variable Stability': 2, 'Low Stability': 1,
    'Unknown Stability (Mapping Needed)': -1
}
acc_df_cleaned_aligned['Stability_Level_Numeric'] = acc_df_cleaned_aligned['Stability_Level'].map(
    stability_string_to_numeric_map
)


print("\n--- Lending Club 직업명에 대한 최종 '직업 안정성' 결과 (문자열 및 숫자) ---")
print(acc_df_cleaned_aligned[['emp_title', 'Predicted_SOC_Major_Group', 'Stability_Level', 'Stability_Level_Numeric']].head())

print("\n--- Stability_Level 분포 ---")
print(acc_df_cleaned_aligned['Stability_Level'].value_counts(dropna=False))
print("\n--- Stability_Level_Numeric 분포 ---")
print(acc_df_cleaned_aligned['Stability_Level_Numeric'].value_counts(dropna=False))

# BERT 불러오기

In [None]:
# Directory the fine-tuned model lives in.
save_path = "/content/drive/My Drive/Colab Notebooks/my_fine_tuned_bert"

# Save only the tokenizer into that directory.
# Requires the `tokenizer` variable that was used with the trainer.
try:
    if 'tokenizer' not in locals():
        print("오류: 'tokenizer' 변수가 메모리에 없습니다. 모델 로딩 코드를 다시 실행해주세요.")
    else:
        tokenizer.save_pretrained(save_path)
        print("토크나이저 파일이 성공적으로 저장되었습니다.")
        print(f"'{save_path}' 폴더를 확인해주세요.")
except Exception as err:
    print(f"토크나이저 저장 중 오류 발생: {err}")