## EDA

In [350]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("startup_data.csv")
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923 entries, 0 to 922
Data columns (total 49 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                923 non-null    int64  
 1   state_code                923 non-null    object 
 2   latitude                  923 non-null    float64
 3   longitude                 923 non-null    float64
 4   zip_code                  923 non-null    object 
 5   id                        923 non-null    object 
 6   city                      923 non-null    object 
 7   Unnamed: 6                430 non-null    object 
 8   name                      923 non-null    object 
 9   labels                    923 non-null    int64  
 10  founded_at                923 non-null    object 
 11  closed_at                 335 non-null    object 
 12  first_funding_at          923 non-null    object 
 13  last_funding_at           923 non-null    object 
 14  age_first_

In [None]:
import pandas as pd
import numpy as np

def handle_missing_values(df):
    # 결측치 현황 확인
    missing_info = df.isnull().sum()
    print("결측치 현황:")
    print(missing_info[missing_info > 0])
    print("\n")
    
    # 마일스톤 관련 결측치 처리
    df['age_first_milestone_year'] = df['age_first_milestone_year'].fillna(0)
    df['age_last_milestone_year'] = df['age_last_milestone_year'].fillna(0)
    
    # 결측치 처리 후 확인
    missing_after = df.isnull().sum()
    print("결측치 처리 후 현황:")
    print(missing_after[missing_after > 0])
    
    return df


결측치 현황:
Unnamed: 6                  493
closed_at                   588
age_first_milestone_year    152
age_last_milestone_year     152
state_code.1                  1
dtype: int64


결측치 처리 후 현황:
Unnamed: 6      493
closed_at       588
state_code.1      1
dtype: int64


# XGBOOST

In [None]:
from sklearn.model_selection import train_test_split

main_columns = [
    # 위치 관련
    'is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate',
    
    # 기업 기본 정보
    'founded_at', #'closed_at',
    
    # 펀딩 관련 정보
    'first_funding_at', 'last_funding_at',
    'age_first_funding_year', 'age_last_funding_year',
    'funding_rounds', 'funding_total_usd',
    'avg_participants',
    
    # 투자 유형
    'has_VC', 'has_angel', 'has_roundA', 'has_roundB', 
    'has_roundC', 'has_roundD',
    
    # 마일스톤 관련
    'milestones', 'age_first_milestone_year', 
    'age_last_milestone_year',
    
    # 비즈니스 관계
    'relationships',
    
    # 산업 분류
    'is_software', 'is_web', 'is_mobile', 'is_enterprise',
    'is_advertising', 'is_gamesvideo', 'is_ecommerce',
    'is_biotech', 'is_consulting', 'is_othercategory'
]

df = handle_missing_values(df.copy())

target_columns = ['is_top500', 'labels']

date_cols = ['founded_at', 'first_funding_at', 'last_funding_at'] # close_at은 위에서 제거됨

for col in date_cols:
    df[col] = pd.to_datetime(df[col]).astype(np.int64) // 10**9

X = df[main_columns].copy().reset_index(drop=True)
y = df[target_columns].copy().reset_index(drop=True)

target = 'labels' 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

def plot_model_evaluation(y_test, y_pred, y_pred_proba, features, importance):

    if y_pred_proba.ndim == 2:
        y_pred_proba = y_pred_proba[:, 1]
    
    
    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    
    plt.plot(fpr, tpr, color='darkorange', lw=2, 
             label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.show()

    
    
    plt.figure(figsize=(8, 6))
    
    importance_df = pd.DataFrame({
        'feature': list(features),
        'importance': list(importance)
    })
    importance_df = importance_df.sort_values('importance', ascending=True)
    
    plt.barh(range(len(importance_df)), importance_df['importance'])
    plt.yticks(range(len(importance_df)), importance_df['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Feature Importance')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    plt.figure(figsize=(6, 5))
    cm = confusion_matrix(y_test, y_pred)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    plt.style.use('default')  # 기본 스타일로 초기화
    plt.rcParams['figure.facecolor'] = 'white'
    plt.rcParams['axes.facecolor'] = 'white'
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("AUC :", roc_auc)

    print("\n상위 중요 특성:")
    print(importance)

In [1]:
from xgboost import XGBClassifier

X_train, X_test, y_train, y_test = train_test_split(
    X, y[target], 
    test_size=0.2,
    random_state=42
)

xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb.feature_importances_
}).sort_values('importance', ascending=False)

plot_model_evaluation(
    y_test=y_test,
    y_pred=y_pred,
    y_pred_proba=xgb.predict_proba(X_test),  
    features=importance['feature'],          
    importance=importance['importance']      
)

NameError: name 'train_test_split' is not defined

In [None]:
# 시간순 정렬

X_sort = X.sort_values('founded_at')
y = y.reindex(X_sort.index)  

split_idx = int(len(X_sort) * 0.8)
X_train = X_sort[:split_idx]
X_test = X_sort[split_idx:]
y_train = y[target][:split_idx]  
y_test = y[target][split_idx:]   


In [None]:
from xgboost import XGBClassifier

xgb_sort = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_sort.fit(X_train, y_train)
y_pred = xgb_sort.predict(X_test)

importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_sort.feature_importances_
}).sort_values('importance', ascending=False)

plot_model_evaluation(
    y_test=y_test,
    y_pred=y_pred,
    y_pred_proba=xgb_sort.predict_proba(X_test),  
    features=importance['feature'],          
    importance=importance['importance']      
)

In [372]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# SMOTE는 훈련 데이터에만 적용
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_balanced))

Before SMOTE: Counter({1: 495, 0: 243})
After SMOTE: Counter({0: 495, 1: 495})


In [None]:
from xgboost import XGBClassifier

X_train_balanced = X_train_balanced.sort_values('founded_at').reset_index(drop=True)
y_train_balanced = y_train_balanced[target].reindex(X.index)

xgb_smote = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_smote.fit(X_train_balanced, y_train_balanced)
y_pred = xgb_smote.predict(X_test)

importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_smote.feature_importances_
}).sort_values('importance', ascending=False)

plot_model_evaluation(
    y_test=y_test,
    y_pred=y_pred,
    y_pred_proba=xgb.predict_proba(X_test),  
    features=importance['feature'],          
    importance=importance['importance']      
)

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score

def evaluate_model_with_time_cv(model, X, y, n_splits=5):
   # 시간순 정렬
   X = X.sort_values('founded_at').reset_index(drop=True)
   y = y.reindex(X.index)
   
   # TimeSeriesSplit 
   tscv = TimeSeriesSplit(n_splits=n_splits)
   scores = []
   
   for train_idx, val_idx in tscv.split(X):
       # Split
       X_train_fold = X.iloc[train_idx]
       X_val_fold = X.iloc[val_idx]
       y_train_fold = y.iloc[train_idx]
       y_val_fold = y.iloc[val_idx]
       
       # Train & Predict
       model.fit(X_train_fold, y_train_fold)
       y_pred = model.predict(X_val_fold)
       
       # Score
       scores.append(f1_score(y_val_fold, y_pred))
   
   print(f"F1 scores: {scores}")
   print(f"Mean F1: {np.mean(scores):.3f} (+/- {np.std(scores) * 2:.3f})")
   
   return scores

In [None]:
# 여러 모델에 대해 평가
scores_xgb1 = evaluate_model_with_time_cv(xgb, X, y[target])
scores_xgb2 = evaluate_model_with_time_cv(xgb_sort, X, y[target])
scores_xgb3 = evaluate_model_with_time_cv(xgb_smote, X, y[target])