In [490]:
import pandas as pd
import numpy as np

In [491]:
# Load the raw training and test splits from the local dataset directory.
train = pd.read_csv("/Users/mingu/Desktop/CODING/Kaggle Project 1/DATASET/train.csv")
test = pd.read_csv("/Users/mingu/Desktop/CODING/Kaggle Project 1/DATASET/test.csv")

In [492]:
# Remove the 'Unnamed: 0' column from both splits, in place.
for frame in (train, test):
    frame.drop('Unnamed: 0', axis=1, inplace=True)

In [493]:
### Species / Farm.Name
# Remove the Species and Farm.Name columns from both splits, in place.
for frame in (train, test):
    frame.drop('Species', axis=1, inplace=True)
    frame.drop('Farm.Name', axis=1, inplace=True)


#### Lot.Number
import re
import math

def map_country_of_origin_preprocessing(value):
    """Bucket a raw ``Lot.Number`` entry by the textual format it follows.

    Returns:
        -2 when the value is missing, 0 / 1 / 2 for the three recognised
        format families (checked in that order), and -1 for anything else.
    """
    if pd.isna(value):
        return -2

    text = str(value)

    in_family_0 = (
        re.match(r'^\d+/\d+[A-Za-z]$', text)
        or text.startswith('431')
        or re.match(r'^\d{3}/\d{2}$', text)
        or 'Lot' in text
    )
    if in_family_0:
        return 0

    in_family_1 = re.match(r'^3-\d{2}-\d{4}$', text) or re.match(r'^\d{2}-\d{4}$', text)
    if in_family_1:
        return 1

    in_family_2 = re.match(r'^11/\d+/\d+$', text) or re.match(r'^11-\d+-\d+$', text)
    if in_family_2:
        return 2

    # Fallback bucket for formats that match none of the rules above.
    return -1

# Derive the format-based feature from Lot.Number, then drop the raw column.
for frame in (train, test):
    frame['Mapped.Lot.Number.Country.of.Origin'] = frame['Lot.Number'].apply(
        map_country_of_origin_preprocessing
    )
    frame.drop('Lot.Number', axis=1, inplace=True)


### Altitude

def to_meters(value):
    """Convert a free-form altitude entry to a number of metres.

    Args:
        value: A raw ``Altitude`` cell. May be missing, a plain number, or a
            string such as ``"1200-1400 m.s.n.m."`` or ``"4500 ft"``.

    Returns:
        A float, or ``np.nan`` when the value is missing, is not convertible,
        or does not contain exactly one or two numbers. Two numbers are
        treated as a range and averaged before the unit conversion.
    """
    if pd.isnull(value):
        return np.nan

    # Numeric cells have no unit text; previously these crashed on
    # ``value.isdigit()`` because that check ran before the type check.
    if not isinstance(value, str):
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    # Pure digit strings are returned unchanged.
    if value.isdigit():
        return float(value)

    # Numbers are extracted before dots are stripped so decimals survive.
    numbers = re.findall(r'\d+\.?\d*', value)
    if len(numbers) not in (1, 2):
        return np.nan
    magnitude = sum(float(number) for number in numbers) / len(numbers)

    # Strip dots so abbreviations like "m.s.n.m." collapse to "msnm".
    unit_text = value.replace('.', '').lower()

    # Ordered (markers, factor) rules; the first matching rule wins.
    # 'msn' also covers 'msnm', and 'psn' also covers 'psnm'.
    # NOTE(review): the 1852 factor for 'psn' is kept from the original; it is
    # the length of a nautical mile in metres, while 'psnm' presumably means
    # feet above sea level -- confirm the intended conversion.
    unit_rules = (
        (('ft', 'pies', 'feet'), 0.3048),
        (('mts', 'metros', 'msn'), 1),
        (('km',), 1000),
        (('miles',), 1609.34),
        (('psn',), 1852),
    )
    for markers, factor in unit_rules:
        if any(marker in unit_text for marker in markers):
            return magnitude * factor

    # No recognised unit: the number is used as it is.
    return magnitude

# Parse the raw Altitude text into metres for both splits.
train['converted_altitude'] = train['Altitude'].apply(to_meters)
test['converted_altitude'] = test['Altitude'].apply(to_meters)

# Impute missing altitudes with the training mean for both splits.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection, which does not update the frame under pandas Copy-on-Write.
mean_altitude = train['converted_altitude'].mean()
train['converted_altitude'] = train['converted_altitude'].fillna(mean_altitude)
test['converted_altitude'] = test['converted_altitude'].fillna(mean_altitude)

train.drop('Altitude', axis=1, inplace=True)
test.drop('Altitude', axis=1, inplace=True)

### Bag.weight
# Normalise a single Bag.Weight entry to kilograms.
def convert_to_kg(value):
    """Return a ``Bag.Weight`` entry as a weight in kilograms.

    Missing values are returned unchanged. Entries containing ``kg`` have the
    unit removed, entries ending in ``lbs`` are converted from pounds, and
    anything else is treated as a unit-less kilogram figure.
    """
    if pd.isna(value):
        return value

    text = str(value)

    if 'kg' in text:
        return float(value.replace('kg', '').strip())

    if not text.endswith('lbs'):
        # No unit present: interpret the number as kilograms.
        return float(value)

    pounds = float(value.replace('lbs', '').strip())
    return pounds * 0.453592  # 1 lbs = 0.453592 kg

# Convert every Bag.Weight entry to kilograms in both splits.
for frame in (train, test):
    frame['Bag.Weight'] = frame['Bag.Weight'].apply(convert_to_kg)

### Harvest.Year
# Extract the four-digit years mentioned in a Harvest.Year entry.
def extract_year(value):
    """Return every 19xx/20xx year found in ``value``.

    Args:
        value: A raw ``Harvest.Year`` cell (text, number or missing).

    Returns:
        A list of ints, one per standalone four-digit year in order of
        appearance, or ``np.nan`` when the value is missing or has no year.
    """
    if pd.isna(value):
        return np.nan

    # finditer is used instead of findall because the pattern has capture
    # groups, so findall would return (century, rest) tuples.
    years = [
        int(match.group())
        for match in re.finditer(r'\b(19|20)(\d{2})\b', str(value))
    ]

    # The previous truthiness test on the finditer iterator was always true,
    # so an empty list leaked out here instead of NaN.
    return years if years else np.nan

# Replace each Harvest.Year entry with the result of extract_year.
for frame in (train, test):
    frame['Harvest.Year'] = frame['Harvest.Year'].apply(extract_year)

# Collapse a list of extracted years into one value.
def handle_multiple_years(value):
    """Reduce the output of the year extraction to a single number.

    Non-list inputs (including NaN) pass through untouched. An empty list
    becomes NaN, a one-element list yields that element, and a longer list
    yields the mean of its entries.
    """
    if isinstance(value, list):
        if not value:
            return np.nan
        if len(value) == 1:
            return value[0]
        return np.mean(value)
    return value

# Reduce each extracted year list to a single numeric year.
train['Harvest.Year'] = train['Harvest.Year'].apply(handle_multiple_years)
test['Harvest.Year'] = test['Harvest.Year'].apply(handle_multiple_years)

# Impute missing years with the most frequent training year (the mode,
# despite the variable name) for both splits.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection, which does not update the frame under pandas Copy-on-Write.
max_harvest_year = train['Harvest.Year'].mode()[0]
train['Harvest.Year'] = train['Harvest.Year'].fillna(max_harvest_year)
test['Harvest.Year'] = test['Harvest.Year'].fillna(max_harvest_year)

### Grading.Date 
# Split a date string into its year, month and day components.
def parse_date(date_string):
    """Parse ``date_string`` with ``pd.to_datetime`` and return its parts.

    Args:
        date_string: A date in any format ``pd.to_datetime`` understands.

    Returns:
        A ``(year, month, day)`` tuple. When parsing fails, or the input is
        ``None``, the result is ``(None, None, None)``. A value that parses to
        ``NaT`` yields whatever ``NaT.year`` / ``.month`` / ``.day`` give.
    """
    try:
        date = pd.to_datetime(date_string)
        return date.year, date.month, date.day
    except (ValueError, TypeError, AttributeError, OverflowError):
        # Only parsing-related failures are handled, so interrupts such as
        # KeyboardInterrupt are no longer swallowed by a bare ``except``.
        # AttributeError covers ``pd.to_datetime(None)`` returning ``None``.
        return None, None, None

# Split Grading.Date into year, month and day columns. parse_date returns one
# 3-tuple per row; zip(*...) regroups them into three column-length tuples.
train['Grading.Year'], train['Grading.Month'], train['Grading.Day'] = zip(*train['Grading.Date'].apply(parse_date))
test['Grading.Year'], test['Grading.Month'], test['Grading.Day'] = zip(*test['Grading.Date'].apply(parse_date))

# Encode month and day as negated sine/cosine pairs.
def preprocess_seasonality(month, day):
    """Return cyclic encodings for a calendar month and day.

    Three angles are built: the month scaled by ``(month - 1) / 12``, the day
    scaled by ``(day - 1) / 31``, and a combined date angle from
    ``(month + day / 31) / 12``. For each angle the negated sine and negated
    cosine are emitted.

    Returns:
        Tuple ``(month_sin, month_cos, day_sin, day_cos, date_sin, date_cos)``.
    """
    month_angle = 2 * np.pi * ((month - 1) / 12)
    day_angle = 2 * np.pi * ((day - 1) / 31)
    date_angle = 2 * np.pi * (month+day/31)/12

    encoded = []
    for angle in (month_angle, day_angle, date_angle):
        encoded.append(-np.sin(angle))
        encoded.append(-np.cos(angle))
    return tuple(encoded)

# Build the six cyclic features from Grading.Month / Grading.Day. The row-wise
# apply yields one 6-tuple per row; zip(*...) regroups them per feature.
train['Grading.Month_Sin'], train['Grading.Month_Cos'], train['Grading.Day_Sin'], train['Grading.Day_Cos'], train['Grading.Date_Sin'], train['Grading.Date_Cos'] = zip(*train.apply(lambda x: preprocess_seasonality(x['Grading.Month'], x['Grading.Day']), axis=1))
test['Grading.Month_Sin'], test['Grading.Month_Cos'], test['Grading.Day_Sin'], test['Grading.Day_Cos'], test['Grading.Date_Sin'], test['Grading.Date_Cos'] = zip(*test.apply(lambda x: preprocess_seasonality(x['Grading.Month'], x['Grading.Day']), axis=1))

# The raw date text is dropped once its parts and encodings exist.
train.drop('Grading.Date', axis = 1, inplace = True)
test.drop('Grading.Date', axis = 1, inplace = True)

### Variety
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the training varieties only.
label_encoder = LabelEncoder()
train['Variety'] = label_encoder.fit_transform(train['Variety'])

# Encode the test varieties with the classes learned from train. Calling
# transform() on a label that was not seen during fit raises ValueError, so
# unseen varieties are set to -1, the same sentinel the generic object-column
# encoding in this notebook uses.
known_variety = test['Variety'].isin(label_encoder.classes_)
encoded_variety = pd.Series(-1, index=test.index)
encoded_variety[known_variety] = label_encoder.transform(test.loc[known_variety, 'Variety'])
test['Variety'] = encoded_variety

### Expiration
# Split Expiration into year, month and day columns (one 3-tuple per row from
# parse_date, regrouped per component by zip(*...)).
train['Expiration.Year'], train['Expiration.Month'], train['Expiration.Day'] = zip(*train['Expiration'].apply(parse_date))
test['Expiration.Year'], test['Expiration.Month'], test['Expiration.Day'] = zip(*test['Expiration'].apply(parse_date))

# Build the six cyclic features from Expiration.Month / Expiration.Day.
train['Expiration.Month_Sin'], train['Expiration.Month_Cos'], train['Expiration.Day_Sin'], train['Expiration.Day_Cos'], train['Expiration.Date_Sin'], train['Expiration.Date_Cos'] = zip(*train.apply(lambda x: preprocess_seasonality(x['Expiration.Month'], x['Expiration.Day']), axis=1))
test['Expiration.Month_Sin'], test['Expiration.Month_Cos'], test['Expiration.Day_Sin'], test['Expiration.Day_Cos'], test['Expiration.Date_Sin'], test['Expiration.Date_Cos'] = zip(*test.apply(lambda x: preprocess_seasonality(x['Expiration.Month'], x['Expiration.Day']), axis=1))

# The raw date text is dropped once its parts and encodings exist.
train.drop('Expiration', axis = 1, inplace = True)
test.drop('Expiration', axis = 1, inplace = True)

In [494]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refit for every remaining text column.
label_encoder = LabelEncoder()

# Label-encode each object-dtype column of train, then map test with the
# classes learned from train.
for column in train.columns:
    if train[column].dtype != 'object':
        continue

    print(column)

    # Learn the classes from train and encode train in one step.
    train[column] = label_encoder.fit_transform(train[column])

    def encode_or_unseen(s):
        # Labels missing from the fitted classes are encoded as -1.
        if s in label_encoder.classes_:
            return label_encoder.transform([s])[0]
        return -1

    test[column] = test[column].map(encode_or_unseen)

Processing.Method
Color


In [495]:
from sklearn.preprocessing import StandardScaler

# One scaler instance is refit for every column in turn.
scaler = StandardScaler()

# Not referenced by the loop below, which scales every non-target column.
scale_column = ['Number.of.Bags', 'Bag.Weight', 'Harvest.Year', 'Variety', 'Processing.Method']

for column in train.columns:
    if column == 'Country.of.Origin':
        continue

    # Fit on the train column and standardise it.
    train_values = train[column].values.reshape(-1, 1)
    train[column] = scaler.fit_transform(train_values)

    # Reuse the statistics estimated on train for the matching test column.
    test_values = test[column].values.reshape(-1, 1)
    test[column] = scaler.transform(test_values)

In [496]:
# Replace any remaining missing values with -1 in both splits, in place.
for frame in (train, test):
    frame.fillna(-1, inplace=True)

In [497]:
train

Unnamed: 0,Number.of.Bags,Bag.Weight,Harvest.Year,Variety,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,...,Grading.Date_Cos,Expiration.Year,Expiration.Month,Expiration.Day,Expiration.Month_Sin,Expiration.Month_Cos,Expiration.Day_Sin,Expiration.Day_Cos,Expiration.Date_Sin,Expiration.Date_Cos
0,0.558407,-0.549787,-0.743565,-1.048871,0.443524,-1.218672,-4.374557,-3.558456,-0.920381,-0.608264,...,-1.263458,-0.751825,-1.719060,-0.441136,-0.031414,-1.622870,-1.078131,0.891512,-1.083114,-1.279230
1,0.758483,0.453644,-0.743565,-0.572481,1.068734,-0.696480,-0.502131,-0.023442,0.227939,-0.912982,...,0.649597,-1.285585,0.201341,1.357222,-0.031414,1.172992,1.019472,-0.990739,0.970983,0.671765
2,-1.242277,0.439102,2.006026,-1.048871,0.443524,-0.141650,-0.223316,0.250827,0.819498,0.039262,...,-0.353847,1.916972,0.841475,-0.890725,1.242790,0.474027,-1.351745,-0.213086,1.177659,-0.352849
3,0.998575,-0.535244,0.356272,-0.810676,-2.057312,0.413179,0.551169,1.012683,-0.050442,-0.265456,...,-0.602792,0.849453,-1.398993,-0.328738,-0.767077,-1.435583,-0.885192,1.103672,-1.479327,-0.601700
4,-1.434350,-0.531348,-0.193646,1.571270,0.443524,-0.141650,-0.223316,0.250827,-0.328822,0.686788,...,-0.454523,0.315694,0.841475,-0.441136,1.242790,0.474027,-1.078131,0.891512,1.162148,-0.453488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,0.758483,-0.535244,1.181149,0.380297,0.443524,0.935372,0.799004,1.500271,0.227939,0.686788,...,1.356392,0.849453,-0.118726,-1.340314,-0.767077,0.985705,-0.774324,-1.181407,-0.083711,1.358683
581,0.158255,0.439102,2.006026,0.142102,0.443524,0.935372,1.046839,0.738415,1.411057,-0.265456,...,1.119247,1.916972,0.201341,-1.452712,-0.031414,1.172992,-0.529471,-1.323464,0.566132,1.119689
582,0.558407,-0.549787,-0.743565,-1.048871,0.443524,-0.141650,0.551169,0.494621,-0.050442,0.343980,...,0.997035,-0.751825,0.201341,-0.553533,-0.031414,1.172992,-1.226263,0.643098,0.713870,0.997523
583,0.558407,0.453644,-0.743565,1.571270,1.068734,0.119446,0.551169,0.250827,-1.198762,-1.217700,...,-0.699150,-1.285585,-1.398993,-0.778328,-0.767077,-1.435583,-1.365929,0.079257,-1.448746,-0.698022


In [498]:
train.corr()['Country.of.Origin']

Number.of.Bags                        -0.387300
Bag.Weight                            -0.234214
Harvest.Year                          -0.312995
Variety                                0.121877
Processing.Method                      0.399663
Aroma                                 -0.170782
Flavor                                -0.261109
Aftertaste                            -0.310908
Acidity                               -0.114522
Body                                  -0.290722
Balance                               -0.316077
Uniformity                            -0.100379
Clean.Cup                             -0.039066
Sweetness                              0.000975
Cupper.Points                         -0.302856
Total.Cup.Points                      -0.258803
Moisture                               0.331794
Category.One.Defects                   0.132010
Quakers                               -0.138671
Color                                  0.185238
Category.Two.Defects                   0

In [499]:
# Separate the target column from the feature matrix.
y = train['Country.of.Origin']
X = train.drop(columns='Country.of.Origin')

In [502]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from SVM import MulticlassSVMClassifier
import numpy as np

def multiclass_svm_grid_search(X, y, X_test, param_grid, n_splits=5, random_seed=3):
    """Exhaustively search SVM hyper-parameters with stratified k-fold CV.

    For every combination of ``n_iters``, ``lr`` and ``lambda_param`` in
    ``param_grid``, one classifier is trained per fold on the SMOTE-resampled
    training part, scored on the untouched validation part, and used to
    predict ``X_test``. The mean fold accuracy is printed for each combination.

    Args:
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Target Series aligned with ``X``.
        X_test: Test features passed directly to ``predict``.
        param_grid: Dict with the keys ``'n_iters'``, ``'lr'``, ``'lambda_param'``.
        n_splits: Number of stratified folds.
        random_seed: Seed for NumPy, the fold shuffling and SMOTE.

    Returns:
        ``(best_params, best_score, best_predict)`` where ``best_predict`` is
        the list of per-fold test predictions of the best combination.
    """
    # Seed NumPy's global generator for reproducibility.
    np.random.seed(random_seed)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

    best_score, best_params, best_predict = -np.inf, None, []

    for n_iters in param_grid['n_iters']:
        for lr in param_grid['lr']:
            for lambda_param in param_grid['lambda_param']:
                params = {'n_iters': n_iters, 'lr': lr, 'lambda_param': lambda_param}

                # The same instance is refit on each fold.
                # NOTE(review): whether fit() restarts from fresh weights
                # depends on MulticlassSVMClassifier -- confirm.
                svm = MulticlassSVMClassifier(**params)

                fold_scores = []
                fold_test_predictions = []
                for fit_idx, holdout_idx in skf.split(X, y):
                    # Oversample only the training part of the fold.
                    resampler = SMOTE(random_state=random_seed)
                    X_fit, y_fit = resampler.fit_resample(X.iloc[fit_idx], y.iloc[fit_idx])

                    svm.fit(X_fit.values, y_fit.values)

                    X_holdout = X.iloc[holdout_idx].values
                    y_holdout = y.iloc[holdout_idx].values
                    fold_scores.append(svm.get_accuracy(y_holdout, svm.predict(X_holdout)))

                    # Keep this fold's predictions for the test set.
                    fold_test_predictions.append(svm.predict(X_test))

                avg_score = np.mean(fold_scores)
                print(params,':',avg_score)

                # Strictly better only: ties keep the earlier combination.
                if avg_score > best_score:
                    best_score = avg_score
                    best_params = dict(params)
                    best_predict = list(fold_test_predictions)

    return best_params, best_score, best_predict

# Example usage:

# Hyper-parameter values explored by the grid search.
param_grid = {
    'n_iters': list(range(100, 1100, 100)),
    'lr': [0.00001, 0.0001, 0.001, 0.01],
    'lambda_param': [0.00001, 0.0001, 0.001, 0.01],
}

# Run the search on the training features/target and predict the test rows.
best_params, best_score, best_predict = multiclass_svm_grid_search(X, y, test.values, param_grid)

for label, result in (
    ("Best Parameters:", best_params),
    ("Best Score:", best_score),
    ("Best Predict:", best_predict),
):
    print(label, result)


{'n_iters': 100, 'lr': 1e-05, 'lambda_param': 1e-05} : 0.6273504273504273
{'n_iters': 100, 'lr': 1e-05, 'lambda_param': 0.0001} : 0.6273504273504273
{'n_iters': 100, 'lr': 1e-05, 'lambda_param': 0.001} : 0.6273504273504273
{'n_iters': 100, 'lr': 1e-05, 'lambda_param': 0.01} : 0.6273504273504273
{'n_iters': 100, 'lr': 0.0001, 'lambda_param': 1e-05} : 0.6991452991452991
{'n_iters': 100, 'lr': 0.0001, 'lambda_param': 0.0001} : 0.6991452991452991
{'n_iters': 100, 'lr': 0.0001, 'lambda_param': 0.001} : 0.6991452991452991
{'n_iters': 100, 'lr': 0.0001, 'lambda_param': 0.01} : 0.6923076923076923
{'n_iters': 100, 'lr': 0.001, 'lambda_param': 1e-05} : 0.6683760683760684
{'n_iters': 100, 'lr': 0.001, 'lambda_param': 0.0001} : 0.6683760683760684
{'n_iters': 100, 'lr': 0.001, 'lambda_param': 0.001} : 0.6666666666666667
{'n_iters': 100, 'lr': 0.001, 'lambda_param': 0.01} : 0.6341880341880343
{'n_iters': 100, 'lr': 0.01, 'lambda_param': 1e-05} : 0.6735042735042736
{'n_iters': 100, 'lr': 0.01, 'lambd

In [448]:
# Stack the per-fold test predictions; axis 0 indexes the folds.
prediction_test = np.array(best_predict)


def _most_common_label(votes):
    # bincount counts each label; argmax picks the smallest label on ties.
    return np.bincount(votes).argmax()


# Majority vote across folds for every test row.
final_predictions = np.apply_along_axis(_most_common_label, 0, prediction_test)

In [449]:
final_predictions

array([3, 2, 2, 0, 3, 2, 0, 3, 3, 3, 0, 3, 0, 3, 0, 2, 0, 1, 1, 3, 2, 3,
       2, 0, 0, 3, 2, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 0, 1, 0, 3, 0, 2,
       0, 0, 2, 0, 1, 2, 0, 0, 3, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 0, 0,
       3, 3, 0, 2, 2, 1, 0, 0, 0, 2, 2, 3, 0, 2, 3, 1, 3, 3, 0, 3, 1, 0,
       0, 2, 3, 3, 3, 2, 3, 2, 0, 0, 1, 1, 2, 1, 3, 0, 1, 0, 2, 1, 0, 0,
       0, 2, 0, 2, 0, 2, 0, 0, 2, 3, 0, 1, 1, 1, 3, 3, 2, 2, 1, 2, 3, 0,
       3, 3, 3, 0, 3, 0, 0, 2, 1, 1, 0, 0, 1, 0, 0])

In [450]:
# Reload the untouched CSVs: the post-processing below reads the raw
# Altitude, Farm.Name and Lot.Number columns.
train = pd.read_csv("/Users/mingu/Desktop/CODING/Kaggle Project 1/DATASET/train.csv")
test = pd.read_csv("/Users/mingu/Desktop/CODING/Kaggle Project 1/DATASET/test.csv")

In [451]:
# Store the model's predicted class for each test row; later cells adjust it.
test['Country.of.Origin'] = final_predictions

In [452]:
from sklearn.metrics import accuracy_score

### Altitude: classes 1 and 2 can be confused, so the altitude text is used
### to reassign them first during post-processing.
def map_country_based_on_altitude(country, altitude_text):
    """Override a predicted class using markers in the raw altitude text.

    Text containing ``msn`` forces class 1 unless the prediction is already
    1 or 2. Otherwise text containing ``psn`` or ``ft`` forces class 2.
    Non-string altitudes and all other text leave ``country`` unchanged.
    The substring checks are case-sensitive.
    """
    if not isinstance(altitude_text, str):
        return country

    if 'msn' in altitude_text:
        return country if country in [1, 2] else 1

    if 'psn' in altitude_text or 'ft' in altitude_text:
        return 2

    return country

# Apply the altitude-based override to every test row.
test['Country.of.Origin'] = [
    map_country_based_on_altitude(country, altitude)
    for country, altitude in zip(test['Country.of.Origin'], test['Altitude'])
]

###Farm.Name

# Map each training farm name to the first country recorded for it, skipping
# three excluded names.
excluded_farm_names = ['santa maria', 'several', 'various']
named_farms = train[~train['Farm.Name'].isin(excluded_farm_names)]
farm_country_mapping = named_farms.groupby('Farm.Name')['Country.of.Origin'].first().to_dict()

# Post-processing: override predictions for farms present in the mapping.
def postprocess_country(df, farm_country_mapping):
    """Overwrite ``Country.of.Origin`` for rows whose farm is in the mapping.

    Args:
        df: Frame with ``Farm.Name`` and ``Country.of.Origin`` columns. It is
            modified in place.
        farm_country_mapping: Dict from farm name to country code.

    Returns:
        The same ``df`` object.
    """
    for index, farm_name in df['Farm.Name'].items():
        if farm_name not in farm_country_mapping:
            continue
        df.at[index, 'Country.of.Origin'] = farm_country_mapping[farm_name]
    return df

# Apply the farm-name override. postprocess_country edits `test` in place and
# returns it, so `validation` refers to the same frame as `test`.
validation = postprocess_country(test, farm_country_mapping)


### Lot.Number
import re

def map_country_of_origin_postprocessing(value):
    """Return the class (0, 1 or 2) implied by a lot number's format.

    The value is compared as text against three format families in order.
    ``None`` is returned when no family matches.
    """
    text = str(value)

    if (
        re.match(r'^\d+/\d+[A-Za-z]$', text)
        or text.startswith('431')
        or re.match(r'^\d{3}/\d{2}$', text)
        or 'Lot' in text
    ):
        return 0

    if re.match(r'^3-\d{2}-\d{4}$', text) or re.match(r'^\d{2}-\d{4}$', text):
        return 1

    if re.match(r'^11/\d+/\d+$', text) or re.match(r'^11-\d+-\d+$', text):
        return 2

    return None

def check_country_of_origin(value):
    """Tell whether a lot number follows one of the recognised formats.

    The accepted formats are the three families used by the lot-number mapping
    in this notebook:

    * family 0: ``<digits>/<digits><letter>``, anything starting with ``431``,
      exactly ``ddd/dd``, or any text containing ``Lot``;
    * family 1: ``3-dd-dddd`` or ``dd-dddd``;
    * family 2: ``11/<digits>/<digits>`` or ``11-<digits>-<digits>``.

    Args:
        value: Raw ``Lot.Number`` cell; it is compared as ``str(value)``.

    Returns:
        ``True`` when the text matches any family, ``False`` otherwise.
    """
    pattern = (
        r'^\d+/\d+[A-Za-z]$|^431|^\d{3}/\d{2}$|Lot'  # Family 0
        r'|^3-\d{2}-\d{4}$|^\d{2}-\d{4}$'  # Family 1
        r'|^11/\d+/\d+$|^11-\d+-\d+$'  # Family 2
    )

    # re.search is used so the unanchored ``Lot`` alternative matches anywhere
    # in the text; the other alternatives stay anchored by ``^`` and ``$``.
    # ``ddd/dd`` now carries its ``$`` so a longer value such as ``123/456``
    # is no longer accepted here while the mapping returns None for it.
    return re.search(pattern, str(value)) is not None
    
# Post-processing: override predictions using the lot-number format.
def postprocess_country_of_origin(df):
    """Overwrite ``Country.of.Origin`` for rows with a recognised lot number.

    Rows whose ``Lot.Number`` passes ``check_country_of_origin`` receive the
    class returned by ``map_country_of_origin_postprocessing``. ``df`` is
    modified in place and also returned.
    """
    for index, lot_number in df['Lot.Number'].items():
        if not check_country_of_origin(lot_number):
            continue
        df.at[index, 'Country.of.Origin'] = map_country_of_origin_postprocessing(lot_number)
    return df

# Apply the lot-number override to the test predictions (edits `test` in place).
test = postprocess_country_of_origin(test)


In [453]:
test['Country.of.Origin'].values

array([3, 2, 2, 0, 3, 2, 1, 3, 3, 3, 0, 3, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3,
       2, 2, 0, 3, 2, 3, 3, 3, 3, 1, 2, 3, 3, 0, 3, 2, 0, 1, 1, 3, 0, 2,
       0, 0, 2, 0, 1, 2, 0, 0, 3, 2, 1, 1, 1, 0, 2, 2, 0, 2, 0, 2, 1, 0,
       3, 1, 0, 2, 3, 2, 0, 1, 0, 2, 2, 3, 0, 3, 3, 1, 3, 3, 0, 3, 1, 3,
       1, 2, 3, 3, 3, 2, 3, 2, 0, 0, 1, 1, 2, 1, 3, 0, 1, 0, 2, 1, 0, 0,
       1, 2, 1, 3, 0, 2, 1, 0, 1, 3, 0, 1, 3, 1, 3, 2, 2, 2, 1, 2, 3, 2,
       3, 3, 1, 0, 3, 0, 1, 2, 1, 1, 1, 0, 1, 0, 1])

In [454]:
# Load the submission CSV whose Country.of.Origin column is overwritten below.
sub = pd.read_csv("/Users/mingu/Desktop/CODING/Kaggle Project 1/DATASET/submission.csv")

In [455]:
# Copy the post-processed predictions by position; .values drops the index so
# pandas does not align on labels.
sub['Country.of.Origin'] = test['Country.of.Origin'].values

In [456]:
sub['Country.of.Origin']

0      3
1      2
2      2
3      0
4      3
      ..
142    1
143    0
144    1
145    0
146    1
Name: Country.of.Origin, Length: 147, dtype: int64

In [457]:
# Write the submission file without the DataFrame index column.
sub.to_csv('/Users/mingu/Desktop/CODING/Kaggle Project 1/SUBMISSION/baseline_oversampling.csv', index=False)