In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [3]:
# Load the training set from the CSV that sits next to the notebook.
df_train = pd.read_csv(filepath_or_buffer="FIFA_train.csv")

In [4]:
# Report the number of missing values in every column of the training frame.
print("결측치 수:", df_train.isnull().sum(), sep="\n")

결측치 수:
id                  0
name                0
age                 0
continent           0
contract_until      0
position            0
prefer_foot         0
reputation          0
stat_overall        0
stat_potential      0
stat_skill_moves    0
value               0
dtype: int64


In [5]:
# Work on an independent copy so the frame loaded from disk stays untouched.
df = df_train.copy(deep=True)

In [6]:
# Preview the first five rows of the working copy.
df.head(n=5)

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [7]:
# Registry of fitted encoders, keyed by column name, filled in by the encoding loop.
label_encoders = dict()
# Columns stored with the object dtype are the ones that get label-encoded.
label_cols = df.select_dtypes(include=['object']).columns

In [8]:
# Fit one LabelEncoder per categorical column and keep it in `label_encoders`
# so the identical mapping can be applied to the test set later.
#
# The raw values are read from `df_train` (of which `df` is a copy) rather than
# from `df` itself: that makes the cell idempotent. Reading from `df` would, on
# a second run, re-fit the encoders on already-encoded integers cast to strings
# and silently change the mapping.
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df_train[col].astype(str))
    label_encoders[col] = le

In [9]:
# Target: the `value` column.
y = df['value']
# Features: every remaining column except the identifiers, the target itself,
# and `continent`.
X = df.drop(['id', 'name', 'value', 'continent'], axis=1)

In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Random forest with 100 trees and a fixed seed, fitted on the training split.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X=X_train, y=y_train)

In [12]:
# Predict the target for the held-out validation rows.
y_pred = model.predict(X=X_valid)

In [13]:
# Validation metrics: coefficient of determination and root mean squared error.
r2 = r2_score(y_true=y_valid, y_pred=y_pred)
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=y_pred))

In [14]:
# Print the evaluation summary, one metric per line.
print(
    "모델 평가 결과",
    f"RMSE (평균 제곱근 오차): {rmse:,.2f}",
    f"R² Score (설명력): {r2:.4f}",
    sep="\n",
)

모델 평가 결과
RMSE (평균 제곱근 오차): 1,245,461.33
R² Score (설명력): 0.9657


In [15]:
# Side-by-side table of actual vs. predicted validation values and their
# difference (actual minus predicted), on a fresh 0..n-1 index.
results_df = (
    pd.DataFrame({'Actual': y_valid.values, 'Predicted': y_pred})
    .assign(Difference=lambda frame: frame['Actual'] - frame['Predicted'])
    .reset_index(drop=True)
)

In [16]:
# Show the first few rows of the comparison table.
print("예측 결과 예시:", results_df.head(), sep="\n")

예측 결과 예시:
      Actual     Predicted     Difference
0   600000.0  6.067500e+05   -6750.000000
1   975000.0  9.627815e+05   12218.452381
2   210000.0  2.577500e+05  -47750.000000
3   800000.0  8.011500e+05   -1150.000000
4  4400000.0  3.994000e+06  406000.000000


In [17]:
def safe_label_transform(le, values):
    """Encode `values` with a fitted label encoder, tolerating unseen labels.

    Args:
        le: A fitted encoder exposing `classes_`, the ordered sequence of known
            labels. A label's code is its position in `classes_`.
        values: Iterable of labels to encode.

    Returns:
        A list with one integer per input value: the label's code when the
        label is in `le.classes_`, otherwise -1.
    """
    # Build the label -> code lookup once instead of calling `le.transform`
    # separately for every element.
    code_by_class = {label: code for code, label in enumerate(le.classes_)}
    return [code_by_class.get(v, -1) for v in values]

In [18]:
# Load the test set once and keep an untouched copy: `df_test_original` retains
# the raw values, while `df_test` is the frame that gets encoded for the model.
df_test_original = pd.read_csv("FIFA_test.csv")
df_test = df_test_original.copy()

In [19]:
# Apply the encoders fitted on the training data to the test set.
#
# Iterate over the fitted encoders rather than over the test frame's object
# columns: a column that was label-encoded for training must be encoded here
# too, even if pandas parsed it with a non-object dtype in the test file.
# Raw values are read from `df_test_original` so that re-running this cell
# never re-encodes values that were already encoded.
for col, le in label_encoders.items():
    if col in df_test_original.columns:
        df_test[col] = safe_label_transform(le, df_test_original[col].astype(str))

# Object columns without a fitted encoder are kept as plain strings.
for col in df_test.select_dtypes(include='object').columns:
    if col not in label_encoders:
        df_test[col] = df_test[col].astype(str)

In [20]:
# Build the test feature matrix from exactly the columns the model was trained
# on, in the same order. Selecting by `X.columns` (instead of dropping a fixed
# list) keeps the features aligned even if the test CSV carries extra columns
# or a different column order; a missing training column raises a KeyError.
X_test = df_test.loc[:, X.columns]
y_test_pred = model.predict(X_test)

In [21]:
# Attach the predictions to the unencoded test frame so they sit next to the raw values.
df_test_original = df_test_original.assign(predicted_value=y_test_pred)

In [22]:
# Preview the predictions alongside a few identifying columns.
print(
    df_test_original.loc[
        :, ['name', 'position', 'age', 'stat_overall', 'predicted_value']
    ].head()
)

                name position  age  stat_overall  predicted_value
0  Cristiano Ronaldo       ST   33            94       61995000.0
1          Neymar Jr       ST   26            92       73555000.0
2       K. De Bruyne       MF   27            91       65680000.0
3          E. Hazard       ST   27            91       68335000.0
4          L. Modrić       MF   32            91       63010000.0


In [24]:
# Write the test rows plus their predicted value to disk (without the index
# column), then confirm on stdout.
df_test_original.to_csv(
    path_or_buf="FIFA_test_with_prediction.csv",
    index=False,
)

print("FIFA_test_with_prediction.csv 파일이 성공적으로 저장되었습니다!")

FIFA_test_with_prediction.csv 파일이 성공적으로 저장되었습니다!
