<a href="https://colab.research.google.com/github/siravitgonarm/Project/blob/main/Data_Mining_Project_(Animation_Preference_Prediction).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Data/datamine final test.csv')

In [None]:
df.columns = [
    'gender', 'watch_frequency', 'watch_platform_text', 'platform_youtube', 'platform_netflix',
    'platform_bilibili', 'platform_iqiyi', 'platform_viu', 'platform_online_sites',
    'platform_files', 'platform_tv', 'favorite_genre', 'episodes_per_session', 'watch_reason_text',
    'reason_stress_relief', 'reason_art_sound', 'reason_story', 'reason_characters',
    'reason_good_message', 'reason_trending', 'reason_inspiration', 'ending_preference_text',
    'ending_happy', 'ending_sad', 'ending_open', 'ending_twist', 'ending_sequel',
    'ending_realistic', 'ending_touching', 'ending_fun', 'selection_criteria_text',
    'criteria_cover', 'criteria_teaser', 'criteria_story', 'criteria_soundtrack',
    'criteria_characters', 'criteria_va', 'criteria_friends_recommend', 'character_connection',
    'preferred_language'
]

In [None]:
X = df.drop(columns=['favorite_genre', 'watch_platform_text', 'watch_reason_text', 'ending_preference_text', 'selection_criteria_text'])
y = df['favorite_genre']

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
y_labels = encoder.classes_

for col in X.select_dtypes(include='object').columns:
    X[col] = encoder.fit_transform(X[col])

In [None]:
#Information Gain
info_gain = mutual_info_classif(X, y_encoded, discrete_features=True)
info_gain_series = pd.Series(info_gain, index=X.columns)

top_5_features = info_gain_series.sort_values(ascending=False).head(5)

print("5 ปัจจัยที่มีอิทธิพลสูงสุดต่อการทำนายแนวอนิเมะที่ชอบ:")
print(top_5_features)
print("-" * 50)

X_top5 = X[top_5_features.index]


5 ปัจจัยที่มีอิทธิพลสูงสุดต่อการทำนายแนวอนิเมะที่ชอบ:
character_connection    0.157513
episodes_per_session    0.129570
preferred_language      0.047054
ending_happy            0.046351
reason_story            0.045579
dtype: float64
--------------------------------------------------


In [None]:
# แบ่งข้อมูลเป็น Train 60% และ Test 40%
X_train, X_test, y_train, y_test = train_test_split(
    X_top5,
    y_encoded,
    train_size=0.6,
    test_size=0.4,
    random_state=42,
    stratify=y_encoded
)

print(f" - ชุดข้อมูลสำหรับ Train: {X_train.shape[0]} ตัวอย่าง")
print(f" - ชุดข้อมูลสำหรับ Test: {X_test.shape[0]} ตัวอย่าง")
print("-" * 50)

 - ชุดข้อมูลสำหรับ Train: 197 ตัวอย่าง
 - ชุดข้อมูลสำหรับ Test: 132 ตัวอย่าง
--------------------------------------------------


In [None]:
#K-Fold สำหรับ Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
results = {}

In [None]:
#Decision Tree
print("Decision Tree")
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)

#K-Fold
cv_scores_dt = cross_val_score(dt_model, X_train, y_train, cv=kfold, scoring='accuracy')
results['Decision Tree'] = {'kfold_mean': cv_scores_dt.mean()}
print(f"K-Fold Cross-Validation Accuracy: {cv_scores_dt.mean():.4f} (+/- {cv_scores_dt.std():.4f})")

#สอนโมเดลและทดสอบ
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
results['Decision Tree']['test_accuracy'] = accuracy_dt
print(f"Test Set Accuracy: {accuracy_dt:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred_dt, target_names=y_labels, zero_division=0))
print("-" * 50)

Decision Tree
K-Fold Cross-Validation Accuracy: 0.3251 (+/- 0.0457)
Test Set Accuracy: 0.3258

Classification Report:
                    precision    recall  f1-score   support

 คอมเมดี้ (Comedy)       0.33      0.44      0.38        25
    ดราม่า (Drama)       0.47      0.45      0.46        20
 แฟนตาซี (Fantasy)       0.32      0.27      0.29        30
  แอคชั่น (Action)       0.23      0.27      0.25        33
โรแมนติก (Romance)       0.40      0.25      0.31        24

          accuracy                           0.33       132
         macro avg       0.35      0.34      0.34       132
      weighted avg       0.34      0.33      0.33       132

--------------------------------------------------


In [None]:
#Random Forest
print("Random Forest")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=kfold, scoring='accuracy')
results['Random Forest'] = {'kfold_mean': cv_scores_rf.mean()}
print(f"K-Fold Cross-Validation Accuracy: {cv_scores_rf.mean():.4f} (+/- {cv_scores_rf.std():.4f})")

rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
results['Random Forest']['test_accuracy'] = accuracy_rf
print(f"Test Set Accuracy: {accuracy_rf:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=y_labels, zero_division=0))
print("-" * 50)

Random Forest
K-Fold Cross-Validation Accuracy: 0.3347 (+/- 0.0445)
Test Set Accuracy: 0.3485

Classification Report:
                    precision    recall  f1-score   support

 คอมเมดี้ (Comedy)       0.30      0.28      0.29        25
    ดราม่า (Drama)       0.56      0.45      0.50        20
 แฟนตาซี (Fantasy)       0.35      0.40      0.38        30
  แอคชั่น (Action)       0.26      0.30      0.28        33
โรแมนติก (Romance)       0.40      0.33      0.36        24

          accuracy                           0.35       132
         macro avg       0.38      0.35      0.36       132
      weighted avg       0.36      0.35      0.35       132

--------------------------------------------------


In [None]:
#Naive Bayes
print("Naive Bayes")
nb_model = CategoricalNB()

cv_scores_nb = cross_val_score(nb_model, X_train, y_train, cv=kfold, scoring='accuracy')
results['Naive Bayes'] = {'kfold_mean': cv_scores_nb.mean()}
print(f"K-Fold Cross-Validation Accuracy: {cv_scores_nb.mean():.4f} (+/- {cv_scores_nb.std():.4f})")

nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
results['Naive Bayes']['test_accuracy'] = accuracy_nb
print(f"Test Set Accuracy: {accuracy_nb:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred_nb, target_names=y_labels, zero_division=0))
print("-" * 50)

Naive Bayes
K-Fold Cross-Validation Accuracy: 0.3603 (+/- 0.0578)
Test Set Accuracy: 0.3939

Classification Report:
                    precision    recall  f1-score   support

 คอมเมดี้ (Comedy)       0.48      0.48      0.48        25
    ดราม่า (Drama)       0.65      0.55      0.59        20
 แฟนตาซี (Fantasy)       0.34      0.43      0.38        30
  แอคชั่น (Action)       0.28      0.33      0.31        33
โรแมนติก (Romance)       0.38      0.21      0.27        24

          accuracy                           0.39       132
         macro avg       0.43      0.40      0.41       132
      weighted avg       0.41      0.39      0.39       132

--------------------------------------------------


In [None]:
#K-Nearest Neighbors (KNN)
print("K-Nearest Neighbors (KNN)")
knn_model = KNeighborsClassifier(n_neighbors=5) # n_neighbors คือค่า K, 5 เป็นค่าเริ่มต้นที่ดี

cv_scores_knn = cross_val_score(knn_model, X_train, y_train, cv=kfold, scoring='accuracy')
results['K-Nearest Neighbors'] = {'kfold_mean': cv_scores_knn.mean()}
print(f"K-Fold Cross-Validation Accuracy: {cv_scores_knn.mean():.4f} (+/- {cv_scores_knn.std():.4f})")

knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
results['K-Nearest Neighbors']['test_accuracy'] = accuracy_knn
print(f"Test Set Accuracy: {accuracy_knn:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred_knn, target_names=y_labels, zero_division=0))
print("-" * 50)

K-Nearest Neighbors (KNN)
K-Fold Cross-Validation Accuracy: 0.3659 (+/- 0.0469)
Test Set Accuracy: 0.2879

Classification Report:
                    precision    recall  f1-score   support

 คอมเมดี้ (Comedy)       0.37      0.40      0.38        25
    ดราม่า (Drama)       0.30      0.30      0.30        20
 แฟนตาซี (Fantasy)       0.26      0.30      0.28        30
  แอคชั่น (Action)       0.24      0.27      0.26        33
โรแมนติก (Romance)       0.31      0.17      0.22        24

          accuracy                           0.29       132
         macro avg       0.30      0.29      0.29       132
      weighted avg       0.29      0.29      0.28       132

--------------------------------------------------


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 5. สรุปผลการเปรียบเทียบโมเดล
# ==============================================================================
summary_df = pd.DataFrame(results).T
summary_df.columns = ['K-Fold Mean Accuracy', 'Test Set Accuracy']
summary_df = summary_df.sort_values(by='Test Set Accuracy', ascending=False)

print("สรุปผลการเปรียบเทียบประสิทธิภาพของโมเดล")
print(summary_df)
print("\n" + "="*50)
print(f"โมเดลที่มีประสิทธิภาพสูงสุดคือ: {summary_df.index[0]}")
print(f"ด้วยความแม่นยำบน Test Set: {summary_df['Test Set Accuracy'].iloc[0]:.2%}")
print("="*50)


สรุปผลการเปรียบเทียบประสิทธิภาพของโมเดล
                     K-Fold Mean Accuracy  Test Set Accuracy
Naive Bayes                      0.360256           0.393939
Random Forest                    0.334744           0.348485
Decision Tree                    0.325128           0.325758
K-Nearest Neighbors              0.365897           0.287879

โมเดลที่มีประสิทธิภาพสูงสุดคือ: Naive Bayes
ด้วยความแม่นยำบน Test Set: 39.39%
