In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [16]:
# Load the combined training split for a first look at its contents.
dataframe = pd.read_csv(filepath_or_buffer='./dataset/processed_train_combined.csv')

In [17]:
# Preview the first five rows (kept as the cell's last expression so it is displayed).
dataframe.head(n=5)

Unnamed: 0,Relation,Target,Relatum,Attribute,Entity,Numeric Value
0,adjoin_s.adjoins,/en/united_arab_emirates,/en/saudi_arabia,internet_users_percent_population.2006-01-01.rate,/en/united_arab_emirates,0.60216
1,adjoin_s.adjoins,/en/united_arab_emirates,/en/saudi_arabia,brain_drain_percent.2000-01-01.rate,/en/united_arab_emirates,0.014367
2,adjoin_s.adjoins,/en/united_arab_emirates,/en/saudi_arabia,gdp_real.2000-01-01.adjusted_value,/en/united_arab_emirates,0.020973
3,adjoin_s.adjoins,/en/united_arab_emirates,/en/saudi_arabia,gender_balance_members_of_parliament.2002-01-0...,/en/united_arab_emirates,0.033113
4,adjoin_s.adjoins,/en/united_arab_emirates,/en/saudi_arabia,diesel_price_liter.2002-01-01.amount,/en/united_arab_emirates,0.639344


In [18]:
# Columns handled as categorical features: they are cast to str and passed to the encoder.
categorical_columns = [
    'Relation',
    'Target',
    'Relatum',
]

In [19]:
from scipy import sparse
def prepare_data(file_path, encoder, scaler, numeric_offset=None):
    """Load one CSV split and build its sparse feature matrix and integer target.

    Missing values are replaced with 0, the columns listed in the module-level
    ``categorical_columns`` are cast to ``str`` and encoded, and 'Numeric Value'
    is scaled and shifted by a constant before being appended as the last column.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read with ``pd.read_csv``.
    encoder : object with a ``transform`` method
        Already-fitted encoder for the categorical columns; it is not re-fitted here.
    scaler : object with a ``transform`` method
        Already-fitted scaler for the 'Numeric Value' column; it is not re-fitted here.
    numeric_offset : float, optional
        Constant added to the scaled numeric column. Pass the constant that was added
        to the training features (``abs(scaled_train.min()) + 1``) so that every split
        is shifted by the same amount. When omitted, the offset is derived from this
        file's own minimum, which shifts each split by a different constant and
        therefore makes the numeric feature inconsistent between splits.

    Returns
    -------
    X : scipy sparse matrix
        Encoded categorical columns followed by the shifted numeric column.
    y : pandas.Series
        'Numeric Value' cast to int.
    """
    df = pd.read_csv(file_path).fillna(0)
    df[categorical_columns] = df[categorical_columns].astype(str)

    X_categorical = encoder.transform(df[categorical_columns])
    X_numeric = scaler.transform(df[['Numeric Value']])

    if numeric_offset is None:
        # Legacy behaviour: shift relative to this file's own minimum.
        X_numeric_adjusted = X_numeric + abs(X_numeric.min()) + 1
    else:
        # Caller-supplied constant; values lower than -numeric_offset stay negative.
        X_numeric_adjusted = X_numeric + numeric_offset

    X = sparse.hstack((X_categorical, sparse.csr_matrix(X_numeric_adjusted)))
    # NOTE(review): the target is derived from the same 'Numeric Value' column that is
    # also fed in as a feature - confirm this is the intended label.
    y = df['Numeric Value'].astype(int)

    return X, y

In [20]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Shared preprocessing objects: fitted on the training split, then reused (transform only)
# for the other splits. handle_unknown='ignore' keeps transform from failing on categories
# that were not present when the encoder was fitted.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
scaler = StandardScaler()


In [21]:
# Training set: load it, replace missing values with 0 and cast the categorical columns to str.
df_train = pd.read_csv('./dataset/processed_train_combined.csv').fillna(0)
df_train[categorical_columns] = df_train[categorical_columns].astype(str)

# Fit the encoder and the scaler on the training data and transform it in the same step.
X_train_categorical = encoder.fit_transform(df_train[categorical_columns])
X_train_numeric = scaler.fit_transform(df_train[['Numeric Value']])

In [22]:
# Adjust negative values: shift the scaled column so that its minimum becomes exactly 1
# (presumably because MultinomialNB, used further down, needs non-negative features).
# Written as a re-binding relative to the current minimum instead of an in-place `+=`,
# so re-running this cell leaves the values unchanged rather than shifting them again.
X_train_numeric = X_train_numeric - X_train_numeric.min() + 1

In [23]:
# Feature matrix: encoded categorical columns followed by the adjusted numeric column.
X_train = sparse.hstack([X_train_categorical, sparse.csr_matrix(X_train_numeric)])
# NOTE(review): the target is taken from the same 'Numeric Value' column that also feeds
# the numeric feature above - confirm this is the intended label.
y_train = df_train['Numeric Value'].astype(int)

In [24]:
# Test and validation sets, transformed with the encoder and scaler fitted on the training set.
X_test, y_test = prepare_data(
    file_path='./dataset/processed_test_combined.csv',
    encoder=encoder,
    scaler=scaler,
)
X_val, y_val = prepare_data(
    file_path='./dataset/processed_validation_combined.csv',
    encoder=encoder,
    scaler=scaler,
)


In [25]:
from sklearn.metrics import classification_report
# Model evaluation function
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model`` and summarise its classification metrics on an evaluation set.

    The model is fitted in place on ``(X_train, y_train)``, used to predict
    ``X_test``, and scored against ``y_test`` with ``classification_report``.

    Returns a dict with the keys 'Model Name', 'Accuracy', 'Precision', 'Recall'
    and 'F1-Score'; the last three are the report's macro averages.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    scores = classification_report(y_test, predictions, output_dict=True)
    macro_avg = scores['macro avg']

    summary = {'Model Name': model_name, 'Accuracy': scores['accuracy']}
    summary['Precision'] = macro_avg['precision']
    summary['Recall'] = macro_avg['recall']
    summary['F1-Score'] = macro_avg['f1-score']
    return summary

In [26]:
# Collects one metrics dict per evaluated model.
results = list()

In [27]:
# Evaluate k-nearest-neighbours classifiers on the validation split for several values of k.
for k in (3, 7, 11):
    results.append(
        evaluate_model(
            KNeighborsClassifier(n_neighbors=k),
            X_train, y_train,
            X_val, y_val,
            f"KNN with K={k}",
        )
    )

In [28]:
# Evaluate MLPs with one, two and three hidden layers of 32 units on the validation split.
# random_state is fixed because MLPClassifier initialises its weights randomly; without a
# seed the reported metrics change from run to run.
for layers in [(32,), (32, 32), (32, 32, 32)]:
    results.append(
        evaluate_model(
            MLPClassifier(hidden_layer_sizes=layers, max_iter=1000, random_state=42),
            X_train, y_train,
            X_val, y_val,
            f"MLP with layers {layers}",
        )
    )

In [29]:
from sklearn.naive_bayes import MultinomialNB
# Evaluate a multinomial Naive Bayes classifier on the validation split.
results.append(
    evaluate_model(MultinomialNB(), X_train, y_train, X_val, y_val, "Naive Bayes")
)

In [30]:
# One row per evaluated model, one column per key of the metrics dicts.
results_df = pd.DataFrame(data=results)

In [31]:
# Build the presentation table: each output column is paired with the results_df column
# it is copied from (two of them are renamed on the way).
table = {
    label: results_df[source].tolist()
    for label, source in (
        ('Estimators', 'Model Name'),
        ('Accuracy', 'Accuracy'),
        ('Precision', 'Precision'),
        ('Recall', 'Recall'),
        ('F1_Score', 'F1-Score'),
    )
}

table_df = pd.DataFrame(table)

In [32]:
# Final comparison table; left as the cell's last expression so the notebook renders it
# with its rich display instead of plain printed text.
table_df

                     Estimators  Accuracy  Precision    Recall  F1_Score
0                  KNN with K=3  0.972603   0.954585  0.977605  0.965262
1                  KNN with K=7  0.967860   0.949498  0.970161  0.959151
2                 KNN with K=11  0.963460   0.944640  0.963438  0.953471
3         MLP with layers (32,)  0.944896   0.929050  0.927312  0.928176
4      MLP with layers (32, 32)  0.949998   0.935536  0.934182  0.934856
5  MLP with layers (32, 32, 32)  0.937888   0.922915  0.914057  0.918357
6                   Naive Bayes  0.739345   0.633835  0.572950  0.575262
