# Twitch explicit language detection using traditional models and graph neural network models 

### Import modules

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
import xgboost as xgb
import json
import torch
from torch import nn
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data
import matplotlib.pyplot as plt

### Load and preprocess tabular data

In [None]:
# Root directory that holds the ENGB/ data files.
data_path = 'data'

df = pd.read_csv(f'{data_path}/ENGB/ENGB_target.csv')
# Encode the 'partner' column as integer codes.
df['partner'] = LabelEncoder().fit_transform(df['partner'])
X = df[['days', 'views', 'partner']]
y = df['mature']

# Split first, so that the scaler below never sees the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=142)

# Fit the scaler on the training split only and apply the same transform to
# the test split; fitting on the full frame would leak test-set statistics
# into preprocessing. `df` itself keeps the raw 'days'/'views' values.
numeric_columns = ['days', 'views']
scaler = StandardScaler().fit(X_train[numeric_columns])
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_columns] = scaler.transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])


### Define Random Forest classifier with hyperparameter tuning

In [None]:
# Hyperparameter grid explored by the 5-fold grid search below.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
CV_rfc.fit(X_train, y_train)
print("Best parameters:", CV_rfc.best_params_)

# Evaluate the refitted best estimator on the held-out split.
y_pred = CV_rfc.predict(X_test)
rfc_report = classification_report(y_test, y_pred)
print(rfc_report)
print(confusion_matrix(y_test, y_pred))

# Rank the input columns by the importance the best forest assigns them.
importances = CV_rfc.best_estimator_.feature_importances_
feature_names = X.columns
feature_importances = sorted(zip(importances, feature_names), reverse=True)
print("Feature importances:")
for importance, name in feature_importances:
    print(f"{name}: {importance}")

# Headline metrics, reused later by the comparison plots.
rf_accuracy = accuracy_score(y_test, y_pred)
rf_recall = recall_score(y_test, y_pred)
rf_f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {rf_accuracy}")
print(f"Recall: {rf_recall}")
print(f"F1 score: {rf_f1}")


In [None]:
# Collect the validation score of every CV fold for the winning parameter
# combination. The fold count is read from the fitted search object rather
# than hard-coded, so it stays correct if `cv` is changed above.
best_index = CV_rfc.best_index_
cv_results = CV_rfc.cv_results_
rf_results = [
    cv_results[f'split{fold}_test_score'][best_index]
    for fold in range(CV_rfc.n_splits_)
]

print("Test scores for each fold for the best parameter combination:")
for i, test_score in enumerate(rf_results):
    print(f"Fold {i + 1} test score: {test_score}")

### Define gradient boosting (XGBoost) classifier with hyperparameter tuning

In [None]:
# Hyperparameter grid explored by the 5-fold grid search below.
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1],
}

xgboost = GridSearchCV(
    estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
xgboost.fit(X_train, y_train)

print("Best parameters:", xgboost.best_params_)

# Evaluate the refitted best estimator on the held-out split.
best_model = xgboost.best_estimator_
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Headline metrics, reused later by the comparison plots.
xgb_accuracy, xgb_recall, xgb_f1 = (
    metric(y_test, y_pred) for metric in (accuracy_score, recall_score, f1_score)
)
print(f"Accuracy: {xgb_accuracy}")
print(f"Recall: {xgb_recall}")
print(f"F1 score: {xgb_f1}")


In [None]:
# Collect the validation score of every CV fold for the winning parameter
# combination. The fold count is read from the fitted search object rather
# than hard-coded, so it stays correct if `cv` is changed above.
best_index = xgboost.best_index_
cv_results = xgboost.cv_results_
xgb_scores = [
    cv_results[f'split{fold}_test_score'][best_index]
    for fold in range(xgboost.n_splits_)
]

print("Test scores for each fold for the best parameter combination:")
for i, test_score in enumerate(xgb_scores):
    print(f"Fold {i + 1} test score: {test_score}")

## GCN model

### Embed data in the graph

In [None]:
def load_data(df):
    """Read the ENGB edge list and node-feature JSON from ``data_path``.

    Args:
        df: Target table; it is returned unchanged as the second element.

    Returns:
        Tuple ``(data_edges, data_target, node_features_df)`` where
        ``node_features_df`` has one row per JSON key, with the key exposed
        as an ``id`` column.
    """
    edges_file = f'{data_path}/ENGB/ENGB_edges.csv'
    features_file = f'{data_path}/ENGB/ENGB_features.json'

    data_edges = pd.read_csv(edges_file)
    with open(features_file) as handle:
        raw_features = json.load(handle)

    # JSON keys become the index, which is then turned into an 'id' column.
    node_features_df = (
        pd.DataFrame.from_dict(raw_features, orient='index')
        .rename_axis('id')
        .reset_index()
    )
    return data_edges, df, node_features_df

### Preprocessing

In [None]:
def _min_max_scale(column):
    """Rescale a numeric Series to [0, 1]; a constant column maps to zeros."""
    low = column.min()
    span = column.max() - low
    if span == 0:
        # Avoid 0/0 -> NaN features when every row shares the same value.
        return column * 0.0
    return (column - low) / span


def preprocess_data(data_target, data_edges, node_features_df):
    """Clean the target/edge tables and build the node-feature tensor.

    The original ``id`` column is dropped and ``new_id`` becomes ``id``.
    Rows are sorted by that id so that row ``i`` of the returned table and of
    the feature tensor describes node ``i``; the edge list is later used to
    index the feature tensor by node id, so this ordering is required.

    The feature tensor holds only ``days``, ``views`` and ``partner``. The
    ``mature`` label is deliberately excluded, since it is the prediction
    target and must not be fed to the model as an input feature.

    Args:
        data_target: Frame with columns ``id``, ``new_id``, ``days``,
            ``views``, ``partner`` and ``mature``.
        data_edges: Frame with ``from`` / ``to`` columns.
        node_features_df: Frame with an ``id`` column; it is inner-joined
            onto the target table. The caller's frame is not modified.

    Returns:
        Tuple ``(data_target, data_edges, node_features)``: the merged target
        table sorted by ``id``, the edge table with columns renamed to
        ``from_id`` / ``to_id``, and a float tensor with one row per node.

    Raises:
        ValueError: If the node ids after the merge are not exactly
            ``0 .. n-1``, in which case rows could not be addressed by id.
    """
    data_target = data_target.drop(columns=['id']).rename(columns={'new_id': 'id'})
    data_edges = data_edges.rename(columns={'from': 'from_id', 'to': 'to_id'})

    data_target['mature'] = data_target['mature'].astype(int)
    data_target['partner'] = data_target['partner'].astype(int)
    data_target['days'] = _min_max_scale(data_target['days'])
    data_target['views'] = _min_max_scale(data_target['views'])
    data_target['id'] = data_target['id'].astype('int64')

    # Cast the join key on a copy so the caller's frame is left untouched.
    node_features_df = node_features_df.assign(id=node_features_df['id'].astype('int64'))
    data_target = pd.merge(data_target, node_features_df, on='id').fillna(0)
    data_target = data_target.sort_values('id').reset_index(drop=True)

    if list(data_target['id']) != list(range(len(data_target))):
        raise ValueError(
            "Node ids must be exactly 0..n-1 after merging with the node "
            "features; edges could not be mapped onto feature rows."
        )

    # Built after the merge/sort so features, labels and ids share one order.
    node_features = torch.tensor(
        data_target[['days', 'views', 'partner']].values, dtype=torch.float
    )
    return data_target, data_edges, node_features

def prepare_data(data_target, data_edges, node_features):
    """Assemble a ``Data`` graph object with labels and train/test masks.

    Args:
        data_target: Frame providing the ``id`` and ``mature`` columns.
        data_edges: Frame whose values form the edge list, one edge per row.
        node_features: Tensor stored as the graph's ``x`` attribute.

    Returns:
        ``Data`` with ``x``, ``edge_index``, ``y``, ``train_mask`` and
        ``test_mask`` set. The masks come from an unseeded, label-stratified
        80/20 split.
    """
    directed = torch.tensor(data_edges.values, dtype=torch.long).t().contiguous()
    # Append every edge with its endpoints swapped so both directions exist.
    edge_index = torch.cat((directed, directed[[1, 0]]), dim=1)

    labels = torch.tensor(data_target['mature'].values, dtype=torch.long)
    num_nodes = len(data_target['id'])

    train_indices, test_indices = train_test_split(
        range(num_nodes), test_size=0.20, stratify=labels
    )

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[torch.tensor(train_indices)] = True
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask[torch.tensor(test_indices)] = True

    return Data(
        x=node_features,
        edge_index=edge_index,
        y=labels,
        train_mask=train_mask,
        test_mask=test_mask,
    )

### Define the model

In [None]:
class GCN(nn.Module):
    """Two-layer graph convolutional network with a two-way output.

    The first layer maps the input feature width to 32 hidden units, the
    second maps those to 2 outputs, which are returned as log-probabilities.
    """

    def __init__(self, node_features):
        """Size the first layer from the column count of ``node_features``."""
        super().__init__()
        in_channels = node_features.shape[1]
        self.conv1 = GCNConv(in_channels, 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        """Return per-node log-softmax scores for ``data.x`` / ``data.edge_index``."""
        hidden = self.conv1(data.x, data.edge_index).relu()
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        scores = self.conv2(hidden, data.edge_index)
        return scores.log_softmax(dim=1)

### Train the model

In [None]:
def train(model, data, optimizer, criterion, train_mask):
    """Run one full-graph optimisation step and return the training loss.

    Args:
        model: Module called as ``model(data)``.
        data: Object exposing the labels as ``data.y``.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss callable taking ``(outputs, targets)``.
        train_mask: Boolean mask selecting the rows that contribute to the loss.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    outputs = model(data)
    targets = data.y
    step_loss = criterion(outputs[train_mask], targets[train_mask])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

### Validate

In [None]:
def test(model, data, mask, average='macro'):
    """Evaluate ``model`` on the nodes selected by ``mask``.

    Args:
        model: Module called as ``model(data)``; it is switched to eval mode.
        data: Object exposing the labels as ``data.y``.
        mask: Boolean mask selecting the nodes to score.
        average: Averaging mode forwarded to ``f1_score`` and
            ``recall_score``. Defaults to ``'macro'``; pass ``'binary'`` to
            score the positive class only, which is what ``f1_score`` and
            ``recall_score`` do when called without ``average``.

    Returns:
        Tuple ``(accuracy, f1, recall)``.
    """
    model.eval()
    with torch.no_grad():
        logits = model(data)

    # Predicted class is the highest-scoring column; move to CPU once for sklearn.
    preds = logits[mask].argmax(dim=1).cpu()
    labels = data.y[mask].cpu()

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average=average)
    recall = recall_score(labels, preds, average=average)
    return acc, f1, recall

### Run the GCN model

In [None]:
def _average_curves(curves):
    """Average per-epoch curves across folds.

    Folds may stop at different epochs; each shorter curve is extended with
    its own last value so the final entry of the result is the mean of the
    folds' final values.
    """
    curves = [curve for curve in curves if curve]
    if not curves:
        return []
    longest = max(len(curve) for curve in curves)
    padded = [curve + [curve[-1]] * (longest - len(curve)) for curve in curves]
    return [sum(step) / len(step) for step in zip(*padded)]


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_edges, data_target, node_features_df = load_data(df)
data_target, data_edges, node_features = preprocess_data(data_target, data_edges, node_features_df)
# The graph is identical for every fold, so it is moved to the device once.
data = prepare_data(data_target, data_edges, node_features).to(device)

EPOCHS = 2000    # upper bound on epochs per fold
PATIENCE = 25    # epochs without accuracy improvement before a fold stops
history = []     # not used in this cell; retained from the original notebook

all_loss_values = []
all_accuracy_values = []
all_f1_values = []
all_recall_values = []
gcn_cv_accuracies = []
gcn_cv_f1s = []
gcn_cv_recalls = []
best_acc = 0
epochs_no_improve = 0

# scikit-learn builds the folds from CPU arrays.
fold_features = data.x.cpu().numpy()
fold_labels = data.y.cpu().numpy()
num_nodes = len(fold_labels)

for train_index, test_index in skf.split(fold_features, fold_labels):
    train_index = torch.tensor(train_index, dtype=torch.int64)
    test_index = torch.tensor(test_index, dtype=torch.int64)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool).scatter_(0, train_index, True).to(device)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool).scatter_(0, test_index, True).to(device)

    # Every fold starts from a fresh model, optimizer and early-stopping state.
    model = GCN(data.x).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()
    loss_values = []
    accuracy_values = []
    f1_values = []
    recall_values = []
    best_acc = 0
    epochs_no_improve = 0

    for epoch in range(EPOCHS):
        loss = train(model, data, optimizer, criterion, train_mask)
        acc, f1, recall = test(model, data, test_mask)
        loss_values.append(loss)
        accuracy_values.append(acc)
        f1_values.append(f1)
        recall_values.append(recall)
        print(f'Epoch: {epoch + 1:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')

        # Early stopping is evaluated once per epoch, on the held-out fold's
        # accuracy. (It previously sat outside this loop, so it ran once per
        # fold and could never reach PATIENCE.)
        if acc > best_acc:
            best_acc = acc
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= PATIENCE:
            print(f"Accuracy has not improved in the last {PATIENCE} epochs, stopping training.")
            break

    all_loss_values.append(loss_values)
    all_accuracy_values.append(accuracy_values)
    all_f1_values.append(f1_values)
    all_recall_values.append(recall_values)
    final_acc, final_f1, final_recall = test(model, data, test_mask)
    gcn_cv_accuracies.append(final_acc)
    gcn_cv_f1s.append(final_f1)
    gcn_cv_recalls.append(final_recall)

# Per-epoch means across folds; the last entry summarises the final state of
# every fold even when folds stopped at different epochs.
avg_loss_values = _average_curves(all_loss_values)
avg_accuracy_values = _average_curves(all_accuracy_values)
avg_f1_values = _average_curves(all_f1_values)
avg_recall_values = _average_curves(all_recall_values)

print("Average Loss: ", avg_loss_values[-1])
print("Average Accuracy: ", avg_accuracy_values[-1])
print("Average F1 Score: ", avg_f1_values[-1])
print("Average Recall: ", avg_recall_values[-1])
print("Cross validation accuracies: ", gcn_cv_accuracies)
print("Cross validation F1 scores: ", gcn_cv_f1s)
print("Cross validation recalls: ", gcn_cv_recalls)

### Extract node embeddings

In [None]:
# `model` is the GCN left over from the last cross-validation fold; its output
# is one row of two log-softmax scores per node.
# Switch to eval mode explicitly so dropout is disabled regardless of which
# cell touched the model last; otherwise the extracted values would be random.
model.eval()
with torch.no_grad():
    gnn_embeddings = model(data).cpu().numpy()

### Combine original features with GNN embeddings

In [None]:
# One row of GNN outputs per node, indexed by the node id used in the graph.
embedding_columns = [f'embedding_{i}' for i in range(gnn_embeddings.shape[1])]
gnn_embeddings_df = pd.DataFrame(gnn_embeddings, index=data_target['id'], columns=embedding_columns)

# Attach the embeddings to the tabular rows through their graph id.
combined_df = pd.merge(df, gnn_embeddings_df, left_on='new_id', right_index=True)

# Use the same tabular columns as the baseline models plus the embeddings.
# Dropping only 'mature' would leave the 'id' / 'new_id' identifier columns
# in the feature matrix, which the baselines never see.
X_combined = combined_df[list(X.columns) + embedding_columns]
y_combined = combined_df['mature']

X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(X_combined, y_combined, test_size=0.3, random_state=142)

### Train gradient boosting (XGBoost) on the combined feature set

In [None]:
# Re-run the same grid search object on the combined feature set.
# NOTE: this overwrites the fitted state of `xgboost` from the baseline run.
xgboost.fit(X_train_combined, y_train_combined)

print("Best parameters for combined model:", xgboost.best_params_)

# Evaluate the refitted best estimator on the held-out combined split.
best_xgb_combined = xgboost.best_estimator_
y_pred_combined = best_xgb_combined.predict(X_test_combined)

for summarise in (classification_report, confusion_matrix):
    print(summarise(y_test_combined, y_pred_combined))

# Headline metrics, reused later by the comparison plots.
hybrid_accuracy, hybrid_recall, hybrid_f1 = (
    metric(y_test_combined, y_pred_combined)
    for metric in (accuracy_score, recall_score, f1_score)
)
print(f"Combined Model Accuracy after hyperparameter tuning: {hybrid_accuracy}")
print(f"Recall: {hybrid_recall}")
print(f"F1 score: {hybrid_f1}")


In [None]:
# Collect the validation score of every CV fold for the winning parameter
# combination of the combined-feature search. The fold count is read from the
# fitted search object rather than hard-coded.
best_index = xgboost.best_index_
cv_results = xgboost.cv_results_
hybrid_scores = [
    cv_results[f'split{fold}_test_score'][best_index]
    for fold in range(xgboost.n_splits_)
]

print("Test scores for each fold for the best parameter combination:")
for i, test_score in enumerate(hybrid_scores):
    print(f"Fold {i + 1} test score: {test_score}")

### Plot results

In [None]:
# Side-by-side panels: training loss (left) and accuracy (right).
# `loss_values` / `accuracy_values` hold the curves of the last CV fold.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

loss_ax.plot(loss_values, label='Train Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss over time (for GCN)')
loss_ax.legend()

acc_ax.plot(accuracy_values, label='GCN Test Accuracy')
# Horizontal reference lines for the non-iterative models.
reference_lines = (
    (rf_accuracy, 'green', 'Random Forest Test Accuracy'),
    (xgb_accuracy, 'red', 'XGBoost Test Accuracy'),
    (hybrid_accuracy, 'blue', 'Hybrid Model Test Accuracy'),
)
for level, colour, caption in reference_lines:
    acc_ax.axhline(y=level, color=colour, linestyle='--', label=caption)
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Accuracy over time')
acc_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Tick labels shared by all three charts, in the order the heights are given.
bars = ('Random Forest', 'Gradient Boost', 'GCN', 'Hybrid model')


def _plot_model_scores(title, heights):
    """Draw one bar per model for a single metric and return the bar positions."""
    plt.figure(figsize=(8, 6))
    positions = np.arange(len(heights))
    plt.xticks(positions, bars)
    plt.bar(positions, heights)
    plt.ylim(0, 1)
    plt.title(title, fontsize=16, color='#323232')
    plt.xlabel('Model', fontsize=14)
    plt.ylabel('Score', fontsize=14)
    plt.tight_layout()
    plt.show()
    return positions


# The GCN entry is the last value of the fold-averaged curve.
recall_heights = [rf_recall, xgb_recall, avg_recall_values[-1], hybrid_recall]
y_pos = _plot_model_scores('Recall scores', recall_heights)

f1_heights = [rf_f1, xgb_f1, avg_f1_values[-1], hybrid_f1]
y_pos = _plot_model_scores('F1 scores', f1_heights)

acc_heights = [rf_accuracy, xgb_accuracy, avg_accuracy_values[-1], hybrid_accuracy]
y_pos = _plot_model_scores('Accuracies', acc_heights)


### Statistical tests

In [None]:
from scipy.stats import ttest_rel, f_oneway, friedmanchisquare

# Per-fold scores of each model, keyed by the label used in the printed output.
fold_scores = {
    'Random forest': rf_results,
    'XGBoost': xgb_scores,
    'GCN': gcn_cv_accuracies,
    'hybrid model': hybrid_scores,
}
model_names = list(fold_scores)

# Paired t-test for every unordered pair of models, in listing order.
# NOTE(review): the GCN folds are drawn from a different split than the
# grid-search folds, so the pairing assumption should be confirmed.
for position, first_name in enumerate(model_names):
    for second_name in model_names[position + 1:]:
        t_statistic, t_p_value = ttest_rel(fold_scores[first_name], fold_scores[second_name])
        print(f"t-test for {first_name} and {second_name}: t-statistic: {t_statistic}, p-value: {t_p_value}")


# Omnibus tests across all four models at once.
anova_f_statistic, anova_p_value = f_oneway(*fold_scores.values())
print(f"ANOVA: F-statistic: {anova_f_statistic}, p-value: {anova_p_value}")

friedman_statistic, friedman_p_value = friedmanchisquare(*fold_scores.values())
print(f"Friedman test: statistic: {friedman_statistic}, p-value: {friedman_p_value}")