# Twitch explicit-language detection with traditional and graph neural network models

### Import modules

In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from torch import nn
import json
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb

### Define Random Forest classifier

In [None]:
# Root directory that holds the per-language Twitch dataset folders; the
# later cells read this name as well.
data_path = 'data'

# Load and preprocess data: encode the 'partner' flag as integers and
# standardise the two numeric columns.
df = pd.read_csv(f'{data_path}/ENGB/ENGB_target.csv')
df['partner'] = LabelEncoder().fit_transform(df['partner'])
df[['days', 'views']] = StandardScaler().fit_transform(df[['days', 'views']])
X = df[['days', 'views', 'partner']]
y = df['mature']

# Train-test split (70/30, fixed seed so the comparison with the other models
# is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=142)

# Model: RandomForest Classifier with GridSearch for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    # 'auto' is intentionally absent: scikit-learn deprecated it in 1.1 and
    # removed it in 1.3, and for classifiers it was only an alias of 'sqrt',
    # so keeping it either fails every such fit or repeats the 'sqrt' fits.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}
CV_rfc = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)
print("Best parameters:", CV_rfc.best_params_)

# Evaluate the model (GridSearchCV predicts with the refitted best estimator)
y_pred = CV_rfc.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Feature importances, most important first
importances = CV_rfc.best_estimator_.feature_importances_
feature_names = X.columns
feature_importances = sorted(zip(importances, feature_names), reverse=True)
print("Feature importances:")
for importance, name in feature_importances:
    print(f"{name}: {importance}")


In [None]:
# Test-set accuracy for the predictions currently held in `y_pred`; the value
# is kept in `rf_accuracy` for the comparison plot at the end of the notebook.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {rf_accuracy}")

### Define gradient boosting (XGBoost) classifier

In [None]:
# Reload and preprocess the target table so this cell does not depend on the
# state left behind by the Random Forest cell.
df = pd.read_csv(f'{data_path}/ENGB/ENGB_target.csv')
df['partner'] = LabelEncoder().fit_transform(df['partner'])
df[['days', 'views']] = StandardScaler().fit_transform(df[['days', 'views']])
X = df[['days', 'views', 'partner']]
y = df['mature']

# Same split parameters as the Random Forest cell, so both models are scored
# on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=142)

# Baseline: XGBoost with default hyperparameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

print("Feature Importances:")
for feature, importance in zip(X.columns, xgb_model.feature_importances_):
    print(f'{feature}: {importance:.4f}')

# Hyperparameter search over the boosted model
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1],
}

grid_search = GridSearchCV(xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                           param_grid,
                           cv=3,
                           verbose=2,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# Score the tuned model on the held-out split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Store the matrix under its own name: rebinding `confusion_matrix` would
# shadow the imported sklearn function and break every later call to it
# (including re-running this cell or the Random Forest cell).
xgb_confusion = confusion_matrix(y_test, y_pred)
print(xgb_confusion)

cls_report = classification_report(y_test, y_pred, output_dict=True)
xgb_accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {xgb_accuracy}")


## GCN model

### Load data

In [None]:
def load_data():
    """Read the ENGB edge list, target table and JSON node features.

    Paths are resolved relative to the module-level ``data_path``.

    Returns:
        A tuple ``(data_edges, data_target, node_features_df)`` where the
        first two are the CSV files loaded as DataFrames and the third is the
        JSON mapping turned into a DataFrame with one row per top-level key;
        the keys are exposed in an ``'id'`` column.
    """
    edges_frame = pd.read_csv(data_path + '/ENGB/ENGB_edges.csv')
    target_frame = pd.read_csv(data_path + '/ENGB/ENGB_target.csv')

    with open(data_path + '/ENGB/ENGB_features.json') as handle:
        raw_features = json.load(handle)

    # One row per JSON key; move the keys out of the index into an 'id' column.
    features_frame = (
        pd.DataFrame.from_dict(raw_features, orient='index')
        .rename_axis('id')
        .reset_index()
    )
    return edges_frame, target_frame, features_frame

### Preprocessing

In [None]:
def _min_max_scale(series):
    """Rescale a numeric Series to [0, 1]; a constant Series maps to zeros."""
    low = series.min()
    span = series.max() - low
    if span == 0:
        # Every value is identical: (x - min) / 0 would yield NaN for all rows.
        return pd.Series(0.0, index=series.index)
    return (series - low) / span


def preprocess_data(data_target, data_edges, node_features_df):
    """Clean the raw tables and build the node feature tensor.

    Args:
        data_target: DataFrame with columns ``id``, ``new_id``, ``days``,
            ``views``, ``mature`` and ``partner``. The original ``id`` column
            is dropped and ``new_id`` becomes the node ``id``.
        data_edges: DataFrame with ``from`` / ``to`` columns, renamed to
            ``from_id`` / ``to_id``.
        node_features_df: DataFrame with an ``id`` column that is merged into
            the target table. It is not modified.

    Returns:
        ``(data_target, data_edges, node_features)``. ``node_features`` is a
        float tensor with one row per row of the returned ``data_target`` (in
        the same order) holding every target-table column except ``id`` and
        the ``mature`` label.
    """
    data_target = data_target.drop(columns=['id']).rename(columns={'new_id': 'id'})
    data_edges = data_edges.rename(columns={'from': 'from_id', 'to': 'to_id'})

    data_target['mature'] = data_target['mature'].astype(int)
    data_target['partner'] = data_target['partner'].astype(int)
    data_target['days'] = _min_max_scale(data_target['days'])
    data_target['views'] = _min_max_scale(data_target['views'])
    data_target['id'] = data_target['id'].astype('int64')

    # 'mature' is the prediction target; feeding it to the model as an input
    # feature would leak the label, so it is excluded together with the id.
    feature_columns = [
        column for column in data_target.columns if column not in ('id', 'mature')
    ]

    # Cast the join key on a copy so the caller's frame is left untouched.
    node_features_df = node_features_df.assign(
        id=node_features_df['id'].astype('int64')
    )
    # Empty left suffix keeps the target-table column names stable even if the
    # feature frame happens to reuse one of them.
    data_target = pd.merge(
        data_target, node_features_df, on='id', suffixes=('', '_json')
    )
    data_target = data_target.fillna(0)

    # Build the tensor after the inner merge so its rows stay aligned with the
    # labels later read from the returned data_target, even if the merge
    # dropped ids that have no feature entry.
    node_features = torch.tensor(
        data_target[feature_columns].values, dtype=torch.float
    )
    return data_target, data_edges, node_features

def prepare_data(data_target, data_edges, node_features, test_size=0.20, random_state=None):
    """Assemble the PyTorch Geometric ``Data`` object for node classification.

    Args:
        data_target: DataFrame with an ``id`` column (node ids) and a
            ``mature`` column (labels); row ``i`` describes graph node ``i``.
        data_edges: DataFrame whose first two columns hold the node ids of
            each edge's endpoints.
        node_features: Tensor with one feature row per row of ``data_target``.
        test_size: Fraction of nodes placed in the test mask.
        random_state: Optional seed forwarded to ``train_test_split`` to make
            the split reproducible; ``None`` keeps it random.

    Returns:
        A ``Data`` object with ``x``, a symmetrised ``edge_index``, ``y`` and
        boolean ``train_mask`` / ``test_mask`` attributes.

    Raises:
        ValueError: If ``node_features`` and ``data_target`` disagree on the
            number of nodes.
    """
    node_ids = data_target['id']
    num_nodes = len(node_ids)
    if node_features.shape[0] != num_nodes:
        raise ValueError(
            f"node_features has {node_features.shape[0]} rows but data_target "
            f"has {num_nodes} nodes"
        )

    # Features and labels are stored in row order, so edge endpoints (which
    # are node ids) must be translated to row positions before use.
    node_to_idx = {node_id: idx for idx, node_id in enumerate(node_ids)}
    endpoints = data_edges.iloc[:, :2].apply(lambda column: column.map(node_to_idx))
    # Edges touching a node that is absent from data_target cannot be indexed.
    endpoints = endpoints.dropna().astype('int64')

    edge_index = torch.tensor(endpoints.values, dtype=torch.long).t().contiguous()
    # Append the reversed edges so the graph is treated as undirected.
    edge_index = torch.cat([edge_index, edge_index[[1, 0]]], dim=1)

    labels = torch.tensor(data_target['mature'].values, dtype=torch.long)

    train_indices, test_indices = train_test_split(
        range(num_nodes),
        test_size=test_size,
        stratify=labels.numpy(),
        random_state=random_state,
    )
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[torch.tensor(train_indices)] = True
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask[torch.tensor(test_indices)] = True

    data = Data(x=node_features, edge_index=edge_index, y=labels, train_mask=train_mask, test_mask=test_mask)
    return data

### Define the model

In [None]:
class GCN(nn.Module):
    """Two-layer graph convolutional network that scores two classes per node."""

    def __init__(self, node_features):
        """Size the first layer from the feature tensor.

        Args:
            node_features: Tensor whose second dimension is the number of
                input features per node; only its shape is read.
        """
        super().__init__()
        in_channels = node_features.shape[1]
        self.conv1 = GCNConv(in_channels, 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        """Return per-node log-probabilities over the two classes.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.
        """
        edges = data.edge_index
        hidden = F.relu(self.conv1(data.x, edges))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, edges)
        return F.log_softmax(logits, dim=1)

### Train the model

In [None]:
def train(model, data, optimizer, criterion):
    """Run one full-graph optimisation step and return the training loss.

    Args:
        model: Module called as ``model(data)`` to produce per-node outputs.
        data: Object exposing ``y`` (labels) and ``train_mask``.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss called as ``criterion(outputs, targets)``.

    Returns:
        The loss over the training nodes as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    predictions = model(data)
    # Only nodes selected by the training mask contribute to the loss.
    mask = data.train_mask
    train_loss = criterion(predictions[mask], data.y[mask])

    train_loss.backward()
    optimizer.step()
    return train_loss.item()

### Validate

In [None]:
def test(model, data):
    """Return the model's accuracy on the nodes selected by ``data.test_mask``.

    Args:
        model: Module called as ``model(data)`` to produce per-node class
            scores.
        data: Object exposing ``y`` (labels) and a boolean ``test_mask``.

    Returns:
        Fraction of test nodes whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If ``test_mask`` selects no nodes.
    """
    model.eval()
    # Evaluation never backpropagates, so skip autograd bookkeeping; this
    # avoids building a graph for the whole forward pass on every call.
    with torch.no_grad():
        pred = model(data).argmax(dim=1)
    mask = data.test_mask
    correct = (pred[mask] == data.y[mask]).sum().item()
    accuracy = correct / int(mask.sum())
    return accuracy

### Plots

In [None]:
def plot_results(loss_values, gcn_acc, rf_acc, xgb_acc):
    """Show the training-loss curve next to the model accuracy comparison.

    Args:
        loss_values: Sequence of per-epoch training losses.
        gcn_acc: Sequence of per-epoch GCN test accuracies.
        rf_acc: Scalar accuracy drawn as a green dashed reference line.
        xgb_acc: Scalar accuracy drawn as a red dashed reference line.
    """
    figure, (loss_axis, acc_axis) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: loss per epoch
    loss_axis.plot(loss_values, label='Train Loss')
    loss_axis.set_xlabel('Epochs')
    loss_axis.set_ylabel('Loss')
    loss_axis.set_title('Loss over time')
    loss_axis.legend()

    # Right panel: GCN accuracy per epoch against the two fixed baselines
    acc_axis.plot(gcn_acc, label='Graph Convolutional Neural Network Test Accuracy')
    baselines = (
        (rf_acc, 'green', 'Random Forest Test Accuracy'),
        (xgb_acc, 'red', 'Gradient Boost Test Accuracy'),
    )
    for level, colour, caption in baselines:
        acc_axis.axhline(y=level, color=colour, linestyle='--', label=caption)
    acc_axis.set_xlabel('Epochs')
    acc_axis.set_ylabel('Accuracy')
    acc_axis.set_title('Accuracy over time')
    acc_axis.legend()

    figure.tight_layout()
    plt.show()

### Main

In [None]:
# Per-epoch histories filled by main() and read by the plotting cell.
accuracy_values = []
loss_values = []


def main(epochs=2000, lr=0.01):
    """Train the GCN on the ENGB graph and record loss/accuracy per epoch.

    Args:
        epochs: Number of full-graph training steps.
        lr: Learning rate for the Adam optimizer.

    Side effects:
        Clears and then appends to the module-level ``loss_values`` and
        ``accuracy_values`` lists, and prints one progress line per epoch.
    """
    # Reset in place so a second call does not append a new run onto the
    # previous one, while existing references to the lists stay valid.
    loss_values.clear()
    accuracy_values.clear()

    data_edges, data_target, node_features_df = load_data()
    data_target, data_edges, node_features = preprocess_data(data_target, data_edges, node_features_df)
    data = prepare_data(data_target, data_edges, node_features)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GCN(node_features).to(device)
    data = data.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    # GCN.forward already returns log-probabilities; CrossEntropyLoss applies
    # log_softmax again, which leaves already-normalised log-probabilities
    # unchanged, so the loss value is the same as with a plain NLL loss.
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        loss = train(model, data, optimizer, criterion)
        acc = test(model, data)
        loss_values.append(loss)
        accuracy_values.append(acc)
        print(f'Epoch: {epoch + 1:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')


main()

### Plot accuracies

In [None]:
plot_results(loss_values, accuracy_values, rf_accuracy, xgb_accuracy)