In [3]:
import pandas as pd

# Read the labelled workbook and keep only the rows whose 'Sentence' cell is filled in.
# Chaining the filter onto the read means the name is bound once, to the cleaned frame.
raw_data = pd.read_excel("../data/adjusted-labels-multiclass.xlsx").dropna(subset=["Sentence"])

# The text column is the model input; every other column is kept as a target.
labels = raw_data.drop("Sentence", axis=1)
sentences = raw_data["Sentence"]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# A single seed is shared by the train/test split and the network so that
# both draw from the same fixed random state on every run.
SEED = 42

# Step 1: Prepare your dataset -- hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    sentences, labels, test_size=0.2, random_state=SEED
)

# Step 2: Feature extraction
# The vocabulary (capped at 1000 terms) is learned from the training split only;
# the test split is merely transformed, so nothing from it leaks into the features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Example, you can choose parameters as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 3: Model building (MLP classifier -- a feed-forward network with one
# hidden layer of 100 units; this is not a convolutional network)
# NOTE(review): with n_jobs=-1 below, the per-iteration log requested by verbose=10
# does not show up in the recorded cell output -- confirm whether it is still wanted.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=500,
    alpha=0.001,
    solver='adam',
    verbose=10,
    random_state=SEED,
    tol=0.0001,
)

# Wrap the MLP in MultiOutputClassifier for multilabel classification:
# the wrapper fits one copy of the estimator per label column.
multi_label_classifier = MultiOutputClassifier(mlp_classifier, n_jobs=-1)

# Step 4: Training
multi_label_classifier.fit(X_train_tfidf, y_train)

# Step 5: Evaluation
predictions = multi_label_classifier.predict(X_test_tfidf)

# For multilabel targets accuracy_score is *subset* accuracy: a sentence only counts
# as correct when every one of its labels is predicted correctly. That is why this
# figure sits well below the per-label scores in the report that follows.
subset_accuracy = accuracy_score(y_test, predictions)

print("Classification Report:")
print(f"Accuracy: {subset_accuracy:.2f}")
# zero_division=0 reports 0 (without a warning) for labels that have no predicted
# or no true samples. Rows are numbered by label-column position.
print(classification_report(y_test, predictions, zero_division=0))

Classification Report:
Accuracy: 0.57
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       451
           1       0.93      0.48      0.63        27
           2       0.70      0.25      0.37        28
           3       0.90      0.56      0.69        66
           4       0.88      0.70      0.78        80
           5       0.89      0.50      0.64        34
           6       0.87      0.87      0.87       149
           7       0.86      0.85      0.86       150
           8       0.90      0.66      0.76        67
           9       0.76      0.38      0.51        58
          10       0.86      0.61      0.71        49

   micro avg       0.89      0.79      0.84      1159
   macro avg       0.86      0.62      0.71      1159
weighted avg       0.88      0.79      0.82      1159
 samples avg       0.85      0.80      0.80      1159
