In [76]:
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [77]:
# Read the labelled training items from the JSON file
with open('training_data.json', 'r', encoding='utf-8') as infile:
    json_items = json.load(infile)

# Keys to pull out of every item, in the column order used for each row:
# index 0 is the class label, index 1 the raw text, indices 2+ the features.
field_order = (
    'class',
    'text',
    'punctuation_after_first_word',
    'square_bracket',
    'square_bracket_with_punctuation',
    'parentheses',
    'parentheses_with_punctuation',
    'category_word',
    'first_word_frequency',
)

# One flat list per item; a missing key raises KeyError
item_values = [[item[field] for field in field_order] for item in json_items]

# Quick sanity check of the first few rows
print(item_values[:5])

[[1, 'Kromatik (se Kromatisk), färglära; mus., (ymnigt) användande af kromatiska tonföljder.', 0, 0, 0, 1, 1, 0, 0.00011706860220088972], [1, 'Kromatisk (af grek. chroma, färg, modulation), eg. färgad. - 1. Mus. Ett klangsläkte, hvars tetrakord bestodo af två halftoner och en liten ters, t. ex. a b h d, kallades a', 0, 0, 0, 1, 0, 1, 0.0012877546242097868], [1, 'Kromatisk adaptation, bot., vissa algers (Oscillatoria) förmåga att antaga komplementfärgen till färgen af det ljus, i hvilket de odlas.  G. L-m.', 0, 0, 0, 0, 0, 0, 0.0012877546242097868], [1, "Kromatoforer (af grek. chroma, färg, och fo'ros, bärare), bot. Se Cell, sp. 1390.", 0, 0, 0, 1, 0, 1, 0.00011706860220088972], [1, "Kromatolys (af grek. chroma, färg, och ly'sis, upplösning), anat., förlust af cellkärnans färgbara innehåll (kromatin), hvarpå kan följa ett försvinnande af cellens kärna (", 0, 0, 0, 1, 0, 0, 0.00011706860220088972]]


In [78]:
# Convert the rows to a feature matrix X and a label vector y.
# Each row of item_values is laid out as [class, text, feature_1, ..., feature_k]:
#   - index 0 (class) becomes the y vector
#   - index 1 (text) is deliberately left out of the model
#   - indices 2+ are the features
#
# Both arrays are built in a single pass from plain lists. Growing them with
# np.vstack / np.append inside a loop copies the whole array on every
# iteration (quadratic), and the vstack bootstrap left X one-dimensional
# when there was exactly one row.
X = np.array([values[2:] for values in item_values])
# dtype=float keeps y as float64, as the original np.append onto an empty
# float array produced.
y = np.array([values[0] for values in item_values], dtype=float)

# Split dataset into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
# Show the dimensions of the feature matrix and the label vector, one per line
print(X.shape, y.shape, sep="\n")

(4607, 7)
(4607,)


In [80]:
# Build a default logistic regression classifier and train it on the
# training split (fit() returns the fitted estimator itself)
logistic_regression_model = LogisticRegression().fit(X_train, y_train)

# Absolute coefficient values, one per feature column of X
print(np.abs(logistic_regression_model.coef_))

# Label the held-out test rows
predictions = logistic_regression_model.predict(X_test)

# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

# Confusion matrix on the test set (rows: true class, columns: predicted class).
# TODO: the F1-score is not computed yet.
confusion_matrix(y_test, predictions)

[[2.71871697 2.40225744 2.07221135 3.09499408 0.31301931 1.48484654
  7.12812971]]
Accuracy: 0.9013015184381779


array([[118,  35],
       [ 56, 713]], dtype=int64)

In [81]:
# Inspect the test rows the model got wrong.

# A non-zero difference means the true and predicted labels disagree
misclassified = y_test - predictions

# Positions (within the test split) of the misclassified rows, as a plain list
non_zero_indices = np.flatnonzero(misclassified).tolist()

# For every misclassified row: its feature vector with the predicted label
# appended as the last element
hej = [np.append(X_test[row], predictions[row]) for row in non_zero_indices]
hej

[array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.00046827, 0.        ]),
 array([1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 3.51205807e-04, 1.00000000e+00]),
 array([1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.00468274, 1.        ]),
 array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.17068602e-04, 1.00000000e+00]),
 array([0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 7.02411613e-04, 1.00000000e+00]),
 array([1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 2.34137204e-04, 1.00000000e+00]),
 array([1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 4.68274409e-04, 1.00000000e+00]),
 array([0.        , 0.        , 0.        