In [11]:
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [16]:
# Load the labelled items from disk.
# Every JSON item must provide all keys listed in ITEM_KEYS; a missing key
# raises KeyError.
with open('training_data.json', 'r', encoding='utf-8') as infile:
    json_items = json.load(infile)  # parse straight from the file handle

# Column order of each row in item_values. Index 0 is the class label and
# index 1 is the raw text; the remaining entries are the feature values.
# Later cells slice rows positionally (values[0], values[2:]), so this order
# must not change.
ITEM_KEYS = (
    'class',
    'text',
    'punctuation_after_first_word',
    'square_bracket',
    'square_bracket_with_punctuation',
    'parentheses',
    'parentheses_with_punctuation',
    'category_word',
)

# One row per item, with the fields in ITEM_KEYS order.
item_values = [[item[key] for key in ITEM_KEYS] for item in json_items]

print(item_values[:5])

[[1, 'Kromat, <i>kem</i>., kromsyrans salter. Se Krom.', 1, 0, 0, 0, 0, 0], [0, 'e Krom.. ', 0, 0, 0, 0, 0, 0], [1, 'Kromatgelatin [-sjelatin]. Se Kromgelatin.', 0, 1, 1, 0, 0, 0], [0, 'e Kromgelatin.. ', 0, 0, 0, 0, 0, 0], [1, 'Kromatik (se Kromatisk), färglära; <i>mus</i>., (ymnigt) användande af kromatiska tonföljder.', 0, 0, 0, 1, 1, 0]]


In [17]:
# Convert the rows into a feature matrix X and a label vector y.
# Every field except the text (index 1) is used: index 0 is the class label
# and becomes y, indices 2 onwards become the columns of X.
#
# Both arrays are built in one pass. Growing them with np.vstack / np.append
# inside a loop copies the whole array on every iteration (quadratic work),
# and left X one-dimensional when there was exactly one row.
X = np.array([values[2:] for values in item_values])
# dtype=float keeps y as float64, the dtype the previous np.append-based
# construction produced.
y = np.array([values[0] for values in item_values], dtype=float)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Sanity check: show the shapes of X and y, one per line.
print(X.shape, y.shape, sep="\n")

(16130, 6)
(16130,)


In [19]:
# Build a logistic regression classifier with default settings and train it
# on the training split (fit() returns the fitted estimator itself).
logistic_regression_model = LogisticRegression().fit(X_train, y_train)

# Show the magnitude of each learned coefficient; the sign is discarded.
coefficient_magnitudes = np.absolute(logistic_regression_model.coef_)
print(coefficient_magnitudes)

# Classify the held-out rows.
predictions = logistic_regression_model.predict(X_test)

# Fraction of held-out rows whose predicted class matches the true class.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
# TODO: also evaluate with f1-score and a confusion matrix on the test set

[[1.56681863 2.19881121 0.29867328 1.38432089 0.3459194  0.28483937]]
Accuracy: 0.8499690018598884
