In [None]:
import json
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Read the labelled training examples. The code below iterates the parsed
# document and indexes each element by key, so it must be a JSON array of objects.
with open('training_data.json', 'r', encoding='utf-8') as infile:
    json_items = json.load(infile)

# Column order of every row: label first, raw text second, features after that.
# Later cells slice rows by position (index 0 = class, index 2 onward = features),
# so this order must not change.
row_fields = (
    'class',
    'text',
    'punctuation_after_first_word',
    'square_bracket',
    'square_bracket_with_punctuation',
    'parentheses',
    'parentheses_with_punctuation',
    'category_word',
    'first_word_frequency',
)

# One list per training example; a missing key raises KeyError.
item_values = [[item[field] for field in row_fields] for item in json_items]

# Peek at the first few rows only.
print(item_values[:5])

In [None]:
# Convert the collected rows into a feature matrix X and a label vector y.
# Each row in item_values is laid out as [class, text, feature, feature, ...]:
#   - index 0 (class) becomes the label in y,
#   - index 1 (text) is left out,
#   - everything from index 2 onward becomes one row of X.
# Building X from a list of rows in one call avoids re-copying the matrix for
# every example and yields a 2-D array even when there is only a single row.
X = np.array([values[2:] for values in item_values])
# Appending onto an empty float array promotes numeric labels to float64.
y = np.append(np.array([]), [values[0] for values in item_values])

# Split dataset into training and testing sets (20% held out; fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Sanity check of dimensions: the first sample wrapped as a batch of one,
# followed by the full label vector.
single_sample_batch = np.array([X[0]])
for checked_array in (single_sample_batch, y):
    print(checked_array.shape)

In [None]:
# Restore the pre-trained logistic regression model that was saved with joblib.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load('logistic_regression_model.pkl')

# Smoke test: classify the first held-out sample, wrapped in a list so the
# model receives a batch containing one row.
first_test_sample = X_test[0]
predictions = model.predict([first_test_sample])
predictions[0]

# Evaluation is disabled. `predictions` above covers a single sample, so it
# cannot be compared against the whole of y_test; predict on all of X_test
# before re-enabling these lines.
#accuracy = accuracy_score(y_test, predictions)
#print("Accuracy:", accuracy)

In [None]:
import classification_utils as cu

# End-to-end check on one raw text line: featurise it, then classify it.
line = "Kromatik (se Kromatisk), färglära; mus., (ymnigt) användande af kromatiska tonföljder."

# The datapoint is handed to predict() as-is, so line_to_datapoint presumably
# returns a batch-shaped (2-D) input -- TODO confirm in classification_utils.
x = cu.line_to_datapoint(line)
print(x)

# Reload the model so this cell does not depend on an earlier cell having run.
# NOTE(review): joblib.load unpickles the file; only load trusted model files.
model = joblib.load('logistic_regression_model.pkl')
predicted_labels = model.predict(x)
prediction = predicted_labels[0]
print(prediction)


In [None]:
import regex_utils as ru
import regex as re

# Try the index headword extractor on a single sample line and show the result.
line = "Zenta, stad i Jugoslavien"
headword = ru.get_headword_from_index(line)
print(headword)

# Disabled scratch experiment: a short lazy prefix ending in '.' or ','.
# search = re.search(r"^.{1,20}?[.,]", line)
# if search:
#     print("hej")

In [None]:
# Sample whose opening <b> tag is never closed; `ru` is the regex_utils
# module imported in an earlier cell.
line = "<b>Adolf Fredrik, <"
headword = ru.get_headword_no_closing_bold_tag(line)
print(headword)

In [None]:
import regex as re

# Read the whole sample text into memory; the file is closed before any processing.
with open("chartres.txt", "r", encoding='utf-8') as f:
    volume_string = f.read()

# Show the text as read from disk.
print(volume_string)

# With MULTILINE, '^' anchors at the start of every line, so on each line that
# contains a <b> tag everything before the first <b> is dropped. Lines without
# a <b> tag are left untouched because '.' does not cross line breaks.
text_before_bold = re.compile(r'^.*?<b>', flags=re.MULTILINE)
volume_string = text_before_bold.sub(r'<b>', volume_string)

# Show the cleaned text for comparison.
print(volume_string)

In [1]:
from utils import json_helpers as jh

# Load the items stored under encyclopedias_jsons/e1 and keep them in a
# variable so later cells can reuse them without reading from disk again.
e1_items = jh.read_items("encyclopedias_jsons/e1")

# Display only a small preview; echoing the complete collection floods the
# notebook output and bloats the saved file.
e1_items[:3]

[{'headword': 'A',
  'entryid': 'e1_0_aa_9_0',
  'text': '<b>A</b> är den första <i>bokstafven</i> i alla indoeuropeiska språks alfabet utom i den vanliga runföljden, der det innehar det tionde rummet. Det är tillika det renaste och klaraste af alla <i>språk',
  'classifier_type': 0,
  'class': 0,
  'qid': '0',
  'e2_key': '',
  'e4_key': '',
  'cross_ref_key': '',
  'latitude': None,
  'longitude': None},
 {'headword': 'A',
  'entryid': 'e1_1_aa_11_0',
  'text': '<b>A,</b> Lat. prepos. Se Ab.',
  'classifier_type': 0,
  'class': 0,
  'qid': '0',
  'e2_key': '',
  'e4_key': '',
  'cross_ref_key': 'e1_42_aa_14_5',
  'latitude': None,
  'longitude': None},
 {'headword': 'Aa',
  'entryid': 'e1_2_aa_11_1',
  'text': '<b>Aa</b> (utt. a; Ach l. Aach, af Fornt. <i>aha</i>, vatten, flod; Lat. <i>aqua;</i> Sv. <i>å</i>), namn på en mängd floder i Frankrike, Tyskland, Schweiz och europeiska Ryssland. De mest betydande ä',
  'classifier_type': 0,
  'class': 0,
  'qid': '0',
  'e2_key': '',
  'e4_