## Fine tuning a Norwegian language BERT model 

In [4]:
# sklearn (vectorizer, classifier, gridsearch)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt


# Auxiliary
import pandas as pd 
import numpy as np 
import json


# For transfomar model
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [5]:
# tagging the data points with positive or negative

class TaggedPoint():
    def __init__(self, data, tag):
        self.data = data
        self.tag = tag

In [7]:
# opens the folder as per my file system and filters data based on rating and screen
# processing it norwegian text

with open("./norec-master/norec-master/data/metadata.json", "r", encoding="utf-8") as file:
    data = json.load(file)  
    filtered_data = {k:v for k,v in data.items() if v["category"] == "screen"}
    #for k,v in filtered_data.items():
    #    print(k,v)

    neg_reviews = {k:v for k,v in filtered_data.items() if v["rating"] in [1,2,3]}
    pos_reviews = {k:v for k,v in filtered_data.items() if v["rating"] in [5,6]}
    
print(type(data))
print("Negative reviews: ", len(neg_reviews))
print("Positive reviews: ", len(pos_reviews))

<class 'dict'>
Negative reviews:  5246
Positive reviews:  5166


In [8]:
# takes the key values in the dictionary items and adds .txt to them for easy comparison in dev, test, train folder
# by adding the data as TaggedPoint-objects, we can easily reuse many of the supplied functions

import os
import os.path 

def get_files(path):
    global neg_reviews, pos_reviews
    id_neg = [k+".txt" for k in neg_reviews.keys()]
    id_pos = [k+".txt" for k in pos_reviews.keys()]
    data = []

    for file in os.listdir(path):
        fn = path+file
        if os.path.isfile(fn):
            if file in id_neg:
                with open(fn, "r", encoding="utf-8") as file:
                    content = file.read()
                    data.append(TaggedPoint(content, 0))
            elif file in id_pos:
                with open(fn, "r", encoding="utf-8") as file:
                    content = file.read()
                    data.append(TaggedPoint(content, 1))

    return data


test = get_files("./norec-master/norec-master/data/test/")
val = get_files("./norec-master/norec-master/data/dev/")
train = get_files("./norec-master/norec-master/data/train/")

print(len(test), len(val), len(train))

1006 1054 8352


In [9]:
#load model and tokenizer 

tokenizer = AutoTokenizer.from_pretrained("ltg/norbert3-small")
model = AutoModelForMaskedLM.from_pretrained("ltg/norbert3-small", trust_remote_code=True)

In [11]:
print(test.text)

# train_lens = [len(i.split()) for i in test] 
# plt.hist(train_lens)

[<__main__.TaggedPoint object at 0x0000025E2CE85960>, <__main__.TaggedPoint object at 0x0000025E2CE858A0>, <__main__.TaggedPoint object at 0x0000025E2CE84F10>, <__main__.TaggedPoint object at 0x0000025E2CE85B40>, <__main__.TaggedPoint object at 0x0000025E2CE87C40>, <__main__.TaggedPoint object at 0x0000025E2CE84A00>, <__main__.TaggedPoint object at 0x0000025E2CE84460>, <__main__.TaggedPoint object at 0x0000025E2CE85750>, <__main__.TaggedPoint object at 0x0000025E2CE86C50>, <__main__.TaggedPoint object at 0x0000025E2CE84D30>, <__main__.TaggedPoint object at 0x0000025E2CE84B20>, <__main__.TaggedPoint object at 0x0000025E2CE87C10>, <__main__.TaggedPoint object at 0x0000025E2CE86D70>, <__main__.TaggedPoint object at 0x0000025E2CE85540>, <__main__.TaggedPoint object at 0x0000025E1E6EA950>, <__main__.TaggedPoint object at 0x0000025E1EC5F9D0>, <__main__.TaggedPoint object at 0x0000025E1EC5E410>, <__main__.TaggedPoint object at 0x0000025E1EC5D8A0>, <__main__.TaggedPoint object at 0x0000025E1EC