# Semesteroppgave

### Text preparation of test sets


In [1]:
# imports for data preparation 
import os
import os.path
import re
import json

# imports for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns


# Auxiliary
import numpy as np
from random import shuffle


# sentiment analysis for Norwegian, NoReC corpus
# norec sentences , github
# 123 neg, 56 pos
# aruca finetuning sentiment
# small norbert3 
# kushtrin, sentiment 
# MLP classifier on top


### Investigating the size of the directories

In [2]:
def check_dir(pn):
    """Print the total size and entry count of every sub-directory of *pn*.

    For each directory directly below *pn*, one line is printed with the
    summed size of its entries (in kB, rounded to two decimals) and the
    number of entries. Entries of *pn* that are not directories are skipped.
    A blank separator is printed at the end.

    Args:
        pn: Path of the parent directory; relative or absolute, with or
            without a trailing slash.
    """
    # Sorted so the report order does not depend on the file system.
    for folder in sorted(os.listdir(pn)):
        full_path = os.path.join(pn, folder)
        if not os.path.isdir(full_path):
            # Nothing to report for plain files; printing here would use
            # undefined or stale values from a previous iteration.
            continue

        entries = os.listdir(full_path)
        folder_size = sum(
            os.path.getsize(os.path.join(full_path, entry)) for entry in entries
        )
        folder_size_kb = round(folder_size / 1024, 2)
        num_files = len(entries)

        print(f"{folder} size: {folder_size_kb} kB, {num_files} files")
    print("\n")


In [3]:
# Report the size and file count of each folder below ./vg/.
print("VG contains: ")
check_dir("./vg/")

# Disabled report for the Nordlys corpus.
# NOTE(review): the disabled call below still points at "./vg/" - confirm the
# intended directory before re-enabling it.
# print("Nordlys contains: ")
# check_dir("./vg/")


VG contains: 
2005 size: 50307.78 kB, 264 files
2006 size: 74925.64 kB, 360 files
2007 size: 67835.89 kB, 351 files
2008 size: 50588.86 kB, 359 files
2009 size: 148471.12 kB, 352 files
2010 size: 129299.64 kB, 359 files
2011 size: 103358.62 kB, 359 files




### Cleaning the test data and storing it

In [4]:
# text preparation function

def clean_text(text):
    """Return a lower-cased copy of *text* with markup and noise removed.

    Steps, in order: lower-case, drop pilcrow signs, drop XML/HTML tags,
    drop URLs, drop tokens containing an underscore followed by ten or more
    word characters, and finally drop every character that is not a word
    character, whitespace or one of ``. , ! ?``.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text. Whitespace left behind by removed spans is kept.
    """
    text = text.lower()
    text = text.replace("¶", "")
    text = re.sub(r'<[^>]*>|^\s*</[^>]*>$', '', text, flags=re.MULTILINE)  # removing xml tags
    # URLs must be removed before the punctuation filter below: that filter
    # deletes ':' and '/', after which the URL pattern can no longer match.
    # It runs after tag removal so a URL inside a tag attribute does not
    # swallow the text that follows the tag.
    text = re.sub(r'https?://(?:www\.)?[\w\.-]+\.\w{2,}(?:/\S*)?', '', text)  # remove url
    text = re.sub(r'\b\w*_{1}\w{10,}\b', '', text)  # removing long underscore tokens
    text = re.sub(r'[^\w\s.,!?]', '', text)  # keep only word chars, whitespace and . , ! ?

    return text


In [5]:
# Split the raw files of one directory into articles and store the cleaned text

def create_dir(dir_path):
    """Write a cleaned ``.txt`` copy of every file in *dir_path*.

    Each input file is read as latin-1 and split into articles with the
    pattern ``>(.*?)##``. Every article is passed through ``clean_text`` and
    written on its own line to
    ``./test_<parent>/test_<dir>/<name>.txt``, where ``<parent>`` and
    ``<dir>`` are the last two components of *dir_path*.

    Args:
        dir_path: Directory with the raw files, using ``/`` as separator and
            containing at least two path components (e.g. ``./vg/2005``).
    """
    pattern = re.compile(r">(.*?)##", re.DOTALL)  # one article spans from ">" to "##"

    # The target directory depends only on dir_path, so it is created once.
    path_split = dir_path.rstrip("/").split("/")
    new_dir_path = f"./test_{path_split[-2]}/test_{path_split[-1]}"
    os.makedirs(new_dir_path, exist_ok=True)

    for file_name in os.listdir(dir_path):
        file_path = os.path.join(dir_path, file_name)
        with open(file_path, "r", encoding="latin-1") as infile:
            matches = pattern.findall(infile.read())

        # Drops the last six characters of the name.
        # NOTE(review): assumes a five-character extension plus dot - confirm.
        file_wo_format = file_name[:-6]

        # Written as latin-1 because create_test_set reads these files with
        # that encoding; the platform default (often UTF-8) would garble
        # non-ASCII letters on the way back in.
        with open(f"{new_dir_path}/{file_wo_format}.txt", "w", encoding="latin-1") as outfile:
            for text in matches:
                cleaned_text = clean_text(text.strip())
                outfile.write(cleaned_text.strip() + "\n")
     

In [6]:
# creating test file directories for different newspapers with years

search_paths = ["./vg",]


for paths in search_paths:
    for path in os.listdir(paths):
        # Skip stray files (e.g. .DS_Store): create_dir lists its argument
        # and would fail on anything that is not a directory.
        if not os.path.isdir(os.path.join(paths, path)):
            continue
        create_dir(f"./{paths}/{path}")


#### Deleting articles that do not meet the requirements (elimination by search phrase)
#### Assigning articles to synthetic ids
#### Creating accompanying metadata in a JSON file

In [33]:
# function used for iterating through each new test folder
# takes a list of phrases to find keywords and rearrange articles

def create_test_set(path, phrases, out_dir="./test"):
    """Extract the articles in *path* that mention any of *phrases*.

    Every file in *path* is read as latin-1 and split into articles on blank
    lines. An article is kept when it matches one of *phrases* as a whole
    word (case-insensitive) and does not contain an excluded word. Each kept
    article is written to ``<out_dir>/<file stem>-<n>.txt`` with ``n``
    counting from 1 per input file.

    Args:
        path: Directory of cleaned ``.txt`` files, using ``/`` as separator;
            its last two components are stored as ``paper`` and ``year``.
        phrases: Regular-expression fragments, joined with ``|``.
        out_dir: Directory receiving the article files; created if missing.

    Returns:
        Dict mapping each written article id to
        ``{"paper": ..., "year": ..., "sentiment_score": None}``.
    """
    metadata_dict = {}  # dict for metadata file
    if not phrases:
        # An empty alternation would match at every word boundary and keep
        # every article; with nothing to search for, nothing is selected.
        return metadata_dict

    pattern1 = re.compile(r'\b(?:' + '|'.join(phrases) + r')\b', re.IGNORECASE)
    # Compared against the lower-cased article, so lower-case entries suffice.
    exclude_words = {"eksamen"}

    os.makedirs(out_dir, exist_ok=True)

    for file_name in os.listdir(path):
        file_path = path + "/" + file_name
        split_path = file_path.split("/")
        id_name = os.path.splitext(file_name)[0]

        with open(file_path, "r", encoding="latin-1") as in_file:
            articles = re.split(r'\n\s*\n', in_file.read())

        count = 1  # running number of kept articles within this file
        for article in articles:
            if not pattern1.search(article):
                continue
            lowered = article.lower()
            if any(exclude in lowered for exclude in exclude_words):
                continue

            out_file = f"{id_name}-{count}"
            out_file_path = os.path.join(out_dir, f"{out_file}.txt")
            metadata_dict[out_file] = {
                "paper": split_path[-3],
                "year": split_path[-2],
                "sentiment_score": None,
            }
            with open(out_file_path, "w", encoding="latin-1") as out_handle:
                out_handle.write(article)
            count += 1

    return metadata_dict
            

In [37]:
# List of search paths relative to the working directory.
# NOTE(review): the original note mentions directories for vg,
# fedrelandsvennen and nordlys, but only "./test_vg" is listed below.

# Collected metadata for every article written to ./test
metadata_dict = {}
# Regex fragments; create_test_set joins them into one alternation.
search_phrases = ["samers?", "samenes?", "samisk", "sápmi", "sametinget", "reindrift", "reinsdyr", "urbefolknings rettigheter"]
search_paths = ["./test_vg"]


# create_test_set writes into ./test, so the directory has to exist first.
if not os.path.exists("./test"):
    os.mkdir("./test")
    print("Directory created successfully.")
else: 
    print("Directory already exists")


# Run the extraction for every sub-directory and merge the per-directory
# metadata into one dict.
for paths in search_paths:
    for path in os.listdir(paths):
        meta_temp = create_test_set(f"./{paths}/{path}", search_phrases)
        metadata_dict.update(meta_temp)

# Store the merged metadata as indented JSON.
with open("./test_metadata.json", "w") as json_file:
    json.dump(metadata_dict, json_file, indent=4)


Directory created successfully.


#### Making sure metadata and num files are same length

In [35]:
# Number of entries in ./test, to compare with the metadata length.
count = len(os.listdir("./test"))
print(count)

205


In [36]:
# Load test_metadata.json and print how many entries it holds.
with open("test_metadata.json", "r") as f:
    d = json.load(f)
print(len(d))

205
