# Semesteroppgave

### Text preparation of test sets


In [1]:
# imports for data preparation 
import os
import os.path
import re

# imports for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns


# Auxiliary
import numpy as np
from random import shuffle

class TaggedPoint():
    def __init__(self, data, tag):
        self.data = data
        self.tag = tag

# sentiment analyse for nowegain, norec corpus
# norec sentences , github
# 123 neg, 56 pos
# aruca finetuning sentiment
# small norbert3 
# kushtrin, sentiment 
# mlp classifier på topp


### Investageting the size of the directories

In [2]:
def check_dir(pn):
    for folder in os.listdir(pn):
        new_path = pn+folder
        if os.path.isdir(new_path):
            folder_size = sum(os.path.getsize(os.path.join(new_path, file)) for file in os.listdir(new_path))
            folder_size_kb = round(folder_size / 1024, 2)      
            num_files = sum(1 for f in os.listdir("./"+new_path))

        
        print(f"{folder} size: {folder_size_kb} kB, {num_files} files")
    print("\n")
    #print(f"{folder}_size: {os.path.getsize()}")


In [3]:
print("VG contains: ")
check_dir("./vg/")

print("Fedrelandsvennen contains : ")
check_dir("./fv/")

print("Nordlys contains: ")
check_dir("./vg/")


# test set på år 
# trnne på norec


VG contains: 
2005 size: 50307.78 kB, 264 files
2006 size: 74914.96 kB, 360 files
2007 size: 67835.89 kB, 351 files
2008 size: 50588.86 kB, 359 files
2009 size: 148471.12 kB, 352 files
2010 size: 129299.64 kB, 359 files
2011 size: 103358.62 kB, 359 files


Fedrelandsvennen contains : 
2005 size: 34287.34 kB, 260 files
2006 size: 53280.71 kB, 359 files
2007 size: 288423.57 kB, 351 files
2008 size: 79907.46 kB, 343 files
2009 size: 33930.09 kB, 166 files
2010 size: 88293.11 kB, 361 files
2011 size: 75608.53 kB, 359 files


Nordlys contains: 
2005 size: 50307.78 kB, 264 files
2006 size: 74914.96 kB, 360 files
2007 size: 67835.89 kB, 351 files
2008 size: 50588.86 kB, 359 files
2009 size: 148471.12 kB, 352 files
2010 size: 129299.64 kB, 359 files
2011 size: 103358.62 kB, 359 files




### Cleaning the test data and storing it

In [4]:
# text prepartion function

def clean_text_func(text):
    text = text.lower()
    text = text.replace("¶", "")
    text = re.sub(r'<[^>]*>|^\s*</[^>]*>$', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s.,!?]|\_\w{35,}', '', text, flags=re.MULTILINE)
    return text


In [5]:
# 

def create_test_dir(dir_path):
    pattern = re.compile(r">(.*?)##", re.DOTALL) # regex compile pattern

    for file_name in os.listdir(f"{dir_path}"): # iterate every filname in target dir
        file_path = os.path.join(dir_path, file_name)
        with open(file_path, "r", encoding="latin-1") as infile: # read files
            html_text = infile.read()
            matches = pattern.findall(html_text) # separates every article based on regex pattern (from > to ##)
            
                        
        file_wo_format = file_name[:-6] # file name without format
        path_split = dir_path.split("/") 
        new_dir_path = f"./test_{path_split[-2]}/test_{path_split[-1]}" 
        os.makedirs(new_dir_path,exist_ok=True)

        with open(f"./{new_dir_path}/{file_wo_format}.txt", "w") as outfile: # cleans text further and writes to new format
            for text in matches:
                cleaned_text = clean_text_func(text.strip())
                outfile.write(cleaned_text.strip()+"\n")
     

In [6]:
# creating test file directories for different news papers with years

search_paths = ["./vg", "./fv", "./nl"] 


for paths in search_paths:
    for path in os.listdir(paths):
        create_test_dir(f"./vg/{path}")
        create_test_dir(f"./{paths}/{path}")


### Deleting articles that does not meet the requirements (elimination by search phrase)

In [7]:
# function used for iterating through each new test folder
# takes list as param to find kewords and rearrange articles

def phrase_search(path, phrases):
    pattern = re.compile(r'\b(?:' + '|'.join(phrases) + r')\b', re.IGNORECASE)
    exclude_words = {"eksamen", "Eksamen"}

    for fn in os.listdir(path):
        fp = path+"/"+fn
        sp = fp.split("/")

        with open(fp, "r", encoding="latin-1") as file:
            content = file.read()
            articles = re.split(r'\n\s*\n', content.strip())


        filtered_articles = []
        for article in articles:
            if re.search(pattern, article):
                if not any(exclude in article.lower() for exclude in exclude_words):
                    filtered_articles.append(article)


        with open(f"./{sp[-3]}_sami_articles.txt", "a", encoding="latin-1") as file:
            file.write("\n\n\n".join(filtered_articles))
            #file.write("\n\n")


In [10]:
# list of search paths from work work directory
# including directory for vg, fedrelandsvennen, nordlys



search_phrases = ["same", "samer", "samene", "samisk", "sápmi", "sametinget", "reindrift", "urbefolkning"]
search_phrases = search_phrases+[word.capitalize() for word in search_phrases]
search_tpaths = ["./test_vg", "./test_fv", "./test_nl"] 



for paths in search_tpaths:
    for path in os.listdir(paths):
        phrase_search(f"./{paths}/{path}", search_phrases)


### Further text preparation