## 3. script
### This script cleans the dataframe and creates the adjacency matrix for the network
#### Loading packages

In [None]:
import re
import os
import glob
import json
import time
from ast import literal_eval
from itertools import combinations

import pandas as pd
import requests
from fuzzywuzzy.process import dedupe as fuzzy_dedupe
from langdetect import detect
from langdetect import DetectorFactory

##### First, we need to load the CSV file we just created. Replace the path within the quotation marks with your own. It should point to the 'per_dacy_large.csv' file within the 'csv' folder.

In [136]:
# Absolute location of 'per_dacy_large.csv'; change this to match your own machine.
dacy_csv = 'C:\\Users\\Sarah\\Desktop\\Cultural_data_science\\Exam\\ibsen_network\\data\\csv\\per_dacy_large.csv'

# The 'dacy_large' column is run through literal_eval while reading, so each
# cell becomes a Python object (a list in the preview below) instead of text.
df = pd.read_csv(dacy_csv, converters={'dacy_large': literal_eval})

In [137]:
# Preview the freshly loaded frame (pandas truncates the display automatically).
df

Unnamed: 0,files,text,dacy_large
0,BREV_B1844-1871ht_18670308FH,,[]
1,BREV_B1844-1871ht_B18260306NTB,,[]
2,BREV_B1844-1871ht_B18440520PL,"Du maa virkelig undskylde, at jeg først nu bes...","[Hedevall, Reimann, Reimann, Johan, Carl Aamod..."
3,BREV_B1844-1871ht_B18441006HSte,"Tilgiv at jeg ikke før har besvaret Dit Brev, ...","[Reimann, Mdm Reimann, Mdm R, Reimanns, Reiman..."
4,BREV_B1844-1871ht_B18450801AWE,\nJomfru M: Wahl hilses venskabeligst fra,[M: Wahl]
...,...,...,...
2444,BREV_B1890-1905ht_BudatNN_Hjerteligste,\nHjerteligste ønsker!Tør desværre ikke selv p...,[]
2445,BREV_B1890-1905ht_BudatNN_Med_udtrykket,\n\nMed udtrykket af min sympati for komitéens...,[]
2446,BREV_B1890-1905ht_BudatNN_Tallene,\nTallene må utvilsomt være skrevne af mig sel...,[]
2447,BREV_B1890-1905ht_BudatNN_Wenn_Sie,\n\n\n Wenn Sie keine andere Verwendung für Ih...,[]


##### We need to drop empty letters first. Then we remove the letters that were not detected as Danish, Norwegian or Swedish, as DaCy Large's predictions won't be good for the other languages. Finally, we drop the letters in which no names were found.

In [138]:
# langdetect is non-deterministic unless its seed is fixed; pinning it makes the
# detected languages (and therefore the letters kept below) reproducible.
DetectorFactory.seed = 0

# Drop letters without text first (they cannot be language-detected), then tag
# every remaining letter with its detected language code. Building a new frame
# instead of mutating in place keeps this cell safe to re-run.
df = (
    df.dropna(subset=["text"])
      .assign(lang=lambda letters: letters['text'].apply(detect))
)

In [139]:
# Distinct language codes that were assigned to the letters.
detected_langs = df['lang'].unique()
detected_langs

array(['da', 'no', 'de', 'fr', 'en', 'it', 'nl', 'sv', 'vi', 'sw', 'af',
       'hu', 'id'], dtype=object)

In [140]:
# Keep only the letters detected as Danish, Norwegian or Swedish.
scandinavian = ['da', 'no', 'sv']
df = df[df["lang"].isin(scandinavian)]

In [141]:
# Keep only the letters in which at least one entity was found.
has_entities = df['dacy_large'].map(len) > 0
df = df[has_entities]

##### Now we can create the adjacency matrix. The data needs to be in dictionary format for the functions to work.

In [145]:
# Only the letter text and its entity list are needed from here on.
df = df.loc[:, ['text', 'dacy_large']]

In [146]:
# One entry per row, keyed by the row's index label: {index: {column: value}}.
data = df.to_dict(orient='index')

##### With the data in dictionary format, we can create a function that removes duplicate names in each letter.

In [147]:
def remove_dupes(article):
    """Collapse near-duplicate names from one letter with fuzzy matching.

    Parameters
    ----------
    article : list of str, or str
        The names found in a single letter. A string holding the text form of
        such a list (e.g. ``str(names)``) is also accepted and parsed back
        into a list first.

    Returns
    -------
    Whatever ``fuzzy_dedupe`` (fuzzywuzzy's ``process.dedupe``) returns for
    those names.
    """
    # Work on the argument rather than on notebook globals, so the result
    # depends only on what the caller passes in.
    if isinstance(article, str):
        article = literal_eval(article)
    contains_dupes = list(article)
    deduped = fuzzy_dedupe(contains_dupes)
    return deduped

In [148]:
# NOTE(review): keep the loop variable named `key` -- remove_dupes may look it
# up as a notebook global; confirm before renaming.
for key in data:
    record = data[key]
    if record['text'] != '':
        # Hand over the list of names itself, not its string representation.
        record['people'] = remove_dupes(record['dacy_large'])
    else:
        # Give every letter a 'people' entry so later cells can rely on the key.
        record['people'] = []



##### Then we can create the adjacency matrix by making each entity a key and other entities in the same letter values of that key.

In [149]:
# Maps every name to the names it appears together with, accumulated over all
# letters (a name repeats in the list once per shared letter).
entities = {}

for key in data:
    # Names attached to this letter; a letter without a 'people' entry
    # contributes nothing instead of raising a KeyError.
    people = list(data[key].get('people', []))

    for ent in people:
        # Everyone else mentioned in the same letter is linked to `ent`.
        neighbours = [other for other in people if other != ent]
        # setdefault creates the list the first time a name is seen, so no
        # catch-all try/except is needed and unrelated errors stay visible.
        entities.setdefault(ent, []).extend(neighbours)

##### Now, we can create a dataframe again from the dictionary.

In [150]:
# Build a single-row frame (one column per entity) and flip it, which gives
# one row per entity with its list of linked names in column 0.
df = pd.DataFrame([entities]).T

In [151]:
# Name the index 'source' and move it into a regular column.
df = df.rename_axis('source').reset_index()

##### We also want each name on its own row, so we are exploding the target values. Additionally, we remove all names that are 3 letters or shorter, as it wouldn't be possible to get the full name of the individual from so little.

In [152]:
# Label the list column 'target', give every listed name its own row, and
# discard the rows that end up without a target.
df = (
    df.rename(columns={0: 'target'})
      .explode('target')
      .dropna(subset=['target'])
)

In [153]:
# A row survives only when both of its names are longer than three characters.
target_ok = df['target'].str.len().gt(3)
source_ok = df['source'].str.len().gt(3)
mask = target_ok & source_ok
df = df.loc[mask]

In [154]:
# Preview the resulting source/target pairs.
df

Unnamed: 0,source,target
0,Hedevall,Reimann
0,Hedevall,Johan
0,Hedevall,Carl Aamodt
1,Reimann,Hedevall
1,Reimann,Johan
...,...,...
1891,Laura Fitinghoff,Henrik Ibsen
1893,Wang,Peer Gynt
1894,Helga,Sigurds
1895,Hr. Skuespiller Garmann,Henrik Ibsen


##### The dataframe is now ready to be saved as a CSV file and imported to OpenRefine for the next preprocessing steps. The path should point to the CSV folder again.

In [1]:
# Folder that receives the exported CSV; change this to match your own machine.
# Backslashes are escaped in a normal (non-raw) string: combining the r-prefix
# with doubled backslashes would keep both backslashes in the path.
path = "C:\\Users\\Sarah\\Desktop\\Cultural_data_science\\Exam\\ibsen_network\\data\\csv"

In [156]:
# Write the source/target pairs into the folder held in `path`; without the
# folder argument the file would land in the current working directory.
# index=True keeps the row index as the first column of the file.
df.to_csv(os.path.join(path, 'data_links.csv'), encoding='utf-8', index=True)

#### To reproduce the code within OpenRefine, the 'data_links.csv' file should be opened as a project. The TXT files named 'procedures_openrefine_target.txt' and 'procedures_openrefine_target.txt' should be applied to the dataframe in the project. Simply open them, copy their contents and paste them into the 'Undo / Redo' tab (via 'Apply…'). Then, save the CSV file as 'OR_data_links.csv'.