# Article Similarity

Here is the script, that I used to filter the similar articles out. I was after the articles, that were written by the same group, but each author submitted one. This was allowed in the previous years.

In [1]:
#import libaries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from constants import folder_meicogsci
import pandas
import string
import scipy
import nltk
import os

In [2]:
# preparing the helping functions and lists
all_files = os.listdir(folder_meicogsci)
stop_words = set(nltk.corpus.stopwords.words("english"))
lem = nltk.stem.WordNetLemmatizer()
stem = nltk.stem.porter.PorterStemmer()

In [3]:
# checking the number of files
number_of_files = len(all_files)

In [4]:
# preprocesing the texts
all_texts_and_ids = dict()
for filename in all_files:
    text_name = filename.split("_")[-1].replace(".txt", "")
    with open(os.path.join(folder_meicogsci, filename)) as f:
        data_file = f.readlines()
    # the the text is being put in the format, that makes it more informative to compare
    data_file = " ".join(data_file)
    data_file = data_file.replace("... title:", "")
    data_file = data_file.lower()
    data_file = data_file.replace("-", " ").replace("/", " ").replace("â€“", " ") 
    # here, the test is being split into words and compared based on that
    data_words = nltk.tokenize.word_tokenize(data_file)
    data_words = [w.strip() for w in data_words if w.strip()]
    all_texts_and_ids[text_name] = data_words

In [5]:
# helping function
def dummy(doc):
    return doc

In [6]:
# preparing for creating a model
all_texts = [text for text in all_texts_and_ids.values()]

In [7]:
vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy)

In [8]:
vectorizer_fit = vectorizer.fit_transform(all_texts)

In [9]:
# creating a model
tfidf_vectoring = TfidfTransformer()

In [10]:
tfidf_response = tfidf_vectoring.fit_transform(vectorizer_fit)

In [11]:
# putting TF-IDF into pandas
test = pandas.DataFrame(tfidf_response.toarray(), columns = vectorizer.get_feature_names())

In [12]:
all_ids = [current_id for current_id in all_texts_and_ids.keys()]

In [13]:
# helping function
def get_file_year(f1):
    f1 = int(f1)
    return [int(filename.split("_")[1]) for filename in all_files if int(filename.split("_")[2].split(".")[0]) == f1][0]

In [14]:
# caluclate the distance between articles
all_distances = []
for i in range(number_of_files):
    for j in range(number_of_files):
        if i > j:
            d = scipy.spatial.distance.cosine(test.iloc[[i]], test.iloc[[j]])
            all_distances.append([d, all_ids[i], all_ids[j], get_file_year(all_ids[i]), get_file_year(all_ids[j])])

In [16]:
all_distances = pandas.DataFrame(all_distances, columns=["Cosine", "ID1", "ID2", "year1", "year2"])

The articles here are the ones that already survived the filtering. 

In [17]:
# sort by differences in articles
all_distances.sort_values(by="Cosine", ascending=True).head(20)

Unnamed: 0,Cosine,ID1,ID2,year1,year2
209273,0.04767,360,407,2012,2013
274175,0.091252,378,225,2012,2011
29993,0.101792,773,629,2015,2014
238843,0.149487,1037,1166,2017,2018
83502,0.167938,256,294,2011,2012
55725,0.170033,565,700,2014,2015
73750,0.17536,892,631,2016,2014
201008,0.23122,219,361,2011,2012
112224,0.238934,411,424,2013,2013
3909,0.254491,722,801,2015,2016
