In [4]:
import pandas as pd
from bs4 import BeautifulSoup
from os import listdir
import glob
import re
import warnings
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import gensim
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models
warnings.filterwarnings('ignore')
ENG_STOP = stopwords.words("ENGLISH")
location = r"C:\Users\rohee\Downloads\FBIS\*"
from copkmeans.cop_kmeans import cop_kmeans
import random

from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [148]:
df = pd.DataFrame(columns=["Doc","Text","Date"])

for file in list(glob.glob(location)):
    with open(file, 'r') as f:
        res = f.read()
    soup = BeautifulSoup(open(file,'r').read())
    docs= []
    for doc in soup.find_all("doc"):
        docs.append([doc])
    for doc in docs:
        soup = BeautifulSoup(str(doc))
        text = soup.find("text")
        text = str(text)
        text = text.replace("<text>","").replace("</text>","").replace("\n","")
        date = soup.find("date1")
        date = str(date).replace("<date1>","").replace("</date1>","").replace("\n","")
        docno = soup.find("docno")
        docno = str(docno).replace("<docno>","").replace("</docno>","").replace("\n","")
        dict_ = {"Doc":docno,"Text":text,"Date":date}
        df= df.append(dict_,ignore_index=True)

In [168]:
def check_alpha(text):
    for x in text.split():
        if x.isalpha():
            return x.lower()

In [170]:
df["Month"] = df["Date"].apply(lambda x: check_alpha(str(x)))

In [150]:
df.to_pickle("dataframe_trec")

In [11]:

def cleanhtml(raw_html):
    CLEANR = re.compile('<.*?>') 
    cleantext = re.sub(CLEANR, '', raw_html)
    cleantext = cleantext.replace("/"," ")
    res = []
    for word in cleantext.split(" "):
        word = word.lower()
        if word.isalpha() and word not in ENG_STOP:
            res.append(word)   
    return res

In [12]:
data = df.Text.to_list()
cleaned_data = [cleanhtml(doc) for doc in data]

[['party',
  'preferences',
  'newspapers',
  'former',
  'yugoslav',
  'republic',
  'macedonia',
  'published',
  'results',
  'opinion',
  'indicating',
  'relative',
  'popularity',
  'political',
  'attitudes',
  'toward',
  'political',
  'january',
  'edition',
  'skopje',
  'newspaper',
  'vecer',
  'macedonian',
  'published',
  'pages',
  'results',
  'opinion',
  'poll',
  'conducted',
  'agency',
  'november',
  'according',
  'respondents',
  'classified',
  'age',
  'paper',
  'explain',
  'methodology',
  'give',
  'margin',
  'purpose',
  'paper',
  'cited',
  'results',
  'unidentified',
  'poll',
  'made',
  'may',
  'approval',
  'disapproval',
  'ten',
  'macedonian',
  'politicians',
  'november',
  'may',
  'kiro',
  'president',
  'republic',
  'vasil',
  'former',
  'macedonian',
  'official',
  'federal',
  'yugoslavia',
  'ljubomir',
  'interior',
  'minister',
  'stojan',
  'parliamentary',
  'chairman',
  'branko',
  'prime',
  'minister',
  'vlado',
  'defe

In [13]:
## Just for checking
docs = [" ".join(doc) for doc in cleaned_data]
temp = pd.DataFrame(docs, columns=["text"])
temp.tail()

Unnamed: 0,text
489,cso aleksandr kinsburskiy sergey turanov econo...
490,cso igor candidate economic sciences sector he...
491,cso pavel administrator affairs president russ...
492,none
493,spanish article civil guard seized kilos high ...


In [17]:
## Word Embedding

GoogleModel = gensim.models.KeyedVectors.load_word2vec_format(r"C:\Users\rohee\Downloads\GoogleNews-vectors-negative300.bin.gz", binary=True,)

# Defining a function which takes text input and returns one vector for each sentence
def FunctionText2Vec(inpTextData, max_features):
    vectorizer = CountVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(inpTextData)
    CountVectData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    # Creating empty dataframe to hold sentences
    W2Vec_Data=pd.DataFrame()
    WordsVocab=CountVectData.columns[:]
    # Looping through each row for the data
    for i in range(CountVectData.shape[0]):

        # initiating a sentence with all zeros
        Sentence = np.zeros(300)

        # Looping thru each word in the sentence and if its present in 
        # the Word2Vec model then storing its vector
        for word in WordsVocab[CountVectData.iloc[i,:]>=1]:
            #print(word)
            if word in GoogleModel.key_to_index.keys():    
                Sentence=Sentence+GoogleModel[word]
        # Appending the sentence to the dataframe
        W2Vec_Data=W2Vec_Data.append(pd.DataFrame([Sentence]), ignore_index=True)
    return pd.concat([W2Vec_Data, CountVectData], axis=1)

In [19]:
# Calling the function to convert all the text data to Word2Vec Vectors
Data=FunctionText2Vec(temp['text'], max_features=10000)
# Checking the new representation for sentences
Data.shape

(494, 1300)

## LDA Topic Modelling

In [22]:
## LDA
id2word = corpora.Dictionary(cleaned_data)
texts = cleaned_data
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=cleaned_data)

## Constrained Clustering

In [59]:
must_link_dummy = [(0, 10), (0, 20), (0, 30)]
cannot_link_dummy = [(1, 10), (2, 10), (3, 10)]

In [35]:
df_topic_sents_keywords.reset_index(inplace=True)


In [38]:
topics = df_topic_sents_keywords[["index","Dominant_Topic"]]

In [68]:
must_link = []
cannot_link = []

for doc in range(len(topics)):
    for other_doc in range(len(topics)):
        if doc != other_doc:
            if topics.iloc[doc,:]["Dominant_Topic"]==topics.iloc[other_doc,:]["Dominant_Topic"]:
                must_link.append((doc,other_doc))
            else:
                cannot_link.append((doc,other_doc))

In [None]:
sampled_must_link = random.sample(must_link,int(len(must_link)*0.10))
sampled_cannot_link = random.sample(cannot_link,int(len(cannot_link)*0.10))

In [96]:
clusters, centers = cop_kmeans(dataset=np.array(Data), k=15, ml=sampled_must_link,cl=sampled_cannot_link)

In [70]:
cluster_res = pd.DataFrame(columns=["Doc","Cluster"])
cluster_res["Doc"] = topics["index"]
cluster_res["Cluster"] = clusters


# Key Phrase Extraction

In [10]:
kw_model = KeyBERT('all-MiniLM-L6-v2')

In [None]:
# Finding top 5 key phrases for each document
keywords = kw_model.extract_keywords(docs, keyphrase_ngram_range=(3, 3), stop_words='english',
                              use_maxsum=True, nr_candidates=20, top_n=5)

In [None]:
# Embedding top 5 key phrases using Sentence Transformer
model = SentenceTransformer('all-MiniLM-L6-v2')
kw = keywords
for i, d in enumerate(kw):
    for j, k in enumerate(d):
        kw[i][j] = kw[i][j]+(model.encode(kw[i][j][0]),)

In [None]:
# Combining the top 5 embeddings using Weighted Average
weights = []
embeddings = []
for i, d in enumerate(kw):
    w = []
    emb = []
    for j, k in enumerate(d):
        w.append(kw[i][j][1])
        emb.append(kw[i][j][2])
    weights.append(w)
    embeddings.append(emb)
embeddings = np.array(embeddings)

final_vector = []
for i in range(0, embeddings.shape[0]):
    emb = embeddings[i]
    weight = weights[i]
    final_vector.append(np.average(emb.tolist(), axis=0, weights=weight))
final_vector_df = pd.DataFrame(final_vector)