In [297]:
import pandas as pd
from bs4 import BeautifulSoup
from os import listdir
import glob
import re
import warnings
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import gensim
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models
# Silence every warning for the rest of the session (this also hides library deprecation notices).
warnings.filterwarnings('ignore')
# English stop-word list from NLTK; cleanhtml uses it to drop stop words.
ENG_STOP = stopwords.words("ENGLISH")
# Glob pattern for the raw FBIS files read by get_documents.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
location = r"C:\Users\rohee\Downloads\FBIS\*"
from copkmeans.cop_kmeans import cop_kmeans
import random
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity


In [281]:
def check_alpha(text):
    """Return the first purely alphabetic token of ``text``, lower-cased.

    ``text`` is split on whitespace. ``None`` is returned when no token is
    purely alphabetic (including when ``text`` is empty).
    """
    alphabetic_tokens = (token.lower() for token in text.split() if token.isalpha())
    return next(alphabetic_tokens, None)

In [287]:

def _tag_markup_text(tag, tag_name):
    """Serialise ``tag`` and strip its own ``<tag_name>`` wrappers and all newlines.

    A missing tag (``None``) serialises to the string "None", which is returned
    as-is so absent fields stay visible in the resulting table.
    """
    return (
        str(tag)
        .replace("<" + tag_name + ">", "")
        .replace("</" + tag_name + ">", "")
        .replace("\n", "")
    )


def get_documents(pattern=None):
    """Parse the FBIS files into a DataFrame with one row per ``<doc>`` element.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern of the files to read. Defaults to the module-level
        ``location``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Doc`` (``<docno>``), ``Text`` (``<text>``, inner markup kept)
        and ``Date`` (``<date1>``), each with newlines removed.
    """
    if pattern is None:
        pattern = location

    records = []
    # Sorted so the row order (and therefore any later slice such as df[:100])
    # does not depend on the order in which the OS lists the directory.
    for path in sorted(glob.glob(pattern)):
        # Read each file exactly once and close the handle deterministically.
        with open(path, 'r') as handle:
            # NOTE(review): no parser is named, so BeautifulSoup picks whichever is installed.
            soup = BeautifulSoup(handle.read())
        for doc in soup.find_all("doc"):
            # Query the already-parsed element directly instead of
            # re-serialising and re-parsing every document.
            records.append({
                "Doc": _tag_markup_text(doc.find("docno"), "docno"),
                "Text": _tag_markup_text(doc.find("text"), "text"),
                "Date": _tag_markup_text(doc.find("date1"), "date1"),
            })
    # Build the frame once: DataFrame.append in a loop is quadratic and was
    # removed in pandas 2.0.
    return pd.DataFrame(records, columns=["Doc", "Text", "Date"])

In [288]:
# Parse every file matched by `location` into one DataFrame (columns Doc, Text, Date).
df = get_documents()

In [289]:
#df.to_pickle("dataframe_trec")
# Keep a handle on the full collection, then continue with the first 100 rows only.
# NOTE(review): `df` is rebound here, so re-running this cell on its own makes
# `original_df` point at the already-truncated frame.
original_df = df
df = df[:100]

In [292]:
## clean the data
def cleanhtml(raw_html):
    """Strip markup from ``raw_html`` and return its lower-cased content words.

    Everything matching ``<.*?>`` is removed, "/" becomes a space, and the
    result is split on single spaces. Only purely alphabetic tokens that are
    not in ``ENG_STOP`` are kept, in their original order.
    """
    tag_pattern = re.compile('<.*?>')
    without_tags = tag_pattern.sub('', raw_html).replace("/", " ")
    lowered_tokens = (token.lower() for token in without_tags.split(" "))
    return [
        token
        for token in lowered_tokens
        if token.isalpha() and token not in ENG_STOP
    ]
# Raw document texts and their tokenised, stop-word-filtered counterparts.
data = df["Text"].tolist()
cleaned_data = list(map(cleanhtml, data))

In [253]:
## Just for checking
#docs = [" ".join(doc) for doc in cleaned_data]
#temp = pd.DataFrame(docs, columns=["text"])

In [295]:
## Word Embedding

# Load pre-trained word vectors from a gzip-compressed binary word2vec file as gensim KeyedVectors.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
GoogleModel = gensim.models.KeyedVectors.load_word2vec_format(r"C:\Users\rohee\Downloads\COP\GoogleNews-vectors-negative300.bin.gz", binary=True,)



In [321]:
def get_bow(text,max_features):
    """Build a bag-of-words count table from pre-tokenised documents.

    Parameters
    ----------
    text : iterable of list of str
        One token list per document. Tokens are used as-is (no further
        tokenisation or lower-casing).
    max_features : int or None
        Passed to ``CountVectorizer(max_features=...)`` to cap the vocabulary.

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per vocabulary word, holding counts.
    """
    # The identity tokenizer makes the vectorizer accept ready-made token lists.
    vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False, max_features=max_features)
    # Fit on the argument that was passed in, not on the global `cleaned_data`.
    bow = vectorizer.fit_transform(text)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when present, fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    return pd.DataFrame(bow.toarray(), columns=feature_names)

In [324]:
# Defining a function which takes text input and returns one vector for each sentence
def FunctionText2Vec(inpTextData, cleaned_data,max_features):
    """Represent each document as summed word vectors followed by word counts.

    Parameters
    ----------
    inpTextData
        Unused; kept so existing calls keep working.
    cleaned_data : iterable of list of str
        One token list per document.
    max_features : int or None
        Vocabulary cap forwarded to ``get_bow``.

    Returns
    -------
    pandas.DataFrame
        One row per document: first the embedding columns (labelled 0..dim-1),
        then the bag-of-words count columns from ``get_bow``.
    """
    CountVectData = get_bow(cleaned_data, max_features=max_features)
    WordsVocab = CountVectData.columns
    counts = CountVectData.to_numpy()

    # One accumulator row per document, filled in place. Growing a DataFrame
    # with .append per row is quadratic and no longer exists in pandas 2.0.
    doc_vectors = np.zeros((counts.shape[0], GoogleModel.vector_size))
    for row_idx in range(counts.shape[0]):
        # Each vocabulary word present in the document contributes its vector
        # once, regardless of how often it occurs; unknown words are skipped.
        for word in WordsVocab[counts[row_idx] >= 1]:
            if word in GoogleModel.key_to_index:
                doc_vectors[row_idx] += GoogleModel[word]

    W2Vec_Data = pd.DataFrame(doc_vectors)
    return pd.concat([W2Vec_Data, CountVectData], axis=1)

In [325]:
# Calling the function to convert all the text data to Word2Vec Vectors
# Build the combined feature table: summed word vectors followed by
# bag-of-words counts (vocabulary capped at 10000 words).
# The first argument is not used by FunctionText2Vec.
Data=FunctionText2Vec(df['Text'], cleaned_data,max_features=10000)
# Checking the new representation for sentences
Data.shape

(100, 10300)

## LDA Topic Modelling

In [326]:
## LDA (only using BOW)
# Vocabulary (token -> id) and bag-of-words encoding of every cleaned document.
texts = cleaned_data
id2word = corpora.Dictionary(cleaned_data)
corpus = list(map(id2word.doc2bow, texts))

# 20-topic LDA over that corpus; random_state is fixed at 100.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Tabulate the dominant LDA topic of every document.

    Parameters
    ----------
    ldamodel
        Trained LDA model. Indexing it with ``corpus`` must yield, per
        document, a sequence whose first element is the list of
        ``(topic_id, probability)`` pairs (as with ``per_word_topics=True``).
    corpus
        Bag-of-words corpus to score.
    texts
        Per-document content appended as the last column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` and ``Topic_Keywords`` plus one unnamed
        column (label ``0``) holding ``texts``.
    """
    records = []
    for row in ldamodel[corpus]:
        topic_dist = row[0]
        if not topic_dist:
            # Keep one row per document so the table stays aligned with `texts`.
            records.append((float("nan"), ""))
            continue
        # max() returns the first highest-probability pair, the same pair a
        # stable descending sort would put first.
        topic_num, _ = max(topic_dist, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), topic_keywords))

    # Built in one go: DataFrame.append in a loop is quadratic and was removed
    # in pandas 2.0.
    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Topic_Keywords'])

    # Add the supplied texts to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Dominant topic and its keywords for every document. The token lists in
# `cleaned_data` (not the raw strings) are attached as the text column.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=cleaned_data)

## Constrained Clustering

In [59]:
# Hand-written example constraint pairs (presumably document row positions — TODO confirm).
# NOTE(review): neither list is referenced by any other cell visible in this notebook.
must_link_dummy = [(0, 10), (0, 20), (0, 30)]
cannot_link_dummy = [(1, 10), (2, 10), (3, 10)]

In [35]:
# Expose the row position as an explicit "index" column so it can serve as a
# document id. Guarded so the cell is safe to re-run: an unconditional
# in-place reset would keep inserting additional index columns.
if "index" not in df_topic_sents_keywords.columns:
    df_topic_sents_keywords = df_topic_sents_keywords.reset_index()
topics = df_topic_sents_keywords[["index","Dominant_Topic"]]

In [96]:
# Constrained k-means (k=15) over the combined embedding + bag-of-words table.
# NOTE(review): `sampled_must_link` / `sampled_cannot_link` are only assigned in a
# later cell of this notebook, so this cell fails on a fresh top-to-bottom run.
clusters, centers = cop_kmeans(dataset=np.array(Data), k=15, ml=sampled_must_link,cl=sampled_cannot_link)

In [182]:
# Pair every document id with the cluster label assigned by cop_kmeans.
cluster_res = pd.DataFrame({"Doc": topics["index"], "Cluster": clusters})


In [254]:
#set(cluster_res[cluster_res["Cluster"]==11]["Doc"])-set(topics[topics["Dominant_Topic"]==5]["index"])

In [395]:
## get topic probabilities
# One dense probability vector per document. The model returns a sparse list
# of (topic_id, probability) pairs that may leave very unlikely topics out,
# so each value is placed by topic id instead of relying on list position.
num_topics = lda_model.num_topics
min_prob = np.finfo(float).eps  # `np`: the notebook imports numpy only under this alias
doc_topic_rows = []
for i in range(len(df)):
    dense = np.zeros(num_topics)
    for topic_id, prob in lda_model.get_document_topics(bow=corpus[i], minimum_probability=min_prob):
        dense[topic_id] = prob
    doc_topic_rows.append(dense)

# Built once rather than grown row by row with .loc.
topic_prob_df = pd.DataFrame(
    doc_topic_rows,
    columns=["topic_"+str(i) for i in range(num_topics)],
    index=pd.Index([str(i) for i in range(len(doc_topic_rows))], name="Doc"),
)

## get cosine of the topic probabilities
cosine_array = cosine_similarity(topic_prob_df)
similarity_df = pd.DataFrame(cosine_array, columns=["Doc_"+str(i) for i in range(len(df))])


In [402]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [403]:
# Display the pairwise cosine-similarity table of the documents' topic distributions.
similarity_df

Unnamed: 0,Doc_0,Doc_1,Doc_2,Doc_3,Doc_4,Doc_5,Doc_6,Doc_7,Doc_8,Doc_9,Doc_10,Doc_11,Doc_12,Doc_13,Doc_14,Doc_15,Doc_16,Doc_17,Doc_18,Doc_19,Doc_20,Doc_21,Doc_22,Doc_23,Doc_24,Doc_25,Doc_26,Doc_27,Doc_28,Doc_29,Doc_30,Doc_31,Doc_32,Doc_33,Doc_34,Doc_35,Doc_36,Doc_37,Doc_38,Doc_39,Doc_40,Doc_41,Doc_42,Doc_43,Doc_44,Doc_45,Doc_46,Doc_47,Doc_48,Doc_49,Doc_50,Doc_51,Doc_52,Doc_53,Doc_54,Doc_55,Doc_56,Doc_57,Doc_58,Doc_59,Doc_60,Doc_61,Doc_62,Doc_63,Doc_64,Doc_65,Doc_66,Doc_67,Doc_68,Doc_69,Doc_70,Doc_71,Doc_72,Doc_73,Doc_74,Doc_75,Doc_76,Doc_77,Doc_78,Doc_79,Doc_80,Doc_81,Doc_82,Doc_83,Doc_84,Doc_85,Doc_86,Doc_87,Doc_88,Doc_89,Doc_90,Doc_91,Doc_92,Doc_93,Doc_94,Doc_95,Doc_96,Doc_97,Doc_98,Doc_99
0,1.000000,0.000102,0.000267,0.000146,0.000220,0.000148,0.000160,0.000143,0.000153,0.000479,0.000153,0.000507,0.000163,0.000116,0.000152,0.826060,0.000206,0.000236,0.000255,1.000000,0.000302,0.000355,0.000104,0.000332,0.000375,0.000151,0.000137,0.000212,0.000137,0.000159,0.000266,0.000145,0.000390,0.000134,0.014001,0.999990,0.000194,0.000093,0.000150,0.482689,0.000165,0.000181,0.000127,0.023792,0.000142,0.000105,0.000276,0.000135,0.000191,0.000141,0.000326,0.000160,0.000175,0.000148,0.000148,0.000228,0.000177,0.000103,0.000203,0.000116,0.000178,0.000190,0.000726,0.000134,0.000117,0.000132,0.000113,0.000193,0.000149,0.000207,0.000141,0.000134,0.000141,0.000134,0.000141,0.000176,0.000119,0.000319,0.000152,0.000449,1.000000,0.000134,0.000644,0.000530,0.000496,0.000337,0.000254,0.000257,0.000311,0.000384,0.000265,0.001606,0.000255,0.000971,0.000192,0.000574,0.000567,0.000694,0.000290,0.000285
1,0.000102,1.000000,0.000170,0.000086,0.000096,0.000046,0.000078,0.000063,0.000072,0.000453,0.000072,0.000447,0.000090,0.000040,0.000079,0.563665,0.000125,0.000111,0.099547,0.000165,0.000263,0.000299,0.000020,0.000173,0.000296,0.000039,0.000031,0.000103,0.000026,0.000086,0.490093,0.000043,0.000354,0.000076,0.000040,0.004533,0.000125,0.000033,0.000072,0.875818,0.000085,0.000078,0.000078,0.000123,0.999685,0.000048,0.649196,0.000055,0.000085,0.000076,0.000226,0.000066,0.000072,0.000095,0.000075,0.000093,0.000099,0.000020,0.000143,0.000028,0.000099,0.000100,0.000614,0.000052,0.000041,0.000072,0.000022,0.000141,0.000065,0.000121,0.000054,0.000030,0.000054,0.000030,0.000054,0.000120,0.000052,0.000264,0.000063,0.000380,0.000054,0.000046,0.066715,0.000457,0.000466,0.000288,1.000000,0.000209,0.000273,0.000302,0.000160,0.001626,0.000207,0.000985,0.000140,0.000503,0.000546,0.124685,0.000233,0.000210
2,0.000267,0.000170,1.000000,0.000240,0.000364,0.000247,0.002467,0.000237,0.000252,0.000778,0.000253,0.165341,0.911471,0.000192,0.000250,0.000623,0.000339,0.000390,0.000422,0.000361,0.000493,0.911577,0.000173,0.095285,0.000613,0.089527,0.011808,0.142321,0.022098,0.911469,0.794626,0.000242,0.000634,0.000222,0.000286,0.000244,0.911488,0.375565,0.009094,0.000290,0.000272,0.000300,0.002982,0.149765,0.000234,0.000173,0.693442,0.168151,0.851011,0.000233,0.000536,0.024743,0.000290,0.000244,0.000245,0.000379,0.000293,0.000173,0.000333,0.167617,0.000294,0.000314,0.375406,0.000223,0.000194,0.002346,0.000188,0.375692,0.000247,0.869702,0.000234,0.000223,0.000234,0.000223,0.000234,0.000290,0.000197,0.059724,0.243615,0.663590,0.000196,0.000223,0.001046,0.000864,0.000806,0.000549,0.000415,0.000420,0.000508,0.000628,0.012169,0.002604,0.000416,0.001575,0.375691,0.000935,0.000921,0.001129,0.000474,0.000466
3,0.000146,0.000086,0.000240,1.000000,0.000191,0.000126,0.000140,0.000125,0.000133,0.000453,0.000134,0.000475,0.000145,0.000099,0.000134,0.000353,0.000184,0.000206,0.117586,0.000202,0.000283,0.000330,0.000086,0.000293,0.000346,0.823874,0.000115,0.000186,0.000114,0.000141,0.000240,0.000123,0.000367,0.000120,0.000144,0.000132,0.000175,0.000080,0.000132,0.000153,0.000145,0.000157,0.000114,0.000224,0.000124,0.000091,0.000249,0.000117,0.000166,0.000125,0.000296,0.000139,0.000152,0.000134,0.000131,0.000197,0.000157,0.000086,0.000185,0.000098,0.000158,0.000168,0.000676,0.000116,0.000100,0.000117,0.000094,0.000177,0.000129,0.000185,0.000121,0.000112,0.000121,0.000112,0.000121,1.000000,0.000104,0.000296,0.000132,0.000418,0.000103,0.000115,0.000609,0.261468,0.999999,0.000314,0.000233,0.000238,0.000291,0.000354,0.000237,0.355018,0.000236,0.000929,0.000176,0.000537,0.000538,0.000655,0.000268,0.000260
4,0.000220,0.000096,0.000364,0.000191,1.000000,0.930506,0.000189,0.000160,0.000177,0.000841,0.000177,0.000849,0.000206,0.000116,0.000185,0.000602,0.366440,0.815007,0.258070,0.000332,0.000501,0.000576,0.000084,0.776430,0.000584,0.000134,0.930060,0.000249,0.000110,0.000199,0.000369,0.930507,0.000665,0.000171,0.617216,0.000192,0.000267,0.000094,0.000175,0.000204,0.000200,0.347689,0.000170,0.284230,0.023477,0.000120,0.000385,0.000146,0.000215,0.000175,0.688018,0.014843,0.000190,0.000203,0.000178,0.556611,0.000224,0.000084,0.000295,0.028610,0.000226,0.000233,0.151097,0.000141,0.000118,0.000165,0.000091,0.000287,0.000166,0.000269,0.366347,0.930501,0.366347,0.930501,0.366347,0.000250,0.000133,0.000512,0.030751,0.000731,0.000135,0.370821,0.049039,0.000874,0.000867,0.000552,0.000388,0.000408,0.000519,0.930623,0.933792,0.002952,0.000405,0.001787,0.000285,0.927903,0.001007,0.364280,0.218186,0.366531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000574,0.000503,0.000935,0.000537,0.927903,0.996975,0.000659,0.000621,0.000642,0.001056,0.000642,0.001254,0.000628,0.000548,0.000607,0.001066,0.000748,0.873425,0.189376,0.000668,0.000766,0.000940,0.000556,0.741748,0.001074,0.000837,0.996514,0.000878,0.061514,0.000621,0.000914,0.996977,0.000923,0.000512,0.661661,0.000552,0.000679,0.000436,0.000627,0.000733,0.000662,0.000789,0.000456,0.291662,0.025538,0.000444,0.000944,0.000602,0.000825,0.000554,0.737308,0.008247,0.000777,0.000519,0.000601,0.583098,0.000682,0.000556,0.000655,0.000604,0.000683,0.000758,0.001912,0.000612,0.000550,0.000513,0.071665,0.000599,0.000647,0.023904,0.000643,0.996976,0.000643,0.996976,0.000643,0.000587,0.000518,0.000862,0.008789,0.001182,0.000503,0.005440,0.077806,0.001358,0.001106,0.000872,0.000750,0.000711,0.000781,0.996996,0.993060,0.003101,0.000704,0.001869,0.000598,1.000000,0.001211,0.001581,0.233961,0.001720
96,0.000567,0.000546,0.000921,0.000538,0.001007,0.000784,0.000683,0.000652,0.000669,0.000821,0.000669,0.001051,0.000637,0.000590,0.000623,0.000955,0.000745,0.001035,0.001082,0.000622,0.000650,0.000818,0.000615,0.001357,0.000973,0.000846,0.000783,0.000912,0.000815,0.000634,0.000893,0.000780,1.000000,0.000519,0.000981,0.000555,0.000667,0.000468,0.002332,0.000762,0.000681,0.000830,0.000453,0.001102,0.000599,0.000464,0.000920,0.000638,0.000866,0.000566,0.001023,0.000753,0.000822,0.000511,0.000620,0.001080,0.000692,0.000615,0.000625,0.000663,0.000693,0.000777,0.001659,0.000652,0.000591,0.000523,0.000669,0.000563,0.000681,0.000774,0.000684,0.000769,0.000684,0.000769,0.000684,0.000567,0.000545,0.000760,0.000716,0.001026,0.000526,0.000681,0.001094,0.001162,0.000868,0.000750,0.000689,0.000634,0.000659,0.001004,0.000964,0.188907,0.999999,0.999998,0.000562,0.001211,1.000000,0.001257,0.000730,0.000824
97,0.000694,0.124685,0.001129,0.000655,0.364280,0.000927,0.000821,0.000780,0.000803,0.001110,0.000803,0.001375,0.000772,0.000700,0.000752,0.071096,0.992210,0.001242,0.235334,0.000779,0.000846,0.001054,0.000722,0.230581,0.001233,0.000998,0.000922,0.001096,0.000957,0.000767,0.061869,0.000922,0.000997,0.000629,0.001154,0.001227,0.000819,0.000556,0.000783,0.109545,0.000821,0.941726,0.000554,0.035294,0.124717,0.000557,0.081639,0.000761,0.001037,0.000684,0.001268,0.021562,0.000981,0.000626,0.000747,0.035894,0.000839,0.000722,0.000775,0.077962,0.000840,0.000938,0.407991,0.000776,0.000701,0.000633,0.000785,0.000703,0.000815,0.000941,0.992200,0.000905,0.992200,0.000905,0.992200,0.000700,0.000652,0.000974,0.083658,0.001323,0.000630,0.992192,0.139375,0.001507,0.001168,0.000971,0.124887,0.000809,0.000860,0.001271,0.020014,0.003074,0.000799,0.001849,0.000702,0.001581,0.001257,1.000000,0.000928,0.992166
98,0.000290,0.000233,0.000474,0.000268,0.218186,0.234344,0.000319,0.000297,0.000309,0.000628,0.000310,0.000713,0.000310,0.000257,0.000296,0.000581,0.000375,0.205402,0.044744,0.000354,0.000432,0.000522,0.000253,0.174564,0.000580,0.000356,0.234228,0.000425,0.000335,0.000306,0.000466,0.234342,0.000534,0.000254,0.155616,0.000275,0.000345,0.000205,0.000303,0.000354,0.000323,0.000377,0.000230,0.068772,0.006144,0.000214,0.000482,0.000286,0.000395,0.000272,0.173486,0.002108,0.000370,0.000263,0.000293,0.137207,0.000337,0.000253,0.000341,0.000277,0.000338,0.000371,0.001064,0.000289,0.000258,0.000253,0.000275,0.000315,0.000310,0.000384,0.000304,0.234333,0.000304,0.234333,0.000304,0.000302,0.000248,0.000475,0.000322,0.000658,0.000242,0.001426,0.000841,0.000763,0.000655,0.000488,0.000401,0.000389,0.000442,0.234519,0.233515,0.001948,0.000385,0.001176,0.000314,0.233961,0.000730,0.000928,1.000000,0.000464


In [404]:
# Spot check: recompute the topic-distribution similarity of documents 0 and 19
# directly from the model, for comparison with the table above.
# Uses `np` and the already-imported `cosine_similarity`; the notebook binds
# neither `numpy` nor scipy's `cosine`. Cosine similarity equals
# 1 - cosine distance, so the value is the same quantity as before.
min_prob = np.finfo(float).eps
doc1 = lda_model.get_document_topics(bow=corpus[0], minimum_probability=min_prob)
doc2 = lda_model.get_document_topics(bow=corpus[19], minimum_probability=min_prob)
cosine_similarity([[prob for _, prob in doc1]], [[prob for _, prob in doc2]])[0, 0]

0.9999999403953552

In [411]:

def get_constraints(lower_threshold, upper_threshold, similarity=None):
    """Derive must-link / cannot-link pairs from a pairwise similarity table.

    Parameters
    ----------
    lower_threshold : float
        Pairs with similarity strictly below this value become cannot-link.
    upper_threshold : float
        Pairs with similarity strictly above this value become must-link.
    similarity : array-like, optional
        Square table of pairwise similarities. Defaults to the module-level
        ``similarity_df``.

    Returns
    -------
    (list, list)
        ``(must_link, cannot_link)``, each a list of ``(row, column)`` position
        tuples in row-major order. Self-pairs are never emitted; both ``(i, j)``
        and ``(j, i)`` appear when both cells satisfy the threshold.
    """
    if similarity is None:
        similarity = similarity_df
    # Convert once: scalar .iloc look-ups inside the double loop are very slow.
    scores = np.asarray(similarity, dtype=float)
    n_docs = len(scores)

    must_link = []
    cannot_link = []
    for doc in range(n_docs):
        for other_doc in range(n_docs):
            if doc == other_doc:
                continue
            score = scores[doc, other_doc]
            if score > upper_threshold:
                must_link.append((doc, other_doc))
            elif score < lower_threshold:
                cannot_link.append((doc, other_doc))
    return must_link, cannot_link

In [412]:
# Pairs with similarity above 0.90 must share a cluster; pairs below 0.10 must not.
must_link,cannot_link = get_constraints(0.10,0.90)

In [None]:
# Keep a random 10% of each constraint list. A dedicated, seeded generator
# makes the sample (and the clustering that consumes it) reproducible.
CONSTRAINT_SAMPLE_SEED = 100
CONSTRAINT_SAMPLE_FRACTION = 0.10
_constraint_rng = random.Random(CONSTRAINT_SAMPLE_SEED)
sampled_must_link = _constraint_rng.sample(must_link, int(len(must_link) * CONSTRAINT_SAMPLE_FRACTION))
sampled_cannot_link = _constraint_rng.sample(cannot_link, int(len(cannot_link) * CONSTRAINT_SAMPLE_FRACTION))

In [415]:
# Display the working subset of documents (columns Doc, Text, Date).
df

Unnamed: 0,Doc,Text,Date
0,FBIS3-1,"POLITICIANS, PARTY PREFERENCES Summary: N...",1 March 1994
1,FBIS3-2,INTRODUCTION This guide is intended to ...,2 March 1994
2,FBIS3-3,SUMMARY Pyong...,2 March 1994
3,FBIS3-4,The following summaries highlight informat...,2 March 1994
4,FBIS3-5,SUMMARY Russia...,3 March 1994
...,...,...,...
95,FBIS3-96,"Language: <f p=""105""> English </f>Article Type...",1 Mar 1994
96,FBIS3-97,"Language: <f p=""105""> French </f>Article Type:...",1 Mar 1994
97,FBIS3-98,"Language: <f p=""105""> Tigrinya </f>Article Typ...",1 Mar 1994
98,FBIS3-99,"Language: <f p=""105"">English </f>Article Type:...",1 Mar 1994


In [417]:
# Re-check the dimensions of the combined feature table.
Data.shape

(100, 10300)