IMPORTING NECESSARY LIBRARIES

In [2]:
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.cluster import DBSCAN
from scipy import spatial
import operator
from dask.distributed import Client
from sklearn.externals.joblib import parallel_backend
import datetime

In [3]:
df = pd.read_csv(r"Eluvio_DS_Challenge.csv")

In [4]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [5]:
title_list = list(df['title'])
print(title_list[:10])

['Scores killed in Pakistan clashes', 'Japan resumes refuelling mission', 'US presses Egypt on Gaza border', 'Jump-start economy: Give health care to all ', 'Council of Europe bashes EU&UN terror blacklist', 'Hay presto! Farmer unveils the  illegal  mock-Tudor castle he tried to hide behind 40ft hay bales', 'Strikes, Protests and Gridlock at the Poland-Ukraine Border', 'The U.N. Mismanagement Program', 'Nicolas Sarkozy threatens to sue Ryanair ', 'US plans for missile shields in Polish town met with resistance [video]']


CREATING VECTORS FOR THE TITLE

In [6]:
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, title in enumerate(title_list[:20000]):
    words = title.lower().split()
    tags = [i]
    docs.append(analyzedDocument(words, tags))

model = doc2vec.Doc2Vec(docs, size = 100, window = 300, min_count = 1, workers = 4)



In [8]:
vec_list = []
#print(len(model.docvecs))
for i in range(len(model.docvecs)):
    vec_list.append(list(model.docvecs[i]))
vec_array = np.array(vec_list)
#list(model.docvecs[999])

In [None]:


#db_s = DBSCAN(eps = 0.5,min_samples = 2,metric = 'cosine')
#with parallel_backend('dask'):
#    db_s.fit(vec_array)


CLUSTERING BASED ON DBSCAN(Density Based Spatial Clustering) and DASK(Parallelization of the task)

In [9]:
num_classes = {}
client = Client()
time_now = datetime.datetime.now()
for i in tqdm(np.arange(0.5,1,0.1)):
    db_scan = DBSCAN(eps = i,min_samples = 2,metric = 'cosine')
    with parallel_backend('dask'):
        db_scan.fit(vec_array)
    num_classes.update({i:len(pd.Series(db_scan.labels_).value_counts())})
db_scan = DBSCAN(eps = 0.08,min_samples = 2,metric = 'cosine').fit(vec_array)
print(datetime.datetime.now() - time_now)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:26<00:00, 17.31s/it]


0:01:31.110785


In [11]:
#num_classes
res = pd.DataFrame({'label':db_scan.labels_,'title':title_list[:20000]})
exam_res = {}
eve_df = {}
labels = list(res['label'])
#print(labels)
unique_labels = np.unique(labels)
for l in unique_labels:
    exam_res[l] = res[res.label==l].title.tolist()
    print(len(exam_res[l]))
    eve_df[l] = df[df['title'].isin(exam_res[l])][['date_created','title']]
    eve_df[l]['date_created'] = pd.to_datetime(eve_df[l]['date_created'])
    eve_df[l] = eve_df[l].sort_values(by= 'date_created')

10442
9410
148


In [12]:
eve_df[0]

Unnamed: 0,date_created,title
1,2008-01-25,Japan resumes refuelling mission
3,2008-01-25,Jump-start economy: Give health care to all
4,2008-01-25,Council of Europe bashes EU&UN terror blacklist
5,2008-01-25,Hay presto! Farmer unveils the illegal mock-...
6,2008-01-25,"Strikes, Protests and Gridlock at the Poland-U..."
...,...,...
19999,2008-11-26,Congestion Pricing
19978,2008-11-26,Protesters force Bangkok airport to close - an...
31722,2009-04-07,The dark side of Dubai
33119,2009-04-24,The dark side of Dubai


RECREATING VECTORS FOR THE TITLES PRESENT IN A CLUSTER

In [13]:
title_list_eve_df = {}
model_eve_df = {}
#docs_eve_df = {}
for l in unique_labels:
    title_list_eve_df[l] = list(eve_df[l]['title'])
    #print(eve_df[l])
    docs_eve_df = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for i, title in enumerate(title_list_eve_df[l]):
        words = title.lower().split()
        tags = [i]
        #print(words)
        docs_eve_df.append(analyzedDocument(words, tags))

    
    #print(docs_eve_df[l])
    model_eve_df[l] = doc2vec.Doc2Vec(docs_eve_df, size = 100, window = 300, min_count = 1, workers = 4)



In [15]:
v = {}
for l in unique_labels:
    v[l] = []
    for i in range(len(model_eve_df[l].docvecs)):
        v[l].append(model_eve_df[l].docvecs[i])
    

CREATING A REPRESENTATION FOR EACH CLUSTER

In [16]:
res_v = {}
for l in unique_labels:
    res_v[l] = [sum(x) for x in zip(*v[l])]

LOADING WORD EMBEDDINGS USING GLOVE WORD VECTORS

In [17]:
emb_ind = {}
f = open('glove.6B.100d.txt',encoding = 'utf-8')
for line in f:
    val = line.split()
    word = val[0]
    vec = np.asarray(val[1:],dtype = 'float32')
    emb_ind[word] = vec

In [18]:
input_text = "thai"
norm_input_text = input_text.lower()
print(emb_ind[norm_input_text])

[-0.46305  -0.24295   0.1877    0.41373  -0.084503  0.57225  -0.9377
 -0.30792   0.17755   0.073116 -0.13558   0.55189   0.41187   0.4645
 -0.42855  -0.1143   -0.011692 -1.4277    0.42467   0.57495   0.90547
 -0.36729   0.39568  -0.045399 -0.53152   0.12222   0.9691   -0.65784
 -0.20899  -0.17851  -0.84276   0.96525  -0.13295  -0.19877   0.17407
  0.2779    0.026158  0.28856  -0.11211  -0.76247  -0.75057  -0.33531
  0.80206  -0.42961   1.1994    0.57101  -0.8373    0.69643  -0.40706
 -0.91319  -0.70247  -0.39663   0.46995   0.48096  -0.12052  -0.35177
 -1.0832    0.67436   2.2427    0.65199  -0.63183   0.033356 -0.21366
  0.22413  -0.23586   0.98331  -0.60243  -0.066084  0.29879  -0.055371
  0.46147   0.21237  -0.26871   0.14616  -0.70399   1.0516   -0.5702
  1.0974   -0.57968   0.26906   0.85709  -0.86308   0.34488  -0.64482
 -0.23066  -0.33316  -0.47587   0.7897    0.23109  -0.061796  0.89546
 -0.16581  -0.43084  -0.11343  -0.41524   0.37104  -0.3025   -0.4017
  0.17938   0.04789 ]


COSINE SIMILARITY BETWEEN WORD AND CLUSTER REPRESENTATION

In [19]:
cos_sim = {}
for l in unique_labels:
    cos_sim[l] = 1-spatial.distance.cosine(emb_ind[norm_input_text],res_v[l])

In [20]:
#cos_sim[1]
cluster_ind = max(cos_sim.items(),key = operator.itemgetter(1))[0]
print(cluster_ind)

-1


COSINE SIMILARITY BETWEEN WORD AND TITLES IN THE CLUSTER CHOSEN IN THE PREVIOUS STEP

In [21]:
cos_sim_in_cluster = {}
for i in range(len(model_eve_df[cluster_ind].docvecs)):
    cos_sim_in_cluster[i] = 1-spatial.distance.cosine(model_eve_df[cluster_ind].docvecs[i],emb_ind[norm_input_text])


In [22]:
lt = list(cos_sim_in_cluster.values())
#sorted(lt,reverse = True)
#lt.sort(reverse = True)
#print(lt)
#print(lt[:15])
title_ind = sorted(range(len(lt)),key = lambda i:lt[i])[-15:]

In [23]:
title_ind

[10296,
 1735,
 5750,
 3463,
 155,
 9237,
 1960,
 1483,
 8951,
 9042,
 6048,
 2246,
 5963,
 2554,
 925]

IDENTIFYING TITLES

In [25]:
eve_df[cluster_ind].iloc[title_ind]

Unnamed: 0,date_created,title
19595,2008-11-21,You found the Puppy Tosser. Let s see if you c...
3014,2008-04-06,Two documentaries about the future of our food...
10620,2008-07-01,Polish president throws the EU into new crisis...
5998,2008-05-09,Is Global Warming to Blame for the Cyclone
287,2008-02-19,Myanmar says constitution draft is ready - New...
17768,2008-10-21,ID cards on UK driving licences - law can t s...
3398,2008-04-10,Police State Torch Run
2487,2008-03-31,An Iraqi s Story: My Brother Is Dead...and I ...
17205,2008-10-10,Two of this week s Nobel Prize winners talk ab...
17379,2008-10-14,My eight-year-old daughter - who has kidney pr...
