In [5]:
# Notebook setup: import every dependency used by the cells below and
# switch Bokeh to inline notebook rendering.
#%matplotlib inline
import pandas as pd
import numpy as np
import pkg_resources
import matplotlib as mpl
#import pylab as ply

# scikit-learn pieces: text vectorizers, dimensionality reduction, clustering.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
# NOTE(review): pandas is already imported above; this second import is redundant.
import pandas as pd

# Project-local helpers (stopword lists / lexical dictionary, JSON loading).
import examples.misc.lexical_analysis as lexan
from sfbistats import utils

# Bokeh is used for the interactive scatter plot in the last cell.
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh import models as bkm
# Route Bokeh output to the notebook instead of a standalone HTML file.
output_notebook()

In [6]:
# Start from the project's stopword lists, then flag a few extra tokens
# (URL fragments, punctuation, abbreviations) that those lists do not cover.
stopword_dict = lexan.get_stopwords()
for extra_stopword in ('-', 'dot', 'fr', 'www', 'http', 'al'):
    stopword_dict[extra_stopword] = True

In [7]:
# Location of the job-offer dump. Kept as a single named constant so it can be
# changed in one place.
# NOTE(review): this is an absolute, machine-specific path — consider making
# it relative to the repository or configurable.
JOBS_JSON_PATH = "/home/ludovic/miniconda2/envs/sfbistatsenv/SFBIStats/resources/jobs.json"

# Use a context manager so the file handle is closed once loading is done
# (the previous bare open() call leaked it).
# Assumes utils.load_from_json reads the whole file eagerly — TODO confirm.
with open(JOBS_JSON_PATH, "r") as jobs_file:
    job_list = utils.load_from_json(jobs_file)

# One row per job offer, restricted to the fields used in this analysis.
df_mail = pd.DataFrame(job_list,columns=['description', 'title', 'submission_date', 'contract_type', 'contract_subtype', 'city'])
# The free-text descriptions form the corpus fed to the text-mining steps.
corpus_content = df_mail['description']
df_mail.head()

Unnamed: 0,description,title,submission_date,contract_type,contract_subtype,city
0,Le défi majeur que doit relever aujourd'hui l'...,[Stage M1] Développement d'outils pour l'écoto...,2012-11-13,Stage,,Villeurbanne
1,Advances in synthetic biology promise to give ...,Stage M2: In silico analysis of toxicity for s...,2013-11-25,Stage,,Evry
2,Poste de biostatisticien à l'Institut Curie: a...,Poste de biostatisticien à l'Institut Curie: a...,2014-03-03,CDD,Post-doc / IR,Paris
3,Subject: thèse en métagénomique et assignation...,Thèse en métagénomique et assignation taxonomi...,2013-03-07,Thèse,,Lille
4,Les sarcomes indifférenciés des tissus mous so...,Détection de mutations somatiques dans les ré...,2015-10-12,Stage,,Bordeaux


In [8]:
print("Start text mining")
# Build the term -> count dictionary for the corpus, ignoring stopwords.
lex_dic = lexan.build_lex_dic(corpus_content, stopword_dict)
# Discard rare terms: only those seen at least 3 times are kept.
lex_dic = dict(entry for entry in lex_dic.items() if entry[1] >= 3)
# Report the size of the filtered vocabulary.
print(len(lex_dic))

Start text mining
12516


In [9]:
# Latent semantic analysis: TF-IDF weighting followed by a truncated SVD
# down to 50 components.
# stop_words must be a real list: a dict_keys view (what .keys() returns on
# Python 3) is rejected by scikit-learn's parameter validation.
tf = TfidfVectorizer(stop_words=list(stopword_dict))
# Fixed seeds make the embedding reproducible across kernel restarts.
svd = TruncatedSVD(n_components=50, random_state=0)
lsa = make_pipeline(tf, svd)

# X: one 50-dimensional LSA vector per job description.
X = lsa.fit_transform(corpus_content)
print("X")
print(X)
# Y: 2-D t-SNE projection of X, used only for plotting.
Y = TSNE(random_state=0).fit_transform(X)
print("Y")
print(Y)

X
[[  1.88304900e-01  -6.38469498e-02   1.06870141e-02 ...,   8.42894259e-03
    5.79758309e-03  -1.46494595e-02]
 [  4.65383320e-02   1.04637582e-01   8.43258079e-03 ...,  -8.78852815e-03
    2.85343623e-02  -2.96346365e-02]
 [  3.42087586e-01  -3.93916652e-02  -2.27869977e-01 ...,  -2.98474065e-02
   -1.23589961e-02  -6.06701667e-02]
 ..., 
 [  1.74977725e-01   6.84507797e-02  -1.49769386e-02 ...,  -3.30125313e-02
    1.24218157e-03   1.20090433e-02]
 [  2.53757066e-01  -8.97823386e-02  -4.34064247e-02 ...,  -5.25823665e-02
    6.90074036e-02   9.92018012e-03]
 [  2.05891982e-01  -6.84282901e-02   7.78269804e-02 ...,  -2.03196269e-04
   -1.52780311e-02  -2.27543918e-02]]
Y
[[-13.4581066    6.18622482]
 [  2.4028286    1.39685525]
 [  4.71272386  16.96647094]
 ..., 
 [ -2.82767805   7.83626061]
 [-15.75994942  13.37107663]
 [ -6.65913978   0.38781997]]


In [10]:
# Split the 2-D t-SNE embedding into its two coordinate columns.
Yx, Yy = Y[:, 0], Y[:, 1]
# Assign the arrays positionally (row i of Y belongs to row i of df_mail).
# Going through pd.Series would align on the index instead and silently
# produce NaN if df_mail ever had a non-default index.
df_mail['x'] = Yx
df_mail['y'] = Yy
df_mail.head()

Unnamed: 0,description,title,submission_date,contract_type,contract_subtype,city,x,y
0,Le défi majeur que doit relever aujourd'hui l'...,[Stage M1] Développement d'outils pour l'écoto...,2012-11-13,Stage,,Villeurbanne,-13.458107,6.186225
1,Advances in synthetic biology promise to give ...,Stage M2: In silico analysis of toxicity for s...,2013-11-25,Stage,,Evry,2.402829,1.396855
2,Poste de biostatisticien à l'Institut Curie: a...,Poste de biostatisticien à l'Institut Curie: a...,2014-03-03,CDD,Post-doc / IR,Paris,4.712724,16.966471
3,Subject: thèse en métagénomique et assignation...,Thèse en métagénomique et assignation taxonomi...,2013-03-07,Thèse,,Lille,-2.153798,1.67259
4,Les sarcomes indifférenciés des tissus mous so...,Détection de mutations somatiques dans les ré...,2015-10-12,Stage,,Bordeaux,-8.193922,12.078352


In [11]:
# Cluster the LSA vectors (not the 2-D projection) into 20 groups.
# random_state pins the initialisation so cluster labels — and therefore the
# colours in the plot — stay the same from one run to the next.
km = KMeans(n_clusters=20, init='k-means++', n_init=50, random_state=0)
km.fit(X)
# Positional assignment: label i belongs to row i of df_mail. This raises on a
# length mismatch instead of silently index-aligning to NaN.
df_mail['kmean_group'] = km.labels_
df_mail.head()

Unnamed: 0,description,title,submission_date,contract_type,contract_subtype,city,x,y,kmean_group
0,Le défi majeur que doit relever aujourd'hui l'...,[Stage M1] Développement d'outils pour l'écoto...,2012-11-13,Stage,,Villeurbanne,-13.458107,6.186225,2
1,Advances in synthetic biology promise to give ...,Stage M2: In silico analysis of toxicity for s...,2013-11-25,Stage,,Evry,2.402829,1.396855,0
2,Poste de biostatisticien à l'Institut Curie: a...,Poste de biostatisticien à l'Institut Curie: a...,2014-03-03,CDD,Post-doc / IR,Paris,4.712724,16.966471,10
3,Subject: thèse en métagénomique et assignation...,Thèse en métagénomique et assignation taxonomi...,2013-03-07,Thèse,,Lille,-2.153798,1.67259,2
4,Les sarcomes indifférenciés des tissus mous so...,Détection de mutations somatiques dans les ré...,2015-10-12,Stage,,Bordeaux,-8.193922,12.078352,2


In [12]:
# Give every k-means group its own hue from the HSV colormap.
# HSV is cyclic (0.0 and 1.0 are both red), so normalise over [0, n_groups]
# rather than [min_label, max_label]: the highest label then lands at
# (n_groups - 1) / n_groups and no longer collides with group 0.
n_groups = int(df_mail['kmean_group'].max()) + 1
group_norm = mpl.colors.Normalize(vmin=0, vmax=n_groups)
# One hex colour per group, computed once instead of once per row.
# Only the RGB part is kept; the alpha channel is dropped.
palette = [
    mpl.colors.rgb2hex(tuple(rgba[:3]))
    for rgba in mpl.cm.hsv(group_norm(np.arange(n_groups)))
]
# Per-row colour list, in the same order as df_mail.
colors = [palette[group] for group in df_mail['kmean_group']]
df_mail['color'] = colors
# Sanity check: number of job offers carrying a colour.
len(df_mail.index)

1509

In [13]:
# Interactive scatter of the t-SNE projection, one point per job offer,
# filled with its cluster colour.
mail_source = bkm.ColumnDataSource(df_mail)
# (label, template) pairs shown when hovering over a point; '@name' pulls the
# value of that column from the data source.
hover_fields = [
    ('Title', '<p>@title</p>'),
    ('City', '<p>@city</p>'),
    ('Type', '<p>@contract_type</p>'),
    ('Subtype', '<p>@contract_subtype</p>'),
    ('Date', '<p>@submission_date</p>'),
]

p = figure(plot_width=800, plot_height=800)
scatter_mail = p.scatter(
    x='x',
    y='y',
    source=mail_source,
    size=10,
    fill_color='color',
    alpha=0.7,
)
# Restrict the hover tool to the scatter glyphs.
hover_tool = bkm.HoverTool(tooltips=hover_fields, renderers=[scatter_mail])
p.add_tools(hover_tool)
show(p)