In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

import nltk
import string

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')


# email module has some useful functions
import os, sys, email,re


In [43]:
# Load the project postings (columns shown by describe(): skill_summary, title, description).
df = pd.read_csv('projectfinder.csv')
# Only the last expression of a cell is rendered, so the former bare `df.head()`
# in the middle of this cell was computed and silently discarded; it is dropped.
df.describe()

Unnamed: 0,skill_summary,title,description
count,5886,5886,5360
unique,5597,5376,5282
top,"dokumentation, 2nd level support, windows, sup...",Technischer Support,Projektbeschreibung \n\n ...
freq,17,23,19


In [42]:
title = df.title
# describe() reports fewer non-null descriptions than rows, and a missing value
# is a float NaN that neither the nltk tokenizers nor TfidfVectorizer can process.
# Drop those rows here; the original index labels are kept, so `description`
# can still be matched back to `title` / `skill` by label.
description = df.description.dropna()
skill = df.skill_summary


In [104]:
# Load nltk's German and English stopwords into one combined list.
stopwords = nltk.corpus.stopwords.words('german')
stopwords_eng = nltk.corpus.stopwords.words('english')
stopwords.extend(stopwords_eng)
# Extra filler words for this corpus. The last entry used to be written as
# 'f\xc3\xbcr', i.e. the UTF-8 *bytes* of "für" pasted into a text string; in
# Python 3 that is the three characters "fÃ¼r" and never matches the real word.
aa = ['gute', 'kunde', 'm/w', 'für']
stopwords.extend(aa)

In [105]:
# German Snowball stemmer from nltk; tokenize_and_stem uses this instance.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language="german")

In [106]:
# tokenizer + stemmer: returns the list of stems found in the text it is passed

def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their stems as a list.

    ``text`` is a string or an iterable of string pieces; the pieces are
    concatenated without a separator before tokenizing. Tokens that contain
    no ASCII letter (numbers, bare punctuation) are discarded. Stemming uses
    the module-level ``stemmer``.
    """
    joined = ''.join(list(text))
    stems = []
    # split into sentences first, then words, so punctuation ends up as its own token
    for sentence in nltk.sent_tokenize(joined):
        for word in nltk.word_tokenize(sentence):
            # keep only tokens that contain at least one ASCII letter
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    return stems

# presentation-only counterpart of tokenize_and_stem: same filtering, no stemming
def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens without stemming them.

    ``text`` is a string or an iterable of string pieces; the pieces are
    concatenated without a separator before tokenizing. Tokens that contain
    no ASCII letter (numeric tokens, raw punctuation) are discarded.
    """
    joined = ''.join(list(text))
    lowered = []
    # split into sentences first, then words, so punctuation ends up as its own token
    for sent in nltk.sent_tokenize(joined):
        lowered.extend(word.lower() for word in nltk.word_tokenize(sent))
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

In [107]:
# Build two parallel token lists over all descriptions: the stems and the
# matching lower-cased surface forms (used later to turn stems back into words).
#
# NOTE: each element of `description` is a whole document, so the stopword
# comparison below only drops a description whose entire text equals a
# stopword. Stopwords *inside* the documents are not removed here.
# Non-string entries (missing descriptions are float NaN) are skipped because
# the tokenizers cannot process them.
stopword_set = set(stopwords)  # set membership instead of scanning the list per document
description_clean = [doc for doc in description
                     if isinstance(doc, str) and doc not in stopword_set]
stemmed_text = []
tokenized_text = []
for doc in description_clean:
    # both helpers apply the same token filter, so the two lists stay aligned
    stemmed_text.extend(tokenize_and_stem(doc))
    tokenized_text.extend(tokenize_only(doc))

In [108]:
# Lookup table: index = stem, column 'words' = the token that stem came from.
vocab_frame = pd.DataFrame(data={'words': tokenized_text}, index=stemmed_text)
print('there are {} items in vocab_frame'.format(len(vocab_frame)))

there are 989680 items in vocab_frame


In [109]:
# Peek at the first rows: the index holds the stem, 'words' the token it came from.
vocab_frame.head()

Unnamed: 0,words
business,business
intelligenc,intelligence
analyst,analyst
m/w,m/w
tableau,tableau


In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(description) #fit the vectorizer to description

print(tfidf_matrix.shape)

CPU times: user 50.5 s, sys: 210 ms, total: 50.7 s
Wall time: 50.9 s
(5360, 43)


In [92]:
# Feature names (stemmed uni-/bi-/tri-grams) in column order of tfidf_matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installations.
try:
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
except AttributeError:
    terms = tfidf_vectorizer.get_feature_names()
terms

['aktuell',
 'anforder',
 'angab',
 'asap',
 'aufgab',
 'auslast',
 'berat',
 'bereich',
 'bewerb fur',
 'bitt',
 'dau',
 'direkt bewerb fur',
 'einsatzort',
 'entwickl',
 'erfahr',
 'erstell',
 'freu',
 'fur',
 'fur kund',
 'gern',
 'gmbh',
 'gruss',
 'gut',
 'interess',
 'java',
 'kenntnis',
 'konn',
 'kund',
 'm/w/d',
 'monat',
 'ort',
 'profil',
 'projekt',
 'sap',
 'send',
 'skill',
 'sowi',
 'start',
 'team',
 'uber',
 'unt',
 'unterstutz',
 'verfugbar']

In [111]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between all documents, computed as 1 - cosine similarity.
# The result is a dense square array (one row and one column per document).
# Diagonal entries like -2.2e-16 in the output are floating-point round-off, i.e. zero.
dist = 1 - cosine_similarity(tfidf_matrix)
dist

array([[-2.22044605e-16,  1.00000000e+00,  8.64780483e-01, ...,
         7.62872459e-01,  5.47389137e-01,  5.81232584e-01],
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 8.64780483e-01,  1.00000000e+00,  0.00000000e+00, ...,
         9.67935728e-01,  8.54688701e-01,  8.19076957e-01],
       ...,
       [ 7.62872459e-01,  1.00000000e+00,  9.67935728e-01, ...,
         0.00000000e+00,  5.86348037e-01,  5.67144662e-01],
       [ 5.47389137e-01,  1.00000000e+00,  8.54688701e-01, ...,
         5.86348037e-01, -2.22044605e-16,  8.01994160e-01],
       [ 5.81232584e-01,  1.00000000e+00,  8.19076957e-01, ...,
         5.67144662e-01,  8.01994160e-01, -4.44089210e-16]])

In [112]:
from sklearn.cluster import KMeans

num_clusters = 3 

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 19.4 s, sys: 260 ms, total: 19.6 s
Wall time: 19.8 s


In [113]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone joblib package is the replacement. Keep the old path as
# a fallback for very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model so the clustering does not have to be recomputed.
joblib.dump(km,  'doc_cluster.pkl')

# NOTE: joblib.load unpickles the file and can execute arbitrary code;
# only load files you created yourself.
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [114]:
# `description` is a pandas Series. When a Series is passed to the DataFrame
# constructor together with an explicit index, pandas *re-aligns* it to that
# index by label instead of keeping row order, which scrambles the description
# column. Converting to a plain list keeps document i next to cluster label i.
projects = {'description': description.tolist(), 'cluster': clusters }

# Index the frame by cluster label (flat index) so that frame.loc[k] selects
# every document of cluster k.
frame = pd.DataFrame(projects, index = clusters , columns = ['cluster', 'description'])

In [115]:
# Number of descriptions assigned to each cluster.
frame['cluster'].value_counts()

0    3575
1    1100
2     685
Name: cluster, dtype: int64

In [116]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    #print("Cluster %d titles:" % i, end='')
    #for title in frame.ix[i]['title'].values.tolist():
     #   print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words:

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


 b'f\xc3\xbcr', b'erfahrung', b'kenntnisse', b'entwicklung', b'gute', b'kunde',



Cluster 1 words: b'sap', b'beratung', b'f\xc3\xbcr', b'erfahrung', b'kunde', b'projektes',



Cluster 2 words: b'skills', b'teams', b'java', b'start', b'asap', b'profil',





