In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')


# email module has some useful functions
import os, sys, email,re


[nltk_data] Downloading package stopwords to /home/ki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv('../projectfinder.csv')
df.dropna(axis=0, inplace=True)
df.head()
df.describe()

Unnamed: 0,skill_summary,title,description
count,5360,5360,5360
unique,5083,4979,5282
top,"dokumentation, 2nd level support, windows, sup...",Technischer Support,Projektbeschreibung \n\n ...
freq,17,23,19


In [6]:
col = [col for col in df.columns]
col

['skill_summary', 'title', 'description']

In [16]:
title = df['title']
description = df['description'].tolist()
skill = df['skill_summary']
print(type(description))

['Business Intelligence Analyst (m/w) - Tableau Desktop \r\n \r\n \r\nFür ein Kundenprojekt suchen wir Sie als \r\n \r\n Business Intelligence Analyst (m/w) - Tableau Desktop. \r\n \r\nDer Kunde hat ein sehr hohes, kontinui ...',
 'Konzeption, Customizing sowie Softwareanpassungen mit Talend ESB \r\n Implementierung von Softwaresystemen mit Java \r\n Analyse sowie Design von Softwarearchitekturen',
 'Als Mitglied eines kleinen, dynamischen Teams in München lösen Sie gemeinsam mit den Kollegen die technischen Probleme unseres Referenzkunden.\r\n Servicezeiten: Montag - Freitag, von 07:00- 18:00 (zwei Schichten)\r\n Entgegennahme, Analyse und Zuordnung ein\xadge\xadhen\xadder Stö\xadrungen\r\n Übernahme und Dokumentation ein\xadge\xadhen\xadder Anfra\xadgen und Pro\xadble\xadme\r\n Erste Lösungsansätze per Telefon bzw. Remote\r\n Bearbeitung der Anfrage - remote\r\n Problemlösung , ggf. Weiter\xadlei\xadtung der Anfra\xadgen an nach\xadge\xadla\xadgerte Fach\xadab\xadtei\xadlungen\r\n Pr

In [7]:
# load nltk's German stopwords'
with open('stopwords-de.txt', 'r') as f:
    stopwords = f.read().splitlines()
print(len(stopwords))
stopwords_eng = nltk.corpus.stopwords.words('english')
stopwords.extend(stopwords_eng)
print(len(stopwords))

622
801


In [8]:
# load nltk's SnowballStemmer
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("german")

In [9]:
#here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    #convert text list from list to a string
    text_to_string = ','.join(str(v) for v in text)
    
    # sentence tokenization and lowercasing first, then by word to ensure that punctuation is caught as it's own token
    list_of_tokens = [word.lower() for sentence in nltk.sent_tokenize(text_to_string) for word in nltk.word_tokenize(sentence)]
    
    #extract only alphanumic tokens and add to cleaned_tokens
    alphanumeric_tokens = [token for token in list_of_tokens if token.isalnum()]
    print(alphanumeric_tokens[:20])
    
    #stopwords removal
    cleaned = [word for word in alphanumeric_tokens if word not in stopwords]
    print(cleaned[:20])
    
    #stemming
    stemmed_tokens = [stemmer.stem(token) for token in cleaned]
    print(stemmed_tokens[:20])

    return stemmed_tokens

# this is used only for presentational purposes
def tokenize_only(text):
    #convert text list from list to a string
    text_to_string = ','.join(str(v) for v in text)
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    list_of_tokens = [word.lower() for sent in nltk.sent_tokenize(text_to_string) for word in nltk.word_tokenize(sent)]
    
    #stopwords removal
    cleaned = [word for word in list_of_tokens if word not in stopwords]
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [token for token in cleaned if token.isalnum()]
    

    return filtered_tokens

In [10]:
stemmed_text = tokenize_and_stem(description)
tokenized_text = tokenize_only(description)
#for word in description:
 #   stemmed_word = tokenize_and_stem(word) #for each item in 'synopses', tokenize/stem
  #  all_tokenized = tokenize_only(word)
   # stemmed_text.extend(stemmed_word) #extend the 'totalvocab_stemmed' list
    #tokenized_text.extend(all_tokenized)
    


['business', 'intelligence', 'analyst', 'tableau', 'desktop', 'für', 'ein', 'kundenprojekt', 'suchen', 'wir', 'sie', 'als', 'business', 'intelligence', 'analyst', 'tableau', 'desktop', 'der', 'kunde', 'hat']
['business', 'intelligence', 'analyst', 'tableau', 'desktop', 'kundenprojekt', 'suchen', 'business', 'intelligence', 'analyst', 'tableau', 'desktop', 'kunde', 'hohes', 'kontinui', 'konzeption', 'customizing', 'softwareanpassungen', 'talend', 'esb']
['business', 'intelligenc', 'analyst', 'tableau', 'desktop', 'kundenprojekt', 'such', 'business', 'intelligenc', 'analyst', 'tableau', 'desktop', 'kund', 'hoh', 'kontinui', 'konzeption', 'customizing', 'softwareanpass', 'talend', 'esb']


In [125]:
vocab_frame = pd.DataFrame({'words': tokenized_text}, index = stemmed_text)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 562931 items in vocab_frame


In [126]:
vocab_frame.head()

Unnamed: 0,words
business,business
intelligenc,intelligence
analyst,analyst
tableau,tableau
desktop,desktop


In [139]:

vectorizer = TfidfVectorizer(stop_words=stopwords)

X = vectorizer.fit_transform(description)
X

  'stop_words.' % sorted(inconsistent))


<5360x31978 sparse matrix of type '<class 'numpy.float64'>'
	with 490000 stored elements in Compressed Sparse Row format>

In [140]:
true_k = 5
model = KMeans(n_clusters=true_k)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [141]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [144]:
for i in range(true_k):
    print()
    print(f'Cluster {i}')
    
    for ind in order_centroids[i, :10]:
        print(terms[ind])


Cluster 0
sap
bewerben
abap
direkt
melden
berater
fi
erfahrung
kenntnisse
hana

Cluster 1
contractor
https
impressum
www
de
münchen
informationen
frankfurt
main
betroffenenrechte

Cluster 2
hays
kontakt
passende
bewerben
positionen
profitieren
kostenfrei
erfahrung
vorteile
aufgaben

Cluster 3
experience
bewerben
knowledge
project
development
data
skills
sap
looking
working

Cluster 4
bewerben
direkt
melden
kenntnisse
erfahrung
java
entwicklung
kunden
projekt
entwickler


In [145]:

print("Prediction")
X = vectorizer.transform(["Produktumfeld der Firma VMWARE: Airwatch (sehr gute Kenntnisse)Netzwerktechnik: LAN, DMZ, Rechenzentrum (Vertiefte Kenntnisse) MS-Office Tools: Word, Excel, Visio, Powerpoint (Vertiefte Kenntnisse) Bereitschaft zur Sicherheitsüberprüfung Level 2 (Ü2)"])
predicted = model.predict(X)
print(predicted)

Prediction
[4]
