In [1]:
 %matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from pymongo import MongoClient
from nltk.tokenize import word_tokenize
import pickle
import string
from string import punctuation

In [2]:
 #Details for getting data from projectfinder
# Connection details used to read the dataset from the projectfinder database.
db_loc = dict(
    ip='10.10.250.0',
    port=27017,
    database='projectfinder',
    collection='hdfs_2',
)

In [3]:
def load_dataset_from_momgodb(db_obj):
    """
    Load a whole MongoDB collection into a pandas DataFrame.

    Parameters:
    @db_obj (dict): Connection details with the keys 'ip', 'port', 'database'
        and 'collection' for the dataset to be loaded

    Returns:
    pandas DataFrame: One row per document of the collection, without the
        MongoDB '_id' field

    Raises:
    KeyError: If one of the required keys is missing from db_obj
    """

    # Extract the connection details from the input dictionary
    dbname = db_obj['database']
    ip = db_obj['ip']
    port = db_obj['port']
    collection = db_obj['collection']

    # Exclude the fields that are not needed in the DataFrame.
    # Currently only the id associated with each document is excluded.
    exclude_field = {'_id': False}

    # Create a connection to the database using MongoClient
    connection = MongoClient(ip, port)
    try:
        # list() drains the cursor, so every document is fetched before the
        # connection is closed below
        raw_dataset = list(connection[dbname][collection].find({}, projection=exclude_field))
    finally:
        # Always release the client's resources, even if the query fails
        connection.close()

    dataset = pd.DataFrame(raw_dataset)
    print(f'Data loaded from mongodb {collection} collection succesfully')
    return dataset

In [7]:
# Pull the complete collection described by db_loc into memory as a DataFrame
df = load_dataset_from_momgodb(db_loc)
df.shape

Data loaded from mongodb hdfs_2 collection succesfully


(253221, 1)

In [6]:
#df.drop('description', axis=1, inplace=True)

In [23]:
#df = df[df['ready'] != '']

In [24]:
#df = df.drop_duplicates()

In [25]:
#df.shape

(253221, 1)

In [4]:
# Connection details for writing a DataFrame back to MongoDB.
# Uses the short key names 'db' / 'coll' that save_to_momgodb reads.
db_data = dict(
    ip='10.10.250.0',
    port=27017,
    db='projectfinder',
    coll='hdfs_2',
)

In [5]:
def save_to_momgodb(df,db_):
    """
    Insert every row of a DataFrame as one document into a MongoDB collection.

    Parameters:
    @df (pandas DataFrame): Rows to store; each row becomes one document
    @db_ (dict): Connection details with the keys 'ip', 'port', 'db'
        (database name) and 'coll' (collection name)

    Returns:
    None

    Raises:
    KeyError: If one of the required keys is missing from db_

    Note:
    Documents are added with insert_many, i.e. appended to whatever the
    collection already contains.
    """
    data = df.to_dict(orient='records')
    dbname = db_['db']
    ip = db_['ip']
    port = db_['port']
    coll = db_['coll']

    if not data:
        # insert_many rejects an empty document list, so skip the round trip
        print(f'no rows to save to {coll}')
        return

    connection = MongoClient(ip,port)
    try:
        connection[dbname][coll].insert_many(data)
    finally:
        # Always release the client's resources, even if the insert fails
        connection.close()
    print(f'data saved as {coll}')

In [30]:
# NOTE(review): db_data names the same ip/database/collection that db_loc is read from,
# and save_to_momgodb uses insert_many, so this appears to append the loaded rows to
# that collection again — confirm this is intended before re-running.
save_to_momgodb(df, db_data)

data saved as hdfs_2


In [7]:
df.shape

(253221, 1)

In [9]:
df.head()

Unnamed: 0,ready
0,Projekt in Köln: Cloud Architekt (m/w) gesucht...
1,Für unseren Kunden suchen wir asap einen Senio...
2,Für ein Teilzeitprojekt in Hamburg sucht unser...
3,Projektbeschreibung \n\n ...
4,Future Consulting GmbH \r\nentscheiden Sie sic...


In [10]:
# Load the pre-built stopword collection; the context manager closes the file
# handle as soon as the pickle has been read.
# NOTE(review): pickle.load can execute code embedded in the file — only load trusted pickles.
with open('constants/stopwords.pickle',"rb") as pickle_in:
    stopwords_all = pickle.load(pickle_in)

In [11]:
len(stopwords_all)

13242

In [12]:
# Load the hand-made stem -> replacement mapping; the context manager closes
# the file handle as soon as the pickle has been read.
# NOTE(review): pickle.load can execute code embedded in the file — only load trusted pickles.
with open('constants/stemmer_own.pickle',"rb") as pickle_in:
    stemmer_own = pickle.load(pickle_in)

In [13]:
len(stemmer_own)

14

In [14]:
stemmer_own

{'admin': 'admin',
 'verwaltung': 'administration',
 'architektur': 'architektur',
 'agil': 'agile',
 'app': 'application',
 'anwend': 'application',
 'automat': 'automate',
 'consultant': 'berater',
 'berat': 'berater',
 'cisco': 'cisco',
 'contin': 'continuous',
 'schnittstell': 'interface',
 'überwach': 'monitoring',
 'mobil': 'mobil'}

In [15]:
#german cities
from scripts_mod.bundeslander import Baden_Württemberg, Bayern, Berlin, Brandenburg, Bremen, Hamburg, Hessen, Mecklenburg_Vorpommern, Niedersachsen, Nordrhein_Westfalen, Rheinland_Pfalz, Saarland, Sachsen, Sachsen_Anhalt, Schleswig_Holstein, Thüringen, Ausland

# Concatenate the per-region name lists imported from scripts_mod.bundeslander
# (presumably city names per federal state, plus foreign ones — verify against the module).
All = (
    Baden_Württemberg + Bayern + Berlin + Brandenburg + Bremen + Hamburg
    + Hessen + Mecklenburg_Vorpommern + Niedersachsen + Nordrhein_Westfalen
    + Rheinland_Pfalz + Saarland + Sachsen + Sachsen_Anhalt
    + Schleswig_Holstein + Thüringen + Ausland
)
# Lower-case every entry and drop duplicates
cities = list(set(name.lower() for name in All))

In [16]:
# German and English month names, stored lower-cased
months = [
    name.lower()
    for name in (
        'Januar', 'January', 'Februar', 'February', 'März', 'March',
        'April', 'Mai', 'May', 'Juni', 'June', 'Juli', 'July',
        'August', 'September', 'Oktober', 'October', 'November',
        'Dezember', 'December',
    )
]
print(months)

['januar', 'january', 'februar', 'february', 'märz', 'march', 'april', 'mai', 'may', 'juni', 'june', 'juli', 'july', 'august', 'september', 'oktober', 'october', 'november', 'dezember', 'december']


In [17]:
# Extend the stopword list with the lower-cased city and month names.
# The set() round-trip removes duplicates, so the order of the resulting list
# is arbitrary.
stopwords_all = list(set(stopwords_all + cities + months))
len(stopwords_all)

13242

In [15]:
a = df.values  # NumPy array of the frame's values
# NOTE(review): the recorded df.shape shows a single column, so the two-column
# experiment below (a[:,1]) would not apply to this frame — confirm or remove.
#df['C'] = np.where(a[:,1]>5,a[:,0],0.1*a[:,0]*a[:,1])

In [17]:
a[:,0]

array(['Projekt in Köln: Cloud Architekt (m/w) gesucht! \r\nMit meinem Kunden im öffentlichen Bereich suchen wir nach einem Cloud Architekten (m/w). \r\nIhre Kompetenzen: \r\n * ARIS und möglichst SPARX \r\n * Clou ...',
       'Für unseren Kunden suchen wir asap einen Senior-Berater Einführung EWM (Fokus: Retail) (w/m) in Bornheim in der Pfalz. \r\nUnsere Projektnummer: 1-4691 \r\nProjektbeschreibung: \r\nFür unser agiles Proje ...',
       'Für ein Teilzeitprojekt in Hamburg sucht unser Kunde Unterstützung von einem IT Manager (m/w) für die Handhabung von IDV gleichwertig zu IT-Applikationen, IT-Governance nach COBIT5, IT-Betrieb (Anpassungen an IT-Betriebsprozessen aufgrund BAIT, z.B. in Informationssicherheitsmanagement, Testmanagement, Benutzerberechtigungsmanagement), IT-Risikomanagement und Informationssicherheitsmanagement als auch Compliance. \nRahmendaten \n * Start: September 2017 \n * Laufzeit: 31.05.2018 \n * Einsatzort: Hamburg \n * Auslastung: Teilzeit vor Ort \n * Proje

In [18]:
def text_processing(text):
    """Normalize, tokenize and stem the original text string.

    The text is lower-cased, digits and ASCII punctuation are replaced by
    spaces, and the result is tokenized. Stopwords and single-character
    tokens are dropped. Every remaining token that contains one of the keys
    of ``stemmer_own`` as a substring is replaced by the mapped value; all
    other tokens are kept unchanged.

    Args:
    text: string. String containing message for processing

    Returns:
    cleaned: list of strings. List containing the normalized and stemmed word
    tokens. If an IndexError is raised during processing, the tokens collected
    up to that point are returned (possibly an empty list).
    """
    # Bound before the try block so the final return always has a list, even
    # when the IndexError handler fires before any token was produced.
    stemmed_tokens = []

    # Set membership is O(1) per token, unlike scanning the stopword list.
    if isinstance(stopwords_all, (set, frozenset)):
        stopword_set = stopwords_all
    else:
        stopword_set = set(stopwords_all)

    try:
        text = re.sub(r'(\d)',' ',text.lower())
        text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
        tokens = word_tokenize(text)

        for word in tokens:
            if len(word) <= 1 or word in stopword_set:
                continue
            # Substring match; the first matching key (in dict order) wins.
            for stemmer_key, stemmed_word in stemmer_own.items():
                if stemmer_key in word:
                    stemmed_tokens.append(stemmed_word)
                    break
            else:
                # No key matched: keep the token as it is
                stemmed_tokens.append(word)

    except IndexError:
        # Best effort: keep whatever was collected so far
        pass

    return stemmed_tokens

In [14]:
print(pd.__version__)

0.25.0


In [15]:
df.shape

(253221, 1)

In [48]:
df.shape

(253221, 1)

In [54]:
df.ready[63301]

'P,r,o,j,e,k,t,b,e,s,c,h,r,e,i,b,u,n,g, ,\n,\n, , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,\n, , , , , , , , , , , , , , ,M,e,l,d,e,n, ,S,i,e, ,s,i,c,h, ,j,e,t,z,t, ,a,n, ,u,n,d, ,b,e,w,e,r,b,e,n, ,S,i,e, ,s,i,c,h, ,d,i,r,e,k,t,!, ,\n, , , , , , , , , , , , , , ,J,e,t,z,t, ,b,e,w,e,r,b,e,n, ,\n, , , , , , , , , ,\n, , , , ,\n,\n, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,F,ü,r, ,e,i,n,e,n, ,u,n,s,e,r,e,r, ,G,r,o,ß,k,u,n,d,e,n, ,s,u,c,h,e,n, ,w,i,r, ,d,e,r,z,e,i,t, ,e,i,n,e,n, ,f,r,e,i,b,e,r,u,f,l,i,c,h,e,n, ,M,i,t,a,r,b,e,i,t,e,r, ,o,d,e,r, ,e,i,n,e,n, ,M,i,t,a,r,b,e,i,t,e,r, ,z,u,r, ,F,e,s,t,a,n,s,t,e,l,l,u,n,g,,, ,d,e,r, ,d,i,e, ,n,a,c,h,f,o,l,g,e,n,d,e,n, ,A,n,f,o,r,d,e,r,u,n,g,e,n, ,e,r,f,ü,l,l,t,:, ,I,h,r,e, ,A,u,f,g,a,b,e,n,:, , ,S,t,ö,r,u,n,g,s,a,n,a,l,y,s,e, ,u,n,d, ,L,ö,s,u,n,g,s,e,n,t,w,i,c,k,l,u,n,g, ,i,n, ,e,i,n,e,m, ,k,o,m,p,l,e,x,e,n, ,I,T,-,U,m,f,e,l,d, ,V,e,r,g,a,b,e, ,v,o,n, ,B,e,r,e,c,h,t,i,g,u,n,g,e,n, ,i,n, ,A,c,t,i,v,e, ,D,i,r,e,c

In [55]:
np.array_split(df, 10)

[                                                   ready
 0      Projekt in Köln: Cloud Architekt (m/w) gesucht...
 1      Für unseren Kunden suchen wir asap einen Senio...
 2      Für ein Teilzeitprojekt in Hamburg sucht unser...
 3      Projektbeschreibung \n\n                      ...
 4      Future Consulting GmbH \r\nentscheiden Sie sic...
 ...                                                  ...
 25318  Projektbeschreibung \n\n                      ...
 25319  Für unseren Kunden aus dem Öffentlich-Rechtlic...
 25320  Projektbeschreibung \n\n                      ...
 25321  Eine der führenden deutschen Banken sucht für ...
 25322  Projektbeschreibung \n\n                      ...
 
 [25323 rows x 1 columns],
                                                    ready
 25323  Für unseren Kunden, ein Unternehmen der Mobili...
 25324  Projektbeschreibung \n\n                      ...
 25325  Projektbeschreibung \n\n                      ...
 25326  Für folgende Aufgabe suchen wir ein

In [56]:

# Slice of the 'ready' column between the hard-coded bounds 50640 and 177260.
# NOTE(review): the previews of this range show texts with every character
# separated by commas — presumably the corrupted part of the data; confirm the bounds.
df2 = df.ready[50640 : 177260]
df2.shape

(126620,)

In [57]:
df2.head()

50640    A,u,f,g,a,b,e,:, ,\n,*, ,A,n,a,l,y,s,e, ,d,e,s...
50641    P,r,o,j,e,k,t,b,e,s,c,h,r,e,i,b,u,n,g, ,\n,\n,...
50642    A,u,f,g,a,b,e,:, ,\n,*, ,E,n,t,w,i,c,k,e,l,n, ...
50643    A,u,f,g,a,b,e,:, ,\n,*, ,F,E,M,-,B,e,r,e,c,h,n...
50644    P,r,o,j,e,k,t,b,e,s,c,h,r,e,i,b,u,n,g, ,\n,\n,...
Name: ready, dtype: object

In [19]:
# Exclude the row positions [50640, 177260) from the frame.
# NOTE(review): presumably these are the rows whose text is comma-separated
# character by character — confirm the hard-coded bounds against the data.
indexes_to_drop = range(50640,177260)
indexes_to_keep = set(range(df.shape[0])) - set(indexes_to_drop)
# sorted() pins the original row order; iterating a set gives no ordering guarantee
df_sliced = df.take(sorted(indexes_to_keep))
df_sliced.shape

(126601, 1)

In [20]:
df_sliced.head()

Unnamed: 0,ready
0,Projekt in Köln: Cloud Architekt (m/w) gesucht...
1,Für unseren Kunden suchen wir asap einen Senio...
2,Für ein Teilzeitprojekt in Hamburg sucht unser...
3,Projektbeschreibung \n\n ...
4,Future Consulting GmbH \r\nentscheiden Sie sic...


10

In [21]:
def split_data(df, n):
    """Split ``df`` into ``n`` consecutive chunks.

    Thin wrapper around ``np.array_split``; the list it produces is returned
    unchanged.
    """
    return np.array_split(df, n)
        

In [22]:
# Cut the filtered frame into five chunks so they can be processed one at a time
data = split_data(df_sliced, 5)

In [23]:
len(data)

5

In [24]:
# Only the last two of the five chunks are bound in this cell; the first three
# assignments are commented out.
#df1 = data[0]
#df2 = data[1]
#df3 = data[2] bad data
df4 = data[3]
df5 = data[4]

In [25]:
#all_df = df1 + df2 + df3 + df4 + df5
# NOTE(review): df3 is only assigned in a commented-out line in the visible
# cells, so this would fail on a fresh kernel unless it is defined elsewhere — confirm.
df3.shape

(25320, 1)

In [37]:
df5.head(100)

Unnamed: 0,ready
227901,Projektbeschreibung \n\n \n...
227902,Projektbeschreibung \n\n \n...
227903,Projektbeschreibung \n\n \n...
227904,Projektbeschreibung \n\n \n...
227905,Projektbeschreibung \n\n \n...
227906,Projektbeschreibung \n\n \n...
227907,Projektbeschreibung \n\n \n...
227908,Projektbeschreibung \n\n \n...
227909,Projektbeschreibung \n\n \n...
227910,Projektbeschreibung \n\n \n...


In [38]:
# Run text_processing on the 'ready' text of every row and store the resulting
# token lists in the new column 'token_stem_spRm'.
# t1/t2 bracket the run so the elapsed time can be reported afterwards.
t1 = time.time()
df5['token_stem_spRm'] = df5['ready'].apply(text_processing)
t2 = time.time()

In [39]:
# Report the elapsed wall-clock time of the tokenization run in minutes
print("Time taken to prepare", len(df5), "projects documents:", (t2-t1)/60, "min")

Time taken to prepare 25320 projects documents: 16.882160449028014 min


In [35]:
df4.head()

Unnamed: 0,ready,token_stem_spRm
202581,Projektbeschreibung \n\n ...,"[base, sas, advanced, sas, sas, macros, autoca..."
202582,"Dear Network, \n \nI'm currently looking for a...","[dear, network, test, engineer, project, semic..."
202583,Projektbeschreibung \n\n ...,"[business, innovativer, google, reseller, dien..."
202584,"Für unseren Kunden, ein führendes Unternehmen ...","[führendes, finanzdienstleistungssektor, softw..."
202585,Projektbeschreibung \n\n \n...,"[projekt, offsite, sprachanforderung, developm..."


In [41]:
# Persist two chunks as pickle files.
# NOTE(review): in the visible cells df1 is never assigned and df2 is bound to
# a slice of the 'ready' column (positions 50640 to 177260), not to a processed
# chunk — confirm what is actually written here.
with open('df/df1.pickle', 'wb') as output:
    pickle.dump(df1, output)
with open('df/df2.pickle', 'wb') as output:
    pickle.dump(df2, output)

In [36]:
# Persist the fourth chunk (data[3]) as a pickle file
with open('df/df4.pickle', 'wb') as output:
    pickle.dump(df4, output)

In [15]:
# Load two stopword lists from text files in the constants folder (one entry
# per line) and the English stopwords from nltk's corpus.
# NOTE(review): open() uses the platform default encoding here — confirm the
# files decode correctly (e.g. umlauts) on every machine this runs on.
with open('constants/german_stopwords_full.txt', 'r') as f:
    stopwords_germ = f.read().splitlines()
with open('constants/stopwords_manual.txt', 'r') as f:
    stopwords_manual = f.read().splitlines()
stopwords_eng = nltk.corpus.stopwords.words('english')

In [None]:
# Load the stopword collection from the df folder; the context manager closes
# the file handle as soon as the pickle has been read.
# NOTE(review): pickle.load can execute code embedded in the file — only load trusted pickles.
with open('df/stopwords.pickle',"rb") as pickle_in:
    stopwords_all = pickle.load(pickle_in)