# Importing Libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from pymongo import MongoClient
from nltk.tokenize import word_tokenize
import pickle
import string
from string import punctuation

In [2]:
#Details for getting data from projectfinder
db_loc = {
    'ip' :'10.10.250.0',
    'port' : 27017,
    'database' : 'projectfinder',
    'collection' : 'itproject_clean'
}

In [3]:
#Details for storing data related to projectfinder
db_data = {
    'ip' :'10.10.250.0',
    'port' : 27017,
    'database' : 'projectfinder',
    'collection' : 'mldata1'
}

In [4]:
def load_dataset_from_momgodb(db_obj):
    
    """
    This method loads a dataset as a pandas dataframe from MongoDB 
    
    Parameters:
    @db_obj (dict): Storing the ip address, port number, database name and collection name for dataset to be loaded
    
    Returns:
    panadas dataframe: Containing the loaded dataset
    """
    
    #Extracting the items from the inputted dictionary
    dbname = db_obj['database']
    ip = db_obj['ip']
    port = db_obj['port']
    collection = db_obj['collection']
    
    #Creating a connection to the database using MongoClient
    connection = MongoClient(ip, port)
    db = connection[dbname]
    
    #Excluding the fileds which are not needed in the dataframe 
    #Currenlty excluding the id associated with each document of the collection
    
    exclude_field = {'_id': False}
    raw_dataset = list(db[collection].find({}, projection=exclude_field))
    
    dataset = pd.DataFrame(raw_dataset)
    print(f'Data loaded from mongodb {collection} collection succesfully')
    return dataset

In [5]:
def save_to_momgodb(df,db_):
    
    """
    This method saves a dataframe as a collection into a specified MongoDB database.
    
    Parameters:
    @df (pandas dataframe): Storing the dataset to be saved
    @db_ (dict): Details for the database where the given dataset is to be saved
    
    """
    
    #Convert data prsent in the dataframe to JSON format
    data = df.to_dict(orient='records')
    
     #Extracting the items from the inputted dictionary of database details
    dbname = db_['database']
    ip = db_['ip']
    port = db_['port']
    coll = db_['collection']
    
    #Creating a connection to the database using MongoClient
    connection = MongoClient(ip,port)
    db = connection[dbname]
    col = db[collection].insert_many(data)
    
    print(f'data saved as {coll}')

In [6]:
def load_dataset_from_json(data):
    with open(data) as f:
            d = json.load(f)
        #normalize json
    dataset= json_normalize(d)
    return dataset

In [7]:
df = load_dataset_from_momgodb(db_loc)
df.shape

Data loaded from mongodb itproject_clean collection succesfully


(14059, 25)

In [8]:
def get_required_dataset(original_dataset):
    
    #Select required colunms
    df = original_dataset[['description', 'bereich']]
    df = df[df['description'] != '']
    #df.rename(columns = {'description' : 'project', 'bereich' : 'class'})
    df['project'] = df['description']
    df['label'] = df['bereich']
    df.drop(['description', 'bereich'], axis=1, inplace=True)
    df = df[df['label'] != 'IT/Bauingenieur']
    df = df.drop_duplicates()
    return df

In [9]:
df = get_required_dataset(df)
df.shape
df.head()

Unnamed: 0,project,label
0,Für einen unserer Kunden aus dem Finanzdienstl...,Infr-Admin-Microsoft
1,Kann Profil leider nicht löschen.,IT/Consulting
2,Business Intelligence Analyst (m/w) - Tableau ...,Data-Sci-BI
3,"Konzeption, Customizing sowie Softwareanpassun...",Dev-Web-Backend
4,Es sollen mehrere Automatisierungen mit ubot S...,IT/IT


In [10]:
# shuffle the data
df = df.sample(frac=1.0)
df.reset_index(drop=True,inplace=True)
df.head()

Unnamed: 0,project,label
0,Wir suchen derzeit für ein Kundenprojekt eine/...,ERP-SAP
1,Für unseren Kunden in Düsseldorf suchen wir ab...,IT-Mgmt-Projectleiter
2,For our client we are lookging for a freelance...,Dev-Web-Fullstack
3,Projektbeschreibung \n\n ...,Dev-Web-Frontend
4,Projektbeschreibung \n\n ...,ERP-SAP


In [11]:
df.iloc[0,0]

'Wir suchen derzeit für ein Kundenprojekt eine/n  \n \nSenior SAP SD Berater (w/m) mit Erfahrung in S/4 HANA \n \nAufgaben: \n•\tBeratung im Bereich S/4 HANA und SAP SD \n \nSkills: \n•\tErfahrung im Bereich S/4 HANA  \n•\tErfahrung im Bereich SAP SD \n•\tFließende Englisch Kenntnisse, Kenntnisse in Französisch von Vorteil \n \nStart: ASAP \nDauer: 12 Monate \nAuslastung: 4-5 Tage die Woche  \nOrt: Frankfurt am Main \nKick-Off: 1 Woche in Frankreich \nRemote: teilweise möglich \n \nBei Interesse freuen wir uns auf Ihre aussagekräftige Bewerbung (Profil).'

In [12]:
# load nltk's German and english stopwords'
import nltk
with open('../german_stopwords_full.txt', 'r') as f:
    stopwords_germ = f.read().splitlines()
stopwords_eng = nltk.corpus.stopwords.words('english')

#german cities
from bundeslander import Baden_Württemberg, Bayern, Berlin, Brandenburg, Bremen, Hamburg, Hessen, Mecklenburg_Vorpommern, Niedersachsen, Nordrhein_Westfalen, Rheinland_Pfalz, Saarland, Sachsen, Sachsen_Anhalt, Schleswig_Holstein, Thüringen, Ausland
All = Baden_Württemberg + Bayern + Berlin + Brandenburg + Bremen +Hamburg + Hessen + Mecklenburg_Vorpommern + Niedersachsen + Nordrhein_Westfalen + Rheinland_Pfalz + Saarland + Sachsen + Sachsen_Anhalt + Schleswig_Holstein + Thüringen + Ausland
cities = list(set([city.lower() for city in All]))

months = ['Januar', 'January','Februar', 'February', 'März', 'March', 'April', 'Mai', 'May', 'Juni', 'June', 'Juli', 
          'July', 'August', 'September', 'Oktober', 'October', 'November', 'Dezember', 'December']
months = [month.lower() for month in months]
print(months)

stopwords_manual = [line.rstrip('\n') for line in open('stopwords_manual.txt')]
print(len(stopwords_manual))

['januar', 'january', 'februar', 'february', 'märz', 'march', 'april', 'mai', 'may', 'juni', 'june', 'juli', 'july', 'august', 'september', 'oktober', 'october', 'november', 'dezember', 'december']
844


In [13]:
stopwords_all = list(set(stopwords_germ + stopwords_eng + stopwords_manual + cities + months))
len(stopwords_all)

13240

In [14]:
stopwords_add = ['staatl']
stopwords_add = list(set(stopwords_add + stopwords_manual))
checker = list(set(stopwords_germ + stopwords_eng + cities + months))
stopwords_add.sort()
with open('stopwords_manual.txt', 'w') as f:
    for item in stopwords_add:
        if item not in checker:
            f.write("%s\n" % item)
stopwords_manual = [line.rstrip('\n') for line in open('stopwords_manual.txt')]
print(len(stopwords_manual))

844


In [15]:
stopwords_all = list(set(stopwords_germ + stopwords_eng + stopwords_manual + cities + months))
len(stopwords_all)

13240

In [16]:
stemmer_own = {
    
    'abgeschlossen': 'abgeschlossen',
    'admin': 'administration',  
    'verwaltung': 'administration',
    'architektur' : 'architekture',
    'agil' : 'agile',
    'analys': 'analyst',
    'app': 'application',
    'anwend' : 'application',
    'automat': 'automate',
   
    
    'consultant' : 'berater',
    'berat': 'berater',
    'bereich' : 'bereich',
    'cisco': 'cisco',
    'konzept' : 'concept',
    'container': 'containerization',
    'zertifi' : 'certificate',
    'certifi' : 'certificate',
    'design' : 'design',
    'engineer' : 'engineer',
    'ingenieur'  : 'engineer',
    'entwick': 'entwicklung',
    'develop': 'entwicklung',
    'program': 'entwicklung',
    'entwickler' : 'entwicklung',
    
    'extern': 'external',
    'framework': 'framework',
    'fix': 'fix',
    'globalen': 'global',
    'schnittstell': 'interface',
    'implement' : 'implementation', 
    'infrastr' : 'infrastructure',
    'informati' : 'informatik',
    'intern': 'internal',
    'manage' : 'management',
    'method' : 'method',
    'überwach' : 'monitoring',
    'mobil': 'mobil',
    'betrieb' : 'operation',
    
    'operat' : 'operation',
    'operie' : 'operation',
    'plattform' : 'platform',
    'projec' : 'project',
    'prozess' : 'process',
    'process' : 'process',
    'bearbeitung' : 'process',
    'scrum': 'scrum',
    'softwar': 'software',
    'spezifi' :'specification',
    'specifi' :'specification',
    'unterstützt' : 'support',
    'support' : 'support',
    'system': 'system',
    'anfoder': 'requirement',
    'tech' : 'tech',
    
}

In [17]:
def tokenization_data(text):
    """Normalize, tokenize and stem text string
    
    Args:
    text: string. String containing message for processing
       
    Returns:
    cleaned: list of strings. List containing normalized and stemmed word tokens
    """

    try:
        text = re.sub(r'(\d)',' ',text.lower())
        text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
        tokens = word_tokenize(text)
        tokens_cleaned = [word for word in tokens if word not in stopwords_all and len(word) > 1]
        cleaned = []
        stemmer_keys = list(stemmer_own.keys())
        for word in tokens_cleaned:
            for stemmer_key in stemmer_keys:
                if stemmer_key in word:
                    stemmed_word = stemmer_own[stemmer_key]
                    cleaned.append(stemmed_word)
                    break
            else:
                cleaned.append(word)
  
                

    except IndexError:
        pass

    return cleaned

In [18]:
# Clean text and title and create new column "tokenized"
t1 = time.time()
df['token_stem_spRm'] = df['project'].apply(tokenization_data)
t2 = time.time()
print("Time taken to prepare", len(df), "projects documents:", (t2-t1)/60, "min")

Time taken to prepare 12130 projects documents: 8.13471433321635 min


In [19]:
df.head()

Unnamed: 0,project,label,token_stem_spRm
0,Wir suchen derzeit für ein Kundenprojekt eine/...,ERP-SAP,"[kundenprojekt, senior, sap, sd, berater, hana..."
1,Für unseren Kunden in Düsseldorf suchen wir ab...,IT-Mgmt-Projectleiter,"[management, test, data, concepts, identifying..."
2,For our client we are lookging for a freelance...,Dev-Web-Fullstack,"[lookging, engineer, design, management, engin..."
3,Projektbeschreibung \n\n ...,Dev-Web-Frontend,"[agierendes, react, entwicklung, erwartet, agi..."
4,Projektbeschreibung \n\n ...,ERP-SAP,"[sap, management, upgrades, ehp, unterstützen,..."


In [20]:
bigram = gensim.models.Phrases(df['token_stem_spRm'].tolist(), min_count=5, threshold=50) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
def make_bigrams(text):
    return bigram_mod[text]

In [21]:
# Form Bigrams
df['token_stem_spRm_bigram'] = df['token_stem_spRm'].apply(make_bigrams)

In [22]:
df.head()

Unnamed: 0,project,label,token_stem_spRm,token_stem_spRm_bigram
0,Wir suchen derzeit für ein Kundenprojekt eine/...,ERP-SAP,"[kundenprojekt, senior, sap, sd, berater, hana...","[kundenprojekt, senior, sap, sd, berater, hana..."
1,Für unseren Kunden in Düsseldorf suchen wir ab...,IT-Mgmt-Projectleiter,"[management, test, data, concepts, identifying...","[management, test, data, concepts, identifying..."
2,For our client we are lookging for a freelance...,Dev-Web-Fullstack,"[lookging, engineer, design, management, engin...","[lookging, engineer, design, management, engin..."
3,Projektbeschreibung \n\n ...,Dev-Web-Frontend,"[agierendes, react, entwicklung, erwartet, agi...","[agierendes, react, entwicklung, erwartet, agi..."
4,Projektbeschreibung \n\n ...,ERP-SAP,"[sap, management, upgrades, ehp, unterstützen,...","[sap, management, upgrades, ehp, unterstützen,..."


In [23]:
# Obtain all bigrams
all_lines = df['token_stem_spRm_bigram'].tolist()
all_bigrams = list(set([word for words in all_lines for word in words if '_' in word]))
all_bigrams.sort()
print(len(all_bigrams))

4431


In [25]:
with open('all_bigrams_Approach_4.txt', 'w') as f:
    for item in all_bigrams:
        f.write("%s\n" % item)