# Importing Libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from pymongo import MongoClient
from nltk.tokenize import word_tokenize
import pickle
import string
from string import punctuation

In [2]:
#Details for getting data from projectfinder
db_loc = {
    'ip' :'10.10.250.0',
    'port' : 27017,
    'database' : 'projectfinder',
    'collection' : 'itproject_clean'
}

In [3]:
#Details for storing data related to projectfinder
db_data = {
    'ip' :'10.10.250.0',
    'port' : 27017,
    'database' : 'projectfinder',
    'collection' : 'mldata1'
}

In [4]:
def load_dataset_from_momgodb(db_obj):
    
    """
    This method loads a dataset as a pandas dataframe from MongoDB 
    
    Parameters:
    @db_obj (dict): Storing the ip address, port number, database name and collection name for dataset to be loaded
    
    Returns:
    panadas dataframe: Containing the loaded dataset
    """
    
    #Extracting the items from the inputted dictionary
    dbname = db_obj['database']
    ip = db_obj['ip']
    port = db_obj['port']
    collection = db_obj['collection']
    
    #Creating a connection to the database using MongoClient
    connection = MongoClient(ip, port)
    db = connection[dbname]
    
    #Excluding the fileds which are not needed in the dataframe 
    #Currenlty excluding the id associated with each document of the collection
    
    exclude_field = {'_id': False}
    raw_dataset = list(db[collection].find({}, projection=exclude_field))
    
    dataset = pd.DataFrame(raw_dataset)
    print(f'Data loaded from mongodb {collection} collection succesfully')
    return dataset

In [5]:
def save_to_momgodb(df,db_):
    
    """
    This method saves a dataframe as a collection into a specified MongoDB database.
    
    Parameters:
    @df (pandas dataframe): Storing the dataset to be saved
    @db_ (dict): Details for the database where the given dataset is to be saved
    
    """
    
    #Convert data prsent in the dataframe to JSON format
    data = df.to_dict(orient='records')
    
     #Extracting the items from the inputted dictionary of database details
    dbname = db_['database']
    ip = db_['ip']
    port = db_['port']
    coll = db_['collection']
    
    #Creating a connection to the database using MongoClient
    connection = MongoClient(ip,port)
    db = connection[dbname]
    col = db[collection].insert_many(data)
    
    print(f'data saved as {coll}')

In [6]:
def load_dataset_from_json(data):
    with open(data) as f:
            d = json.load(f)
        #normalize json
    dataset= json_normalize(d)
    return dataset

In [7]:
df = load_dataset_from_momgodb(db_loc)
df.shape

Data loaded from mongodb itproject_clean collection succesfully


(14059, 25)

In [8]:
def get_required_dataset(original_dataset):
    
    #Select required colunms
    df = original_dataset[['description', 'bereich']]
    df = df[df['description'] != '']
    #df.rename(columns = {'description' : 'project', 'bereich' : 'class'})
    df['project'] = df['description']
    df['label'] = df['bereich']
    df.drop(['description', 'bereich'], axis=1, inplace=True)
    df = df[df['label'] != 'IT/Bauingenieur']
    df = df.drop_duplicates()
    return df

In [9]:
df = get_required_dataset(df)
df.shape
df.head()

Unnamed: 0,project,label
0,Für einen unserer Kunden aus dem Finanzdienstl...,Infr-Admin-Microsoft
1,Kann Profil leider nicht löschen.,IT/Consulting
2,Business Intelligence Analyst (m/w) - Tableau ...,Data-Sci-BI
3,"Konzeption, Customizing sowie Softwareanpassun...",Infr-Admin-Linux
4,Es sollen mehrere Automatisierungen mit ubot S...,IT/IT


In [10]:
# shuffle the data
df = df.sample(frac=1.0)
df.reset_index(drop=True,inplace=True)
df.head()

Unnamed: 0,project,label
0,Projektbeschreibung \n\n ...,IT-Mgmt-Projectleiter
1,Projektbeschreibung \n\n ...,Data-Sci-BI
2,"Freiberuflicher Fullstack Developer (Python, G...",Dev-Web-Fullstack
3,Projektbeschreibung \n\n ...,Infr-Admin-Microsoft
4,Aufgabe: \nEntwicklung und eigenständige konst...,IT-Technical-Dev


In [11]:
df.iloc[0,0]

'Projektbeschreibung \n\n                             \n              Melden Sie sich jetzt an und bewerben Sie sich direkt! \n              Jetzt bewerben \n         \n    \n\n                                * Sicherstellung einer termin-, kosten- und qualitätsgerechten Projektabwicklung, inkl. Änderungskostenmanagement * Koordination der Aufbau- und Inbetriebnahmephase von Karosserierohbauanlagen vor Ort bei unseren Endkunden * Beratung bei der Erstellung von Projekt- und Prozessdokumentationen * Überwachung des Fortschritts der Anlageninbetriebnahme und Planung der hierfür notwendigen personellen Ressourcen (Beratung bei der Erstellung von Kapazitäts- und Terminplänen) * Überwachung von Unterlieferanten auf der Kundenbaustelle * Einleitung von Korrekturmaßnahmen und Dokumentation der Änderungen * Zentraler Ansprechpartner des Kunden hinsichtlich Termin und Qualität * Abgeschlossene Ausbildung zum Techniker oder ein Hochschulstudium im Bereich Elektrotechnik/Mechatronik * Baustellen-

In [15]:
#load nltk's German and english stopwords'
import os
currDir = os.getcwd()
print(currDir)
if "USL" not in currDir:
    dataDir = os.path.join(currDir,  "ML", "USL", "data")
else: 
    dataDir = os.path.join(currDir,  "data")
with open(os.path.join(dataDir, 'german_stopwords_full.txt'), 'r') as f:
    stopwords_germ = f.read().splitlines()
stopwords_eng = nltk.corpus.stopwords.words('english')

#german cities
from bundeslander import Baden_Württemberg, Bayern, Berlin, Brandenburg, Bremen, Hamburg, Hessen, Mecklenburg_Vorpommern, Niedersachsen, Nordrhein_Westfalen, Rheinland_Pfalz, Saarland, Sachsen, Sachsen_Anhalt, Schleswig_Holstein, Thüringen, Ausland
All = Baden_Württemberg + Bayern + Berlin + Brandenburg + Bremen +Hamburg + Hessen + Mecklenburg_Vorpommern + Niedersachsen + Nordrhein_Westfalen + Rheinland_Pfalz + Saarland + Sachsen + Sachsen_Anhalt + Schleswig_Holstein + Thüringen + Ausland
cities = list(set([city.lower() for city in All]))

months = ['Januar', 'January','Februar', 'February', 'März', 'March', 'April', 'Mai', 'May', 'Juni', 'June', 'Juli', 
          'July', 'August', 'September', 'Oktober', 'October', 'November', 'Dezember', 'December']
months = [month.lower() for month in months]
print(months)

stopwords_manual = [line.rstrip('\n') for line in open('stopwords_manual.txt')]
print(len(stopwords_manual))

/opt/jupyter/Icxa/projectfinder_analytics/ML/USL
['januar', 'january', 'februar', 'february', 'märz', 'march', 'april', 'mai', 'may', 'juni', 'june', 'juli', 'july', 'august', 'september', 'oktober', 'october', 'november', 'dezember', 'december']
844


In [16]:
stopwords_all = list(set(stopwords_germ + stopwords_eng + stopwords_manual + cities + months))
len(stopwords_all)

13240

In [17]:
stopwords_add = []
stopwords_add = list(set(stopwords_add + stopwords_manual))
checker = list(set(stopwords_germ + stopwords_eng + cities + months))
stopwords_add.sort()
with open('stopwords_manual.txt', 'w') as f:
    for item in stopwords_add:
        if item not in checker:
            f.write("%s\n" % item)
stopwords_manual = [line.rstrip('\n') for line in open('stopwords_manual.txt')]
print(len(stopwords_manual))

844


In [18]:
stopwords_all = list(set(stopwords_germ + stopwords_eng + stopwords_manual + cities + months))
len(stopwords_all)

13240

In [19]:
stemmer_own = {
    
    'admin': 'administration',  
    'verwaltung': 'administration',
    'architektur' : 'architekture',
    'agil' : 'agile',
    'analys': 'analyst',
    'app': 'application',
    'anwend' : 'application',
    'automat': 'automate',
   
    
    'consultant' : 'berater',
    'berat': 'berater',
    'bereich' : 'bereich',
    'cisco': 'cisco',
    'konzept' : 'concept',
    'container': 'containerization',
    'zertifi' : 'certificate',
    'certifi' : 'certificate',
    'design' : 'design',
    'engineer' : 'engineer',
    'ingenieur'  : 'engineer',
    'entwick': 'entwicklung',
    'develop': 'entwicklung',
    'program': 'entwicklung',
    'entwickler' : 'entwicklung',
    
    'extern': 'external',
    'framework': 'framework',
    'globalen': 'global',
    'schnittstell': 'interface',
    'implement' : 'implementation', 
    'infrastr' : 'infrastructure',
    'informati' : 'informatik',
    'intern': 'internal',
    'manage' : 'management',
    'method' : 'method',
    'überwach' : 'monitoring',
    'mobil': 'mobil',
    'betrieb' : 'operation',
    
    'operat' : 'operation',
    'operie' : 'operation',
    'plattform' : 'platform',
    'projec' : 'project',
    'prozess' : 'process',
    'process' : 'process',
    'bearbeitung' : 'process',
    'scrum': 'scrum',
    'softwar': 'software',
    'spezifi' :'specification',
    'specifi' :'specification',
    'unterstützt' : 'support',
    'support' : 'support',
    'system': 'system',
    'anfoder': 'requirement',
    'tech' : 'tech',
    
}

In [20]:
def tokenization_data(text):
    """Normalize, tokenize and stem text string
    
    Args:
    text: string. String containing message for processing
       
    Returns:
    cleaned: list of strings. List containing normalized and stemmed word tokens
    """

    try:
        text = re.sub(r'(\d)',' ',text.lower())
        text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
        tokens = word_tokenize(text)
        tokens_cleaned = [word for word in tokens if word not in stopwords_all and len(word) > 1]
        cleaned = []
        stemmer_keys = list(stemmer_own.keys())
        for word in tokens_cleaned:
            for stemmer_key in stemmer_keys:
                if stemmer_key in word:
                    stemmed_word = stemmer_own[stemmer_key]
                    cleaned.append(stemmed_word)
                    break
            else:
                cleaned.append(word)
  
                

    except IndexError:
        pass

    return cleaned

In [21]:
# Clean text and title and create new column "tokenized"
t1 = time.time()
df['token_stem_spRm'] = df['project'].apply(tokenization_data)
t2 = time.time()
print("Time taken to prepare", len(df), "projects documents:", (t2-t1)/60, "min")

Time taken to prepare 12168 projects documents: 8.700641453266144 min


In [22]:
df.head()

Unnamed: 0,project,label,token_stem_spRm
0,Projektbeschreibung \n\n ...,IT-Mgmt-Projectleiter,"[sicherstellung, termin, qualitätsgerechten, p..."
1,Projektbeschreibung \n\n ...,Data-Sci-BI,"[verantwortung, tracking, analytics, web, mobi..."
2,"Freiberuflicher Fullstack Developer (Python, G...",Dev-Web-Fullstack,"[freiberuflicher, fullstack, entwicklung, pyth..."
3,Projektbeschreibung \n\n ...,Infr-Admin-Microsoft,"[application, management, installation, suppor..."
4,Aufgabe: \nEntwicklung und eigenständige konst...,IT-Technical-Dev,"[entwicklung, eigenständige, konstruktive, qua..."


In [46]:
bigram = gensim.models.Phrases(df['token_stem_spRm'].tolist(), min_count=2, threshold=2) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
def make_bigrams(text):
    return bigram_mod[text]

In [47]:
# Form Bigrams
df['token_stem_spRm_bigram'] = df['token_stem_spRm'].apply(make_bigrams)

In [56]:
df.tail(20)

Unnamed: 0,project,label,token_stem_spRm,token_stem_spRm_bigram
12148,Projektbeschreibung \n\n ...,ERP-SAP,"[spezialist, process, mm, sicherstellung, syst...","[spezialist, process, mm, sicherstellung, syst..."
12149,Projektbeschreibung \n\n ...,Data-Engr-Big Data,"[bi, berater, software, entwicklung, berater, ...","[bi, berater, software_entwicklung, berater, t..."
12150,Unser Kunde aus Berlin ist auf der Suche nach ...,ERP-SAP,"[sap, workflow, entwicklung, gesamtkontigent, ...","[sap_workflow, entwicklung, gesamtkontigent, b..."
12151,Für unseren Kunden sind wir aktuell auf der Su...,IT/Rechnungswes,"[controller, kundendetails, handelsunternehmen...","[controller, kundendetails, handelsunternehmen..."
12152,Projektbeschreibung \n\n ...,Dev-Web-Frontend,"[operation, entwicklung, dokumentation, versch...","[operation, entwicklung, dokumentation_versch,..."
12153,Projektbeschreibung \n\n ...,Dev-Web-Backend,"[business, innovativer, google, reseller, dien...","[business_innovativer, google_reseller, dienst..."
12154,Projektbeschreibung \n\n ...,Infr-Admin-Microsoft,"[entwicklung, ms, sharepoint, ms, sharepoint, ...","[entwicklung, ms_sharepoint, ms_sharepoint, ms..."
12155,Projektbeschreibung \n\n ...,Dev-Web-Backend,"[net, entwicklung, system, application, egover...","[net_entwicklung, system, application_egovernm..."
12156,Von mobilen Apps bis hin zur Robotersteuerung ...,Dev-Web-Fullstack,"[mobil, application, robotersteuerung, konzipi...","[mobil_application, robotersteuerung_konzipier..."
12157,Projektbeschreibung \n\n ...,SW-Dev-Others,"[komponentenverantwortlichen, verzeichnisdiens...","[komponentenverantwortlichen, verzeichnisdiens..."


In [55]:
df.iloc[9,3]

['tibco',
 'entwicklung',
 'integration',
 'integration_layer',
 'design',
 'entwicklung_tibcobasierten',
 'application_integrationslösung',
 'workflows_interface',
 'system',
 'mm',
 'tibco',
 'architekture',
 'enterprise_bus',
 'esb_enterprise',
 'application',
 'integration_eai',
 'architecture_soa',
 'microservices',
 'interface_wsdl',
 'xsd_wadl',
 'openapi_jdbc',
 'jms_mehrrjährige',
 'enterprise',
 'java_kommerzieller',
 'integrations',
 'software',
 'tibco_optimal',
 'tibco_activematrix',
 'businessworks',
 'nützlich',
 'alternativ',
 'eclipse_ide']

In [23]:
# Obtain all bigrams
all_lines = df['token_stem_spRm_bigram'].tolist()
all_bigrams = list(set([word for words in all_lines for word in words if '_' in word]))
all_bigrams.sort()
print(len(all_bigrams))

4431


In [25]:
with open('all_bigrams_Approach_4.txt', 'w') as f:
    for item in all_bigrams:
        f.write("%s\n" % item)

In [5]:
import os
import nltk
import pickle

#Using NLTK Downloader to obtain the resource stopwords, punkt
nltk.download('stopwords')
nltk.download('punkt')

currDir = os.getcwd()
if "USL" not in currDir:
    os.chdir(os.path.join(currDir,  "ML", "USL"))
else:
    os.chdir("../USL")

# load nltk's German and english stopwords'
dataDir = os.path.join(currDir,  "constants")
with open(os.path.join(dataDir, 'german_stopwords_full.txt'), 'r') as f:
    stopwords_germ = f.read().splitlines()
stopwords_eng = nltk.corpus.stopwords.words('english')
combined_stopWordsPath = os.path.join(dataDir, 'stopwords_manual.txt')

#german cities
from script_files.bundeslander import Baden_Württemberg, Bayern, Berlin, Brandenburg, Bremen, Hamburg, Hessen, Mecklenburg_Vorpommern, Niedersachsen, Nordrhein_Westfalen, Rheinland_Pfalz, Saarland, Sachsen, Sachsen_Anhalt, Schleswig_Holstein, Thüringen, Ausland

All = Baden_Württemberg + Bayern + Berlin + Brandenburg + Bremen +Hamburg + Hessen + Mecklenburg_Vorpommern + Niedersachsen + Nordrhein_Westfalen + Rheinland_Pfalz + Saarland + Sachsen + Sachsen_Anhalt + Schleswig_Holstein + Thüringen + Ausland
cities = list(set([city.lower() for city in All]))

months = ['Januar', 'January','Februar', 'February', 'März', 'March', 'April', 'Mai', 'May', 'Juni', 'June', 'Juli', 
          'July', 'August', 'September', 'Oktober', 'October', 'November', 'Dezember', 'December']
months = [month.lower() for month in months]
#print(months)

stopwords_manual = [line.rstrip('\n') for line in open(combined_stopWordsPath)]
#print(len(stopwords_manual))

stopwords_all = list(set(stopwords_germ + stopwords_eng + stopwords_manual + cities + months))
len(stopwords_all)

stopwordsLocation = os.path.join(dataDir,  "stopwords.pickle")
pickle_out = open(stopwordsLocation,"wb")
pickle.dump(stopwords_all, pickle_out)
pickle_out.close()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
