# DOCUMENTATION

1. Import relevant libraries and download relevant resources
2. Obtain the details for retrieving and storing the data
3. Get the dataset from mongoDB database and store it as a pandas dataframe. <br/>
4. Reduce the original dataframe by removing the columns which are not needed for Topic Modelling. Currently we are considering only the area and the description of the project as the columns in our dataframe. 
5. Perform tokenization by removing spaces and punctuations
6. Identifying stopwords:
    1. Load NLTK's English and German stopwords
    2. Add cities and months to it 
    3. Manually added stopwords (irrelevant words for our analysis)
7. Create a new column having tokens without stopwords
8. Generate bigrams from tokens containing stopwords and apply the bigrams to the tokens without stopwords.
9. Store the new tokens with bigrams in a separate column.
10. Store all bigrams into a file.

# Importing all the relevant libraries and downloading all relevant resources

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from nltk.tokenize import word_tokenize
import pickle
import string
from string import punctuation
import os
%matplotlib inline
sns.set_style("darkgrid")

In [2]:
#Using the NLTK downloader to obtain the resources 'stopwords' and 'punkt'.
#'stopwords' backs the English stopword list loaded further down; 'punkt' is
#presumably the model word_tokenize relies on (confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Database details for retrieving dataset and storing the dataset

In [3]:
#Details for retrieving data from projectfinder: the MongoDB host, port,
#database and collection the raw dataset is read from.
#NOTE(review): host and port are hard-coded; consider moving them to configuration.
db_loc = {
    'ip' :'10.10.250.0',
    'port' : 27017,
    'database' : 'projectfinder',
    'collection' : 'itproject_clean'
}

In [4]:
#Details for storing data related to projectfinder: the MongoDB host, port,
#database and target collection for saved results.
#NOTE(review): host and port are hard-coded; consider moving them to configuration.
db_data = {
    'ip' :'10.10.250.0',
    'port' : 27017,
    'database' : 'projectfinder',
    'collection' : 'mldata1'
}

#%%[markdown]
#Methods for loading the dataset

In [5]:
def load_dataset_from_mongodb(db_obj):
    
    """
    This method loads a whole MongoDB collection as a pandas dataframe.
    
    Parameters:
    @db_obj (dict): Storing the ip address ('ip'), port number ('port'), database name
                    ('database') and collection name ('collection') of the dataset to be loaded
    
    Returns:
    pandas dataframe: Containing one row per document of the collection, without the '_id' field
    
    Raises:
    KeyError: If one of the four required keys is missing from db_obj
    """
    
    #Extracting the items from the inputted dictionary
    dbname = db_obj['database']
    ip = db_obj['ip']
    port = db_obj['port']
    collection = db_obj['collection']
    
    #Creating a connection to the database using MongoClient
    connection = MongoClient(ip, port)
    try:
        #Excluding the fields which are not needed in the dataframe 
        #Currently excluding the id associated with each document of the collection
        exclude_field = {'_id': False}
        raw_dataset = list(connection[dbname][collection].find({}, projection=exclude_field))
    finally:
        #Close the client even when the query fails, so no connection is left open
        connection.close()
    
    dataset = pd.DataFrame(raw_dataset)
    print(f'Data loaded from mongodb {collection} collection succesfully')
    return dataset

In [6]:
def save_to_momgodb(df,db_):
    
    """
    This method saves a dataframe as a collection into a specified MongoDB database.
    
    Parameters:
    @df (pandas dataframe): Storing the dataset to be saved
    @db_ (dict): Details for the database where the given dataset is to be saved
                 (keys 'ip', 'port', 'database' and 'collection')
    
    Raises:
    KeyError: If one of the four required keys is missing from db_
    """
    
    #Convert the rows of the dataframe to a list of dictionaries (one per document)
    data = df.to_dict(orient='records')
    
    #Extracting the items from the inputted dictionary of database details
    dbname = db_['database']
    ip = db_['ip']
    port = db_['port']
    coll = db_['collection']
    
    #Creating a connection to the database using MongoClient
    connection = MongoClient(ip,port)
    try:
        #The target collection is 'coll'; the previous code referenced an
        #undefined name 'collection' here and failed with a NameError
        connection[dbname][coll].insert_many(data)
    finally:
        #Close the client even when the insert fails
        connection.close()
    
    print(f'data saved as {coll}')

In [7]:
def load_dataset_from_json(data):
    
    """
    This method loads a JSON file and flattens it into a pandas dataframe.
    
    Parameters:
    @data (str): Path of the JSON file to be read
    
    Returns:
    pandas dataframe: The parsed JSON content, flattened with pandas.json_normalize
    """
    
    #json is not imported in the import cell of the notebook, so import it locally
    import json
    
    with open(data) as f:
        d = json.load(f)
    
    #normalize json (nested keys become dotted column names)
    dataset = pd.json_normalize(d)
    return dataset

In [8]:
# Load the raw collection described by db_loc and display its shape
df_rawData = load_dataset_from_mongodb(db_loc)
df_rawData.shape

Data loaded from mongodb itproject_clean collection succesfully


(14059, 25)

In [9]:
def get_required_dataset(original_dataset):
    
    """
    This method reduces the raw dataset to the two columns needed for topic modelling.
    
    The columns 'description' and 'bereich' are kept and renamed to 'project' and
    'label'. Rows with an empty description, rows labelled 'IT/Bauingenieur' and
    duplicate rows are removed. The original row index is kept.
    
    Parameters:
    @original_dataset (pandas dataframe): Raw dataset containing at least the
                                          columns 'description' and 'bereich'
    
    Returns:
    pandas dataframe: New dataframe with the columns 'project' and 'label';
                      original_dataset itself is not modified
    """
    
    #Select and rename the required columns in one step; this creates a new frame,
    #so no column is assigned on (or dropped in place from) a filtered slice
    df = original_dataset[['description', 'bereich']].rename(
        columns={'description': 'project', 'bereich': 'label'}
    )
    df = df[df['project'] != '']
    df = df[df['label'] != 'IT/Bauingenieur']
    return df.drop_duplicates()

In [10]:
# Reduce the raw data to the project text and its label
df_preprocessedDataset = get_required_dataset(df_rawData)
# NOTE: only the last expression of a cell is displayed, so this shape is not shown
df_preprocessedDataset.shape
df_preprocessedDataset.head()

Unnamed: 0,project,label
0,Für einen unserer Kunden aus dem Finanzdienstl...,Infr-Admin-Microsoft
1,Kann Profil leider nicht löschen.,IT/Consulting
2,Business Intelligence Analyst (m/w) - Tableau ...,Data-Sci-BI
3,"Konzeption, Customizing sowie Softwareanpassun...",Infr-Admin-Linux
4,Es sollen mehrere Automatisierungen mit ubot S...,IT/IT


In [11]:
# Shuffle the data. A fixed random_state makes the row order (and everything that
# depends on it further down) reproducible after a kernel restart; the result is
# re-bound instead of being modified in place.
df_preprocessedDataset = (
    df_preprocessedDataset
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
df_preprocessedDataset.head()

Unnamed: 0,project,label
0,"Java Java EE/J2EE Apache Tomcat HTML5 JSP, J...",Dev-Web-Fullstack
1,Projektbeschreibung \n\n ...,SW-Dev-Others
2,We are recruiting for a \n \nSenior SAP PP-DS...,ERP-SAP
3,Beginn: asap / Dauer: 6MM + / Ort: Frankfurt /...,SW-Dev-Others
4,Projektbeschreibung \n\n ...,IT-Mgmt-Consulting


In [12]:
# Show the value in the first row and first column (the 'project' text)
df_preprocessedDataset.iloc[0,0]

'Java Java EE/J2EE  Apache Tomcat HTML5  JSP, JSF, Open Source MySQL, Docker, CSS optional Automotive Know How Sichere Deutsch- sowie Englischkenntnisse Ihre Aufgaben Design, Entwicklung und Test bis hin zum Deployment maßgeschneiderter IT-Lösungen in der Automotive Branche Neubau einer Microservice-Applikation "Technische Daten" für eine Bereitstellungsplattform Kontakt Cegeka Deutschland GmbH Senta Ehrlich Martin-Behaim-Straße 22 63263 Neu-Isenburg Tel. +49 6102 8235 835 Fax +49 6102 8235 789 senta.ehrlich@cegeka.de'

In [13]:
def tokenization_data(text):
    """
    This method turns a raw text into a list of lower-cased tokens.
    
    Digits and the characters of string.punctuation are replaced by spaces
    before the text is passed to word_tokenize.
    
    Parameters:
    @text (str): Text to be tokenized
    
    Returns:
    list: Tokens produced by word_tokenize
    """
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    without_digits = re.sub(r'(\d)', ' ', text.lower())
    without_punctuation = re.sub(punctuation_pattern, ' ', without_digits)
    return word_tokenize(without_punctuation)

In [14]:
# Tokenize every project text into a new column "tokenized" and measure how long it takes
t1 = time.time()
df_preprocessedDataset['tokenized'] = df_preprocessedDataset['project'].apply(tokenization_data)
t2 = time.time()
# Elapsed time is reported in minutes
print("Time taken to prepare", len(df_preprocessedDataset), "projects documents:", (t2-t1)/60, "min")

Time taken to prepare 12168 projects documents: 0.37454417943954466 min


In [15]:
df_preprocessedDataset.head()

Unnamed: 0,project,label,tokenized
0,"Java Java EE/J2EE Apache Tomcat HTML5 JSP, J...",Dev-Web-Fullstack,"[java, java, ee, j, ee, apache, tomcat, html, ..."
1,Projektbeschreibung \n\n ...,SW-Dev-Others,"[projektbeschreibung, melden, sie, sich, jetzt..."
2,We are recruiting for a \n \nSenior SAP PP-DS...,ERP-SAP,"[we, are, recruiting, for, a, senior, sap, pp,..."
3,Beginn: asap / Dauer: 6MM + / Ort: Frankfurt /...,SW-Dev-Others,"[beginn, asap, dauer, mm, ort, frankfurt, id, ..."
4,Projektbeschreibung \n\n ...,IT-Mgmt-Consulting,"[projektbeschreibung, melden, sie, sich, jetzt..."


In [16]:
# Load the German stopwords from a text file in the data directory and the
# English stopwords from NLTK.
currDir = os.getcwd()
print(currDir)
# The data directory is resolved relative to the working directory: if the path
# does not contain "USL", the ML/USL/data sub-folder is used, otherwise ./data.
if "USL" not in currDir:
    dataDir = os.path.join(currDir,  "ML", "USL", "data")
else: 
    dataDir = os.path.join(currDir,  "data")
# One stopword per line
with open(os.path.join(dataDir, 'german_stopwords_full.txt'), 'r') as f:
    stopwords_germ = f.read().splitlines()
stopwords_eng = nltk.corpus.stopwords.words('english')

/opt/jupyter/Icxa/projectfinder_analytics/ML/USL


In [17]:
# German cities: the bundeslander module provides one collection of names per
# federal state (plus 'Ausland'). The import path depends on the working
# directory, mirroring the dataDir logic.
if "USL" not in currDir:
    from ML.USL.bundeslander import Baden_Württemberg, Bayern, Berlin, Brandenburg, Bremen, Hamburg, Hessen, Mecklenburg_Vorpommern, Niedersachsen, Nordrhein_Westfalen, Rheinland_Pfalz, Saarland, Sachsen, Sachsen_Anhalt, Schleswig_Holstein, Thüringen, Ausland
else:
    from bundeslander import Baden_Württemberg, Bayern, Berlin, Brandenburg, Bremen, Hamburg, Hessen, Mecklenburg_Vorpommern, Niedersachsen, Nordrhein_Westfalen, Rheinland_Pfalz, Saarland, Sachsen, Sachsen_Anhalt, Schleswig_Holstein, Thüringen, Ausland

# Concatenate all collections, lower-case the names and remove duplicates.
# The resulting list has no defined order because it is built from a set.
All = Baden_Württemberg + Bayern + Berlin + Brandenburg + Bremen +Hamburg + Hessen + Mecklenburg_Vorpommern + Niedersachsen + Nordrhein_Westfalen + Rheinland_Pfalz + Saarland + Sachsen + Sachsen_Anhalt + Schleswig_Holstein + Thüringen + Ausland
cities = list(set([city.lower() for city in All]))

In [18]:
# Month names in German and English, lower-cased so they compare equal to the
# lower-cased tokens
months = list(map(str.lower, [
    'Januar', 'January',
    'Februar', 'February',
    'März', 'March',
    'April',
    'Mai', 'May',
    'Juni', 'June',
    'Juli', 'July',
    'August',
    'September',
    'Oktober', 'October',
    'November',
    'Dezember', 'December',
]))
print(months)

['januar', 'january', 'februar', 'february', 'märz', 'march', 'april', 'mai', 'may', 'juni', 'june', 'juli', 'july', 'august', 'september', 'oktober', 'october', 'november', 'dezember', 'december']


In [19]:
# Manually curated stopwords, one per line. The with-block closes the file
# handle, which an open() call inside a comprehension never does.
with open(os.path.join(dataDir, 'stopwords_manual.txt')) as f:
    stopwords_manual = [line.rstrip('\n') for line in f]
print(len(stopwords_manual))

844


In [20]:
# Combine all stopword sources (German, English, manual, cities, months) and
# remove duplicates; the cell displays the number of distinct stopwords.
stopwords_all = list(set(stopwords_germ + stopwords_eng + stopwords_manual + cities + months))
len(stopwords_all)

13240

In [None]:
# Rewrite the manual stopword file: merge stopwords_add (words to be added by
# hand) with the current manual list, sort the result and write back only the
# words that are not already covered by the German/English stopwords, the cities
# or the months.
# NOTE(review): the file is written to the working directory, whereas the manual
# list was originally read from dataDir — confirm that this is intended.
stopwords_add = []
stopwords_add = list(set(stopwords_add + stopwords_manual))
checker = list(set(stopwords_germ + stopwords_eng + cities + months))
stopwords_add.sort()
with open('stopwords_manual.txt', 'w') as f:
    for item in stopwords_add:
        if item not in checker:
            f.write("%s\n" % item)

In [None]:
# Reload the manual stopwords from the file in the working directory; the
# with-block closes the file handle again.
with open('stopwords_manual.txt') as f:
    stopwords_manual = [line.rstrip('\n') for line in f]
print(len(stopwords_manual))

# Rebuild the combined, de-duplicated stopword list
stopwords_all = list(set(stopwords_germ + stopwords_eng + stopwords_manual + cities + months))
len(stopwords_all)

In [None]:
def remove_stopwords(words, stopword_list=None):
    """
    This method removes all stopwords from a list of tokens.
    
    Parameters:
    @words (list): Tokens of one document
    @stopword_list (iterable, optional): Words to be removed; when omitted the
                                         notebook-level stopwords_all is used
    
    Returns:
    list: The tokens that are not stopwords, in their original order
    """
    #A set gives constant-time membership tests instead of scanning the whole
    #stopword list once per token
    blocked = set(stopwords_all if stopword_list is None else stopword_list)
    return [word for word in words if word not in blocked]

In [None]:
# Remove stopwords: store the filtered token lists in a new column and keep the
# original "tokenized" column unchanged
df_preprocessedDataset['tokenized_wo_stopwords'] = df_preprocessedDataset['tokenized'].apply(remove_stopwords)

In [None]:
df_preprocessedDataset.head()

In [None]:
# Train the phrase model on the full token lists (stopwords still included)
bigram = gensim.models.Phrases(df_preprocessedDataset['tokenized'].tolist(), min_count=5, threshold=50) # higher threshold fewer phrases.
# Wrap the trained model in a Phraser, which is what gets applied to token lists
bigram_mod = gensim.models.phrases.Phraser(bigram)
def make_bigrams(text):
    """Apply the trained phrase model to one list of tokens and return the result."""
    return bigram_mod[text]

In [None]:
# Form bigrams: apply the phrase model to the stopword-free tokens and store the
# result in a separate column
df_preprocessedDataset['tokenized_w_bigrams'] = df_preprocessedDataset['tokenized_wo_stopwords'].apply(make_bigrams)

In [None]:
df_preprocessedDataset.head()

In [None]:
# Obtain all bigrams: every distinct token that contains an underscore, sorted
# alphabetically. Underscores were stripped during tokenization, so any that
# remain were presumably inserted by the phrase model.
all_lines = df_preprocessedDataset['tokenized_w_bigrams'].tolist()
all_bigrams = sorted({word for words in all_lines for word in words if '_' in word})
print(len(all_bigrams))

In [None]:
# Write the sorted bigrams to a text file in the working directory, one per line
with open('all_bigrams_Approach_2.txt', 'w') as f:
    for item in all_bigrams:
        f.write("%s\n" % item)

In [None]:
# Hand-written "stemmer": maps a substring (key) to the normalized word (value)
# that replaces any token containing that substring.
# tokenize() walks the keys in insertion order and stops at the first match, so
# the order of the entries matters: 'entwick' always wins over the later
# 'entwickler', and short keys such as 'app', 'tech' or 'intern' also match
# longer tokens that merely contain them.
stemmer_own = {
    
    'abgeschlossen': 'abgeschlossen',
    'admin': 'administration',  
    'verwaltung': 'administration',
    'architektur' : 'architekture',
    'agil' : 'agile',
    'analys': 'analyst',
    'app': 'application',
    'anwend' : 'application',
    'automat': 'automate',
   
    
    'consultant' : 'berater',
    'berat': 'berater',
    'bereich' : 'bereich',
    'cisco': 'cisco',
    'konzept' : 'concept',
    'container': 'containerization',
    'contin': 'continuous',
    'zertifi' : 'certificate',
    'certifi' : 'certificate',
    'design' : 'design',
    'engineer' : 'engineer',
    'ingenieur'  : 'engineer',
    'entwick': 'entwicklung',
    'develop': 'entwicklung',
    'device':'device',
    'program': 'entwicklung',
    'entwickler' : 'entwicklung',
    
    'extern': 'external',
    'framework': 'framework',
    'fix': 'fix',
    'globalen': 'global',
    'install' : 'install',
    'schnittstell': 'interface',
    'implement' : 'implementation', 
    'infrastr' : 'infrastructure',
    'informati' : 'informatik',
    'intern': 'internal',
    'integriert' : 'integrate',
    'konfigur': 'konfigure',
    'manage' : 'management',
    'method' : 'method',
    'überwach' : 'monitoring',
    'mobil': 'mobil',
    'betrieb' : 'operation',
    'künstliche': 'künstliche',
    'notebook': 'notebooks',
    'read':'read',
    'write':'write',
    'relational':'relational',
    'master':'master',
    'script':'script',
    'skript':'skript',
    'skale':'scale',
    
    'operat' : 'operation',
    'operie' : 'operation',
    'vorschläg' : 'option',
    'plattform' : 'platform',
    'projec' : 'project',
    'prozess' : 'process',
    'process' : 'process',
    'bearbeitung' : 'process',
    'scrum': 'scrum',
    'softwar': 'software',
    'spezifi' :'specification',
    'specifi' :'specification',
    'unterstützt' : 'support',
    'support' : 'support',
    'system': 'system',
    'anfoder': 'requirement',
    'tech' : 'tech',
    
}

In [None]:
def tokenize(text):
    """Normalize, tokenize and stem text string
    
    Digits and the characters of string.punctuation are replaced by spaces, the
    lower-cased text is split with word_tokenize, stopwords and one-character
    tokens are dropped, and every remaining token that contains a key of
    stemmer_own is replaced by the value of the first matching key (keys are
    tried in the insertion order of the dictionary).
    
    Args:
    text: string. String containing message for processing
       
    Returns:
    cleaned: list of strings. List containing normalized and stemmed word tokens.
             If an IndexError occurs during processing, the tokens collected up to
             that point are returned (an empty list if none were collected).
    """

    # Set lookup instead of scanning the whole stopword list for every token
    blocked = set(stopwords_all)
    # Bound before the try block so the return below can never hit an unbound name
    cleaned = []

    try:
        text = re.sub(r'(\d)',' ',text.lower())
        text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
        tokens = word_tokenize(text)
        for word in tokens:
            if word in blocked or len(word) <= 1:
                continue
            for stemmer_key, stemmed_word in stemmer_own.items():
                if stemmer_key in word:
                    cleaned.append(stemmed_word)
                    break
            else:
                # no stemmer key matched: keep the token unchanged
                cleaned.append(word)

    except IndexError:
        # Best effort, as before: keep whatever was collected so far
        pass

    return cleaned

In [None]:
# Re-tokenize every project text with tokenize() (stopword removal and custom
# stemming included). NOTE: this overwrites the existing "tokenized" column.
t1 = time.time()
df_preprocessedDataset['tokenized'] = df_preprocessedDataset['project'].apply(tokenize)
t2 = time.time()

In [None]:
print("Time taken to prepare", len(df_preprocessedDataset), "projects documents:", (t2-t1)/60, "min")

In [None]:
df_preprocessedDataset.head()

In [None]:
# Flatten the token lists of all documents into one long list of words
all_words_df = [word for tokens in df_preprocessedDataset['tokenized'] for word in tokens]

# Frequency distribution of all words (nltk FreqDist)
fdist_words = FreqDist(all_words_df)
print(len(fdist_words)) # number of unique words
print(type(fdist_words))

In [None]:
# Highest and average frequency over all unique words.
# most_common() returns (word, count) pairs sorted by descending count, so the
# first pair carries the maximum frequency.
total_unique_words = len(fdist_words)
sorted_freqDist_words = fdist_words.most_common()
maxFreq = sorted_freqDist_words[0][1]
print(maxFreq)
freq_values = [count for _, count in sorted_freqDist_words]
avgFreq = np.mean(freq_values)
print(avgFreq)

In [None]:
#Considering words with frequency of 100 or more
top_words = [word for word, count in sorted_freqDist_words if count >= 100]
print(len(top_words))

In [None]:
def most_appeared(text, vocabulary=None):
    """
    This method keeps only the tokens that belong to a given vocabulary.
    
    Parameters:
    @text (list): Tokens of one document
    @vocabulary (iterable, optional): Words to keep; when omitted the
                                      notebook-level top_words is used
    
    Returns:
    list: The tokens found in the vocabulary, in their original order
    """
    #A set gives constant-time membership tests instead of scanning the whole
    #word list once per token
    allowed = set(top_words if vocabulary is None else vocabulary)
    return [word for word in text if word in allowed]

In [None]:
#Reduce the words in the tokenized column to the words with a frequency of at least 100.
#NOTE: this overwrites the "tokenized" column.
df_preprocessedDataset['tokenized'] = df_preprocessedDataset['tokenized'].apply(most_appeared)

In [None]:
df_preprocessedDataset.head(20)

In [None]:
# make sure all tokenized items are lists; this runs first so that len() below is
# only ever applied to lists
df_preprocessedDataset = df_preprocessedDataset[df_preprocessedDataset['tokenized'].map(type) == list]
# only keep documents with at least 10 tokens, otherwise too short
df_preprocessedDataset = df_preprocessedDataset[df_preprocessedDataset['tokenized'].map(len) >= 10]
# re-number the remaining rows; re-binding avoids an in-place change on a filtered slice
df_preprocessedDataset = df_preprocessedDataset.reset_index(drop=True)

In [None]:
print("After cleaning and excluding short aticles, the dataframe now has:", len(df_preprocessedDataset), "articles")

In [None]:
# create a mask of binary values to split into train and test: True (about
# 99.6 % of the rows) selects the training set, False the test set.
# A seeded generator keeps the split reproducible after a kernel restart and
# leaves NumPy's global random state untouched.
msk = np.random.RandomState(42).rand(len(df_preprocessedDataset)) < 0.9960
msk

In [None]:
# Rows selected by the mask form the training set, the remaining rows the test
# set; both get a fresh 0..n-1 index.
train_df = df_preprocessedDataset[msk].reset_index(drop=True)

test_df = df_preprocessedDataset[~msk].reset_index(drop=True)

In [None]:
train_df.head()

In [None]:
def train_lda(data, n=10, random_state=None):
    """
    This function trains the lda model
    We setup parameters like number of topics, the chunksize to use in Hoffman method
    We also do 2 passes of the data since this is a small dataset, so we want the distributions to stabilize

    Parameters:
    data (pandas dataframe): must contain a "tokenized" column holding one list of tokens per document
    n (int): number of topics to train
    random_state (int or None): forwarded to LdaModel; pass an integer to make the
                                training reproducible (default None keeps the previous behaviour)

    Returns:
    tuple: (dictionary, corpus, lda) - the gensim Dictionary built from the tokens,
           the bag-of-words corpus derived from it and the trained LdaModel
    """
    num_topics = n
    chunksize = 300
    dictionary = corpora.Dictionary(data['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0, passes=2,
                   random_state=random_state)
    t2 = time.time()
    # report the size of the data that was actually trained on, not of the global dataframe
    print("Time to train LDA model on ", len(data), "documents: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda

In [None]:
# Train a 10-topic model on the training split
dictionary,corpus,lda = train_lda(train_df, 10)

In [None]:
# Persist the trained model in the working directory
lda.save('LDA_Approach_1.model')

In [None]:
# Load the saved model back from disk.
# NOTE(review): this import repeats names already imported in the first cell.
from gensim import corpora, models, similarities
model =  models.LdaModel.load('LDA_Approach_1.model')

In [None]:
# print all topics with their 20 highest-weighted words
# NOTE(review): up to 20 topics are requested; confirm how gensim handles a
# request for more topics than the model has.
model.show_topics(num_topics=20, num_words=20)

In [None]:
# Pickle the dictionary and the bag-of-words corpus into the working directory
with open('dictionary_LDA_A1', 'wb') as output:
    pickle.dump(dictionary, output)
    
with open('corpus_LDA_A1', 'wb') as output:
    pickle.dump(corpus, output)

In [None]:
# Save model to disk.
from gensim.test.utils import datapath
# NOTE(review): datapath() presumably resolves into gensim's own test-data
# directory — confirm that this is the intended location for the model.
temp_file = datapath("model")
lda.save(temp_file)

# Additionally pickle the model; the with-block closes the file handle, which the
# previous bare open() call never did
with open('model_LDA_A1', 'wb') as output:
    pickle.dump(lda, output)

In [None]:
# Load the model back from the path it was saved to in the previous cell
lda2 = LdaModel.load(temp_file)

In [None]:
# show_topics lists the top num_words words contributing to each of up to num_topics topics
# NOTE(review): 13 topics are requested although the training cell asks for 10 — confirm the intent.
lda.show_topics(num_topics=13, num_words=20)

In [None]:
# Print the top 20 entries of the first two topics (ids 0 and 1)
for t_id in range (2):
    print("TopicID: " + str(t_id))
    topics = lda.show_topic(topicid=t_id, topn=20)
    # each entry is presumably a (word, probability) pair — printed as "word: probability"
    for topic in topics:
        print(topic[0] + ": " + str(topic[1]))
    print()


# Random project from training data

In [None]:
#Select an article at random from train_df
#randint without a size argument returns a single value; the previous size=[1, 1]
#produced a 2-D array, and int() on such an array relies on an array-to-scalar
#conversion that newer NumPy versions deprecate
random_index = int(np.random.randint(len(train_df)))
print(random_index)

In [None]:
# Tokens of the selected document, read by column name instead of by position
# (position 2 silently changes meaning when columns are added or reordered)
data_to_check = train_df['tokenized'].iloc[random_index]
# Bag-of-words representation based on the training dictionary
bow = dictionary.doc2bow(data_to_check)
# Keep only the probability of each (topic id, probability) pair
doc_distribution = np.array([topic[1] for topic in lda.get_document_topics(bow=bow)])

In [None]:
# Show the tokens of the selected document (column addressed by name, not by position)
print(train_df['tokenized'].iloc[random_index])

In [None]:
# Topic distribution of the selected document and its number of entries
print(doc_distribution)
print(len(doc_distribution))
# Indices of the three most probable topics; as the last expression of the cell
# the result is displayed (it was previously computed mid-cell and discarded)
np.argsort(-doc_distribution)[:3]

In [None]:
# bar plot of topic distribution for this document
def plot_topic_dist(doc_distr, index, topic_labels=None):
    """
    This function plots the topic distribution for a given document as a bar chart

    Parameters:
    doc_distr (list of floats): topic probability distribution of the document, one bar per entry
    index (int): index number of the document, only used in the plot title
    topic_labels (list of str, optional): tick labels of the x axis, one per topic;
                                          defaults to the ten hand-assigned topic names below

    Returns:
    None (the return value of plt.show())
    """
    if topic_labels is None:
        topic_labels = ['ERP/SAP','SW_Dev/Web','IT_App_Mgr/SW_Dev_Arch','SW_Dev/DevOps','Sys_Admin/Support', 'IT_Admin_SW/Oracle/Ops','Data/Ops','IT_Process_Mgr/Consultant', 'MS_DEV/Admin','Business_Analyst/Consulting']

    fig, ax = plt.subplots(figsize=(12,8))
    # one bar per topic probability
    ax.bar(np.arange(len(doc_distr)), doc_distr)
    ax.set_xlabel('Topic ID', fontsize=15)
    ax.set_ylabel('Topic Probability Score', fontsize=15)
    ax.set_title("Topic Distribution for Project in Index " + str(index), fontsize=20)
    # one tick per label, so the tick count always matches the label count
    ax.set_xticks(range(len(topic_labels)))
    ax.set_xticklabels(topic_labels, rotation='vertical', fontsize=8)
    fig.tight_layout()
    return plt.show()

In [None]:
plot_topic_dist(doc_distribution, random_index)

In [None]:
# Load the saved model from disk once more, this time as lda_model
lda_model =  models.LdaModel.load('LDA_Approach_1.model')

In [None]:
lda_model.show_topics()

In [None]:
# Topic distribution of the same bag-of-words under the model reloaded from disk
doc_distribution1 = np.array([topic[1] for topic in lda_model.get_document_topics(bow=bow)])
# Position of the largest probability within the array
labels = np.argmax(doc_distribution1)
print(doc_distribution1)