In [None]:
import pandas as pd
import numpy as np

import warnings
# Suppress every warning issued through the `warnings` module for the rest of
# the session (this includes library deprecation warnings).
warnings.filterwarnings("ignore")

#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources needed later: the punkt tokenizer, the averaged
# perceptron tagger and the WordNet corpus.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Load the raw CSV; the file is read with the Windows-1252 (cp1252) codec.
data = pd.read_csv(
    'Data.csv',
    sep=",",
    encoding='cp1252',
)

In [None]:
# Count the missing values in each column.
data.isna().sum()

S.No             0
organization     0
Category        74
text             0
dtype: int64

In [None]:
# Display the loaded DataFrame as the cell output.
data

Unnamed: 0,S.No,organization,Category,text
0,1,PeopleSoft,,Anubhav Kumar Singh \n\n To work in a global...
1,2,PeopleSoft,,Profile Summary: \n?7+ years of exper...
2,3,PeopleSoft,,PeopleSoft Database Administrator\n ...
3,4,PeopleSoft,,Murali\n\nExperience Summary \n\nI have 6 year...
4,5,PeopleSoft,,"Priyanka Ramadoss\n61/46, MountPleasant, \nCoo..."
...,...,...,...,...
69,70,WorkDay,,Workday Integration Consultant\n\nName ...
70,71,WorkDay,,Seeking suitable positions in Workday HCM as ...
71,72,WorkDay,,WORKDAY | HCM | FCM\nName : Kumar S.S\nRole ...
72,73,WorkDay,,Venkateswarlu.B Workday Consultant ...


In [None]:
# Inspect the raw text of the row labelled 55.
data.loc[55, 'text']

"Hari Krishna M\n\n\nSummary:\n?A result oriented professional with 6.10 yrs. of experience in Software Engineering with 3.10 yrs. of relevant experience in Workday Consultant.\n?Exceptional ability in understanding the business needs and improving the process.\n?Excellent communication skills and proven experience in working independently as well as in a team.\n?Involved in preparing business requirement documents and analysis of client functional requirements.\n?Extensive knowledge on Complete Tenant configurations – (Supervisory Organizations, Roles, Compensation, Business Processes)\n?Configuration of Supervisory Organizations, Job Profiles & Positions, Compensation (salary plans based on different grades, grade profiles and allowances), Security and Business Processes.\n?Day to day support of\xa0Workday\xa0HCM, Security, Compensation, reporting issues and implementing enhancements when needed.\n?Created Custom Reports and scheduled reports as requested by end-users.\n?Worked on mo

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. Terms appearing in more than 80% of the documents or in
# fewer than two documents are dropped, as are scikit-learn's built-in English
# stop words.
count_vect = CountVectorizer(
    max_df=0.8,
    min_df=2,
    stop_words='english',
)

# Cast the text column to a unicode array before vectorizing.
raw_documents = data['text'].values.astype('U')
doc_term_matrix = count_vect.fit_transform(raw_documents)

In [None]:
# Show the sparse document-term matrix summary (shape, dtype, stored elements).
doc_term_matrix

<74x2161 sparse matrix of type '<class 'numpy.int64'>'
	with 16785 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Five-topic LDA with a fixed seed so repeated runs give the same topics.
lda_params = {'n_components': 5, 'random_state': 42}
LDA = LatentDirichletAllocation(**lda_params)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=5, random_state=42)

In [None]:
import random

# Spot-check the fitted vocabulary by printing ten randomly chosen terms.
# The feature names are resolved once instead of twice per iteration.
# get_feature_names() was removed in scikit-learn 1.2, so the replacement
# accessor is preferred when the installed version provides it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()

for _ in range(10):
    # random.randint includes BOTH endpoints, so randint(0, len(names)) could
    # return an index one past the end and raise IndexError. randrange
    # excludes the upper bound.
    random_id = random.randrange(len(feature_names))
    print(feature_names[random_id])

movements
multiple
mentor
https
involved
budget
bachelors
27
pub
edge


In [None]:
# Row 0 of the fitted components: the per-term weights of the first topic.
first_topic = LDA.components_[0]

In [None]:
# Vocabulary indices of the ten highest-weighted terms, in ascending order of
# weight (the strongest term is last).
top_topic_words = np.argsort(first_topic)[-10:]
top_topic_words

array([1539,  404, 1961,  897,  853,  734, 1984, 1413, 1411,  197])

In [None]:
# Map the top term indices of the first topic back to their words.
# The vocabulary is resolved once rather than rebuilt on every iteration.
# get_feature_names() was removed in scikit-learn 1.2, so the replacement
# accessor is preferred when the installed version provides it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()

for word_index in top_topic_words:
    print(feature_names[word_index])

ps
code
testing
good
fscm
engine
tools
peoplesoft
people
application


In [None]:
# Print the ten highest-weighted words of every LDA topic.
# The vocabulary is resolved once rather than rebuilt for every word lookup.
# get_feature_names() was removed in scikit-learn 1.2, so the replacement
# accessor is preferred when the installed version provides it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()

for topic_index, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{topic_index}:')
    # A distinct name for the word index avoids shadowing the topic index.
    print([feature_names[word_index] for word_index in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['ps', 'code', 'testing', 'good', 'fscm', 'engine', 'tools', 'peoplesoft', 'people', 'application']


Top 10 words for topic #1:
['queries', 'developer', 'functions', 'development', 'database', 'team', 'services', 'server', 'data', 'sql']


Top 10 words for topic #2:
['scheduler', 'domains', 'oracle', 'web', 'process', 'servers', 'database', 'server', 'application', 'peoplesoft']


Top 10 words for topic #3:
['custom', 'report', 'hcm', 'core', 'worked', 'eib', 'reports', 'integrations', 'business', 'workday']


Top 10 words for topic #4:
['bootstrap', 'technologies', 'css3', 'developer', 'html', 'ui', 'javascript', 'web', 'js', 'react']




In [None]:
# Per-document topic weights from the fitted LDA model: one row per document,
# one column per topic.
topic_values = LDA.transform(doc_term_matrix)
topic_values.shape

(74, 5)

In [None]:
# Label each document with the index of its highest-weighted topic.
data['Topic'] = np.argmax(topic_values, axis=1)

In [None]:
# Preview up to the first 15 rows, including the assigned Topic column.
data.head(15)

Unnamed: 0,S.No,organization,Category,text,Topic
0,1,PeopleSoft,,Anubhav Kumar Singh \n\n To work in a global...,2
1,2,PeopleSoft,,Profile Summary: \n?7+ years of exper...,2
2,3,PeopleSoft,,PeopleSoft Database Administrator\n ...,2
3,4,PeopleSoft,,Murali\n\nExperience Summary \n\nI have 6 year...,2
4,5,PeopleSoft,,"Priyanka Ramadoss\n61/46, MountPleasant, \nCoo...",2
5,6,PeopleSoft,,PROFILE SUMMARY\n\nI have overall 6.8 years’ e...,2
6,7,PeopleSoft,,PEOPLESOFT Administrator\n\n\nSRINIVAS.K ...,2
7,8,PeopleSoft,,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...,2
8,9,PeopleSoft,,Vinod Akkala ...,2
9,10,PeopleSoft,,PeopleSoft Admin/PeopleSoft DBA\n\nGanesh...,2


# NMF for Topic Modeling 

In [None]:
import pandas as pd
import numpy as np

# Reload the raw CSV so the NMF section starts from a frame without the
# 'Topic' column added earlier.
data = pd.read_csv('Data.csv', sep=",", encoding='cp1252')

# NOTE(review): only the first five rows are kept, so NMF below is fitted on
# five documents. Confirm this is intended and not a debugging leftover.
data = data.head()

# dropna() returns a new frame, so the original bare `data.dropna()` call
# discarded its result and did nothing. The result is now assigned, and the
# check is limited to the 'text' column: 'Category' is missing for every row
# (see the isna() summary), so dropping on all columns would empty the frame.
data = data.dropna(subset=['text'])
data

Unnamed: 0,S.No,organization,Category,text
0,1,PeopleSoft,,Anubhav Kumar Singh \n\n To work in a global...
1,2,PeopleSoft,,Profile Summary: \n?7+ years of exper...
2,3,PeopleSoft,,PeopleSoft Database Administrator\n ...
3,4,PeopleSoft,,Murali\n\nExperience Summary \n\nI have 6 year...
4,5,PeopleSoft,,"Priyanka Ramadoss\n61/46, MountPleasant, \nCoo..."


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features. Terms appearing in more than 80% of the documents or in
# fewer than two documents are dropped, as are scikit-learn's built-in English
# stop words.
tfidf_vect = TfidfVectorizer(
    max_df=0.8,
    min_df=2,
    stop_words='english',
)

# Cast the text column to a unicode array before vectorizing.
raw_documents = data['text'].values.astype('U')
doc_term_matrix = tfidf_vect.fit_transform(raw_documents)

In [None]:
from sklearn.decomposition import NMF

# Five-component NMF with a fixed seed so repeated runs give the same factors.
nmf_params = {'n_components': 5, 'random_state': 42}
nmf = NMF(**nmf_params)
nmf.fit(doc_term_matrix)

NMF(n_components=5, random_state=42)

In [None]:
import random

# Spot-check the TF-IDF vocabulary by printing ten randomly chosen terms.
# The feature names are resolved once instead of twice per iteration.
# get_feature_names() was removed in scikit-learn 1.2, so the replacement
# accessor is preferred when the installed version provides it.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vect.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vect.get_feature_names()

for _ in range(10):
    # random.randint includes BOTH endpoints, so randint(0, len(names)) could
    # return an index one past the end and raise IndexError. randrange
    # excludes the upper bound.
    random_id = random.randrange(len(tfidf_feature_names))
    print(tfidf_feature_names[random_id])

latest
designer
elm
installing
pvt
challenging
files
having
linux
tier


In [None]:
# Per-term weights of the first NMF component, then the vocabulary indices of
# its ten highest-weighted terms (ascending, so the strongest term is last).
first_topic = nmf.components_[0]
top_topic_words = np.argsort(first_topic)[-10:]

In [None]:
# Map the top term indices of the first NMF topic back to their words.
# The vocabulary is resolved once rather than rebuilt on every iteration.
# get_feature_names() was removed in scikit-learn 1.2, so the replacement
# accessor is preferred when the installed version provides it.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vect.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vect.get_feature_names()

for word_index in top_topic_words:
    print(tfidf_feature_names[word_index])

administrator
multiple
management
taking
professional
sql
installed
dba
configured
oracle


In [None]:
# Print the ten highest-weighted words of every NMF topic.
# The vocabulary is resolved once rather than rebuilt for every word lookup.
# get_feature_names() was removed in scikit-learn 1.2, so the replacement
# accessor is preferred when the installed version provides it.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vect.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vect.get_feature_names()

for topic_index, topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{topic_index}:')
    # A distinct name for the word index avoids shadowing the topic index.
    print([tfidf_feature_names[word_index] for word_index in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['administrator', 'multiple', 'management', 'taking', 'professional', 'sql', 'installed', 'dba', 'configured', 'oracle']


Top 10 words for topic #1:
['monitoring', 'batch', 'worked', 'control', 'management', 'services', 'non', 'profile', '2016', 'databases']


Top 10 words for topic #2:
['weekly', 'performing', 'support', 'issues', 'reporting', 'platforms', 'various', 'related', 'hcm', 'involved']


Top 10 words for topic #3:
['knowledge', 'admin', 'support', 'working', 'peopletools', 'various', 'applications', 'linux', 'new', 'upgrade']


Top 10 words for topic #4:
['tier', 'users', 'performing', 'oracle', 'updates', 'troubleshooting', 'fixes', 'dba', 'performed', 'issues']




In [None]:
# Project the documents onto the NMF components, label each document with the
# index of its strongest component, and preview the result.
topic_values = nmf.transform(doc_term_matrix)
data['Topic'] = np.argmax(topic_values, axis=1)
data.head()

Unnamed: 0,S.No,organization,Category,text,Topic
0,1,PeopleSoft,,Anubhav Kumar Singh \n\n To work in a global...,3
1,2,PeopleSoft,,Profile Summary: \n?7+ years of exper...,4
2,3,PeopleSoft,,PeopleSoft Database Administrator\n ...,0
3,4,PeopleSoft,,Murali\n\nExperience Summary \n\nI have 6 year...,2
4,5,PeopleSoft,,"Priyanka Ramadoss\n61/46, MountPleasant, \nCoo...",1
