In [1]:
import pandas as pd

In [2]:
jobs_data = pd.read_csv('job_skills.csv') 

In [3]:
jobs_data['job_skills'][0]

'Building Custodial Services, Cleaning, Janitorial Services, Materials Handling, Housekeeping, Sanitation, Waste Management, Floor Maintenance, Equipment Maintenance, Safety Protocols, Communication Skills, Attention to Detail, Physical Strength, Experience in Housekeeping'

In [4]:
jobs_data[~jobs_data['job_skills'].isna() & jobs_data['job_skills'].str.contains('agriculture', na=False, case=False)]

Unnamed: 0,job_link,job_skills
68,https://www.linkedin.com/jobs/view/research-as...,"Agronomy, Animal Science, Farm Equipment Opera..."
91,https://www.linkedin.com/jobs/view/production-...,"Production, Trimming, Harvesting, Packaging, L..."
449,https://ca.linkedin.com/jobs/view/senior-accou...,"Senior Accountant, General Accounting, Financi..."
1490,https://www.linkedin.com/jobs/view/manager-mai...,"Farming, Food production, Warehouse experience..."
3572,https://www.linkedin.com/jobs/view/part-time-c...,"Customer service, Sales ability, Food preparat..."
...,...,...
1292955,https://www.linkedin.com/jobs/view/fresh-marke...,"Agriculture, Agronomy, Irrigation, Nutrient ma..."
1293175,https://www.linkedin.com/jobs/view/part-time-l...,"Agricultural Engineering, Mechanized Agricultu..."
1294742,https://www.linkedin.com/jobs/view/production-...,"Production Operations, Processing, Ingredient ..."
1294752,https://www.linkedin.com/jobs/view/transmissio...,"Vegetation Management, Forestry, Arboriculture..."


In [5]:
jobs_data.shape

(1296381, 2)

In [6]:
jobs_data[~jobs_data['job_skills'].isna() & jobs_data['job_skills'].str.contains('docker', na=False, case=False)]

Unnamed: 0,job_link,job_skills
185,https://www.linkedin.com/jobs/view/senior-soft...,"Java, Linux, Docker, Kubernetes, AWS, Software..."
387,https://www.linkedin.com/jobs/view/platform-en...,"Software Development, Agile, ServiceNow, Incid..."
408,https://uk.linkedin.com/jobs/view/software-eng...,"C, C++, Java, SQL, JavaRx, LLDB, GDB, RDMA, Do..."
1442,https://www.linkedin.com/jobs/view/prisma-clou...,"Solutions Architect, DevOps Engineering, Prism..."
1496,https://www.linkedin.com/jobs/view/senior-engi...,"Cloud administration, Google Cloud Platform (G..."
...,...,...
1295677,https://www.linkedin.com/jobs/view/senior-soft...,"Software Engineering, Full Stack Development, ..."
1295796,https://www.linkedin.com/jobs/view/staff-fulls...,"Node.js, TypeScript, Software Development, RES..."
1296029,https://uk.linkedin.com/jobs/view/backend-soft...,"Python, Programming languages, Product shippin..."
1296068,https://www.linkedin.com/jobs/view/senior-soft...,"JavaScript, Java, Python, Go, TypeScript, SQL,..."


In [3]:
df = jobs_data[~jobs_data['job_skills'].isna()]

In [4]:
df['job_skills'] = df['job_skills'].apply(lambda x : x.lower().strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['job_skills'] = df['job_skills'].apply(lambda x : x.lower().strip())


In [6]:
sample_data = df.sample(700000) 

In [7]:
sample_data[sample_data['job_skills'].str.contains('sql',case=False)]

Unnamed: 0,job_link,job_skills
830404,https://www.linkedin.com/jobs/view/professiona...,"mongodb, customer success, sales, technical, c..."
242128,https://www.linkedin.com/jobs/view/consulting-...,"actuarial, pricing methodology, analytics, it,..."
394466,https://www.linkedin.com/jobs/view/senior-soft...,"software engineering, microservices, cloudnati..."
946935,https://www.linkedin.com/jobs/view/full-stack-...,"agile methodology, software development, web a..."
868278,https://www.linkedin.com/jobs/view/sr-software...,"c#, ms sql server, windows operating systems, ..."
...,...,...
9106,https://uk.linkedin.com/jobs/view/senior-java-...,"java, j2ee, spring framework, jdbc, sql, xml, ..."
504940,https://www.linkedin.com/jobs/view/senior-data...,"python, linux, sql, vertica, mysql, etl, jira,..."
1242436,https://ca.linkedin.com/jobs/view/technical-an...,"sql, cloud infrastructure, relational database..."
673276,https://www.linkedin.com/jobs/view/technical-p...,"data privacy, data platform, data governance, ..."


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  
import string

stop_words = set(stopwords.words('english'))
def custom_tokenizer(text):
    text = text.lower()  # Convert to lowercase
    # Remove punctuation
    text = ''.join(c for c in text if c not in string.punctuation)
    # Remove numbers (optional)
    remove_numbers = True  # Set to False to keep numbers
    if remove_numbers:
        text = ''.join(c for c in text if c not in string.digits)
    words = word_tokenize(text)
    # Filter out stopwords (if using a custom list)
    if stop_words:
        words = [word for word in words if word not in stop_words]
    return words
    
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)

tfidf_matrix = vectorizer.fit_transform(df['job_skills'])






In [9]:
num_topics = 100
nmf_model = NMF(n_components=num_topics, random_state=32)
W = nmf_model.fit_transform(tfidf_matrix)
H = nmf_model.components_

top_k_words = 25
feature_names = vectorizer.get_feature_names_out()
topics = {}
for topic_idx, topic in enumerate(H):
  top_features_ind = topic.argsort()[:-top_k_words - 1:-1]
  top_features = [feature_names[i] for i in top_features_ind]
  topics[topic_idx] = top_features
  print(f"Topic {topic_idx}: {', '.join(top_features)}")

# Step 7: Assign Topics to Jobs
sample_data['top_topics'] = W.argmax(axis=1)
print(sample_data[['job_skills', 'top_topics']])

Topic 0: management, time, budget, inventory, vendor, performance, change, case, supply, contract, risk, chain, procurement, people, stakeholder, resource, logistics, supplier, property, budgeting, stress, order, kpi, organization, purchasing
Topic 1: care, home, acute, hospice, clinical, wound, palliative, compassion, intensive, primary, unit, coordination, critical, services, interdisciplinary, compassionate, patientcentered, case, plans, urgent, hospital, endoflife, empathy, rehabilitation, resident
Topic 2: ability, independently, meet, handle, deadlines, part, pressure, follow, learn, use, lift, communicate, work, maintain, instructions, effectively, perform, manage, adapt, tasks, multiple, multitask, operate, strong, read
Topic 3: food, preparation, sanitation, kitchen, cooking, culinary, menu, hygiene, cleaning, beverage, service, servsafe, haccp, handler, catering, teamwork, storage, recipe, safety, chef, handling, cleanliness, handlers, serving, permit
Topic 4: insurance, visi

ValueError: Length of values (1294296) does not match length of index (700000)

## Individual elements testing

In [10]:
qa_engineer_skills = [
    "Manual Testing",
    "Automation Testing",
    "Test Case Development",
    "Defect Tracking and Management",
    "Software Development Life Cycle (SDLC)",
    "Version Control Systems",
    "Performance Testing",
    "Security Testing",
    "Mobile Testing",
    "API Testing",
    "Continuous Integration/Continuous Deployment (CI/CD)",
    "Soft Skills",
    "Test Automation Frameworks",
    "DevOps Practices",
    "Programming Skills",
    "Database/SQL",
    "Continuous Learning"
]

In [11]:
data_analyst ="""Develop, implement, and maintain leading-edge analytics systems, taking complicated problems and building simple frameworks
Identify trends and opportunities for growth through analysis of complex datasets
Evaluate organizational methods and provide source-to-target mappings and information-model specification documents for datasets
Create best-practice reports based on data mining, analysis, and visualization
Evaluate internal systems for efficiency, problems, and inaccuracies, and develop and maintain protocols for handling, processing, and cleaning data
Work directly with managers and users to gather requirements, provide status updates, and build relationships
Responsibilities
Work closely with project managers to understand and maintain focus on their analytics needs, including critical metrics and KPIs, and deliver actionable insights to relevant decision-makers
Proactively analyze data to answer key questions for stakeholders or yourself, with an eye on what drives business performance, and investigate and communicate which areas need improvement in efficiency and productivity
Create and maintain rich interactive visualizations through data interpretation and analysis, with reporting components from multiple data sources
Define and implement data acquisition and integration logic, selecting an appropriate combination of methods and tools within the defined technology stack to ensure optimal scalability and performance of the solution
Develop and maintain databases by acquiring data from primary and secondary sources, and build scripts that will make our data evaluation process more flexible or scalable across datasets
Required skills and qualifications
Three or more years of experience mining data as a data analyst
Proven analytics skills, including mining, evaluation, and visualization
Technical writing experience in relevant areas, including queries, reports, and presentations
Strong SQL or Excel skills, with aptitude for learning other analytics tools
Preferred skills and qualifications
Bachelor’s degree (or equivalent) in mathematics, computer science, economics, or statistics
Experience with database and model design and segmentation techniques
Strong programming experience with frameworks, including XML, JavaScript, and ETL
Practical experience in statistical analysis through the use of statistical packages, including Excel, SPSS, and SAS
Proven success in a collaborative, team-oriented environment""" 

In [12]:
def get_top_n_topics_for_text(text, vectorizer, nmf_model, n=10):
    # Preprocess and vectorize the text
    tfidf_vector = vectorizer.transform([text])
    
    # Apply the NMF model to the vectorized text
    W_text = nmf_model.transform(tfidf_vector)
    
    # Get the top N topics for the text
    topic_strengths = W_text[0]
    top_topic_indices = topic_strengths.argsort()[-n:][::-1]
    return top_topic_indices
def get_representative_skills(job_description, vectorizer, nmf_model, topics, top_n=10, top_k_words=25):
    top_topics = get_top_n_topics_for_text(job_description, vectorizer, nmf_model, top_n)
    representative_skills = set()
    for topic_idx in top_topics:
        representative_skills.update(topics[topic_idx][:top_k_words])
    return list(representative_skills)
    
# Example usage: Get the top 10 topics for a new text
new_text = ' '.join(qa_engineer_skills)
top_n = 10
representative_skills = get_representative_skills(new_text, vectorizer, nmf_model, topics, top_n=3, top_k_words=10)
print(f"Representative skills for a QA Engineer: {', '.join(representative_skills)}")


Representative skills for a QA Engineer: requirements, cybersecurity, network, system, security, interpersonal, incident, unit, clearance, user, information, organizational, testing, strong, communication, documentation, integration, test, analytical, skills, cyber, infrastructure, risk, problemsolving, automation, excellent, mathematical, presentation, functional


For the QA engineer, it seems that the skills matched are good and represent the topic.

In [13]:
representative_skills = get_representative_skills(data_analyst, vectorizer, nmf_model, topics, top_n=10, top_k_words=10)


In [14]:
print(f"Representative skills for a data analyst: {', '.join(representative_skills)}")


Representative skills for a data analyst: retail, processing, safe, erp, tableau, industry, public, microsoft, previous, speaking, excel, root, noise, cost, information, research, testing, environmental, documentation, fastpaced, analytical, preferred, two, skills, problemsolving, mathematical, presentation, functional, player, systems, electronic, visualization, prior, one, related, outlook, sap, years, cause, unit, experience, setting, forecasting, intelligence, organizational, teams, age, year, market, integration, access, collaborative, grant, relevant, operating, passion, supervisory, bi, attitude, interpersonal, powerpoint, positive, user, team, pos, analysis, statistical, writing, data, collection, excellent, automation, analytics, sql, minimum, word, requirements, system, field, environment, technical, reporting, proposal, months, strong, communication, fire, entry, test, epic, ms, report


Same thing for the data analyst job, but it seems that it tends to ingore some hard technical skills, like python, power bi, mysql etc ... But we can be optimistic about the results since that it captures most of the important words. 

In [15]:
web_developer = """  Write well designed, testable, efficient code by using best software development practices
Create website layout/user interface by using standard HTML/CSS practices
Integrate data from various back-end services and databases
Gather and refine specifications and requirements based on technical needs
Create and maintain software documentation
Be responsible for maintaining, expanding, and scaling our site
Stay plugged into emerging technologies/industry trends and apply them into operations and activities
Cooperate with web designers to match visual design intent
Requirements and skills
Proven working experience in web programming
Top-notch programming skills and in-depth knowledge of modern HTML/CSS
Familiarity with at least one of the following programming languages: PHP, ASP.NET, Javascript or Ruby on Rails
A solid understanding of how web applications work including security, session management, and best development practices
Adequate knowledge of relational database systems, Object Oriented Programming and web application development
Hands-on experience with network diagnostics, network analytics tools
Basic knowledge of Search Engine Optimization process
Aggressive problem diagnosis and creative problem solving skills
Strong organizational skills to juggle multiple tasks within the constraints of  timelines and budgets with business acumen
Ability to work and thrive in a fast-paced environment, learn rapidly and master diverse web technologies and techniques.
BS in computer science or a related field"""

In [16]:
representative_skills = get_representative_skills(web_developer, vectorizer, nmf_model, topics, top_n=3, top_k_words=20)


In [17]:
print(f"Representative skills for a web developer: {', '.join(representative_skills)}")


Representative skills for a web developer: threat, cloud, cybersecurity, dod, response, listening, safe, network, system, decisionmaking, security, abilities, interpersonal, motivational, attitude, player, inclusive, secret, pc, environment, diverse, math, incident, noise, networking, clearance, prioritization, administrative, vulnerability, us, positive, supportive, time, information, organizational, strong, paced, weather, communication, team, working, dynamic, fastpaced, work, analytical, access, multitasking, collaborative, skills, cyber, infrastructure, risk, problemsolving, excellent, mathematical, windows, passion, presentation, irs


For the web developer job, it seems that the model is struggling to find a clear topic, and ignores all the hard skills like html, css and javascript. 

In [18]:
rayen = """Education : related courses : 
- statistical analysis 
- machine learning 
- deep learning 
- cloud services  
- probability and statistics 
Experience : 
machine learning intern : 
● Identified business challenges and opportunities within the recruitment process and applied Natural Language
Processing techniques to develop an innovative Application Tracking System. This system efficiently ranked
candidates for specific job descriptions, using named entity recognition and word embedding, resulting in a
significant reduction in the time spent on candidate selection.
● Orchestrated a comprehensive data acquisition strategy to harvest and preprocess candidate application data
with precision.
● Collaborated closely with HR and engineering teams to deploy the system using FastAPI and Vue.js, ensuring a
user-friendly interface for recruiters, and precisely mapped out a plan for the next 2 upcoming versions and
enhancements.
Buisness developer at enactus fst el manar : 
●Contributed to product creation, market evaluation, and customer segmentation while driving
clients' behavior analysis.
● Collaborated effectively with the sales team to develop a robust sales strategy for optimal results in 2 different
projects : Moonray, and Student plus
Junior machine learning engineer at omdena : 
● Played a pivotal role in the successful development and deployment of a computer vision system using deep
learning techniques ( pytorch ) in Egyptian orphanages, effectively tackling the issue of constrained monitoring.
● Gained expertise in deploying accurate solutions with low computational capabilities using python.
● Helped improve the well-being of more than 220 children.
Data analyst : omdena : 
● Successfully extracted and delivered actionable insights from a diverse dataset as part of the Omdena initiative for Peru's Open Data Platform, enabling positive transformation in Lima and significantly enhancing the quality of life for its residents. 
● Applied Python techniques for data collection and preprocessing, and Power BI for visualization dashboards, yielding critical insights into aggression trends and contributing to targeted interventions in Lima.
IT consultant at optima junior entreprise:  
● Utilized Vue.js, Laravel, and MySQL to successfully aggregate diverse data sources, standardize data storage, and develop interactive dashboards, leading to enhanced data management efficiency, elevated data quality, and user-friendly data accessibility.
Data science unity manager engineers spark : 
● Contributed To building a training strategy In the field of Data science for college students. 
● Executed 7 training sessions which significantly elevated students' proficiency in data science, achieving a remarkable 60% knowledge enhancement, coupled with an impressive satisfaction rate exceeding 85%.
Projects : 
Student performance prediction and analysis : 
Utilized and combined various machine learning techniques such as bagging and boosting with Python to predict
the performance of high school students.
● Employed Local and global interpretation techniques with Python to conduct in-depth analysis of the
inter-relations among various educational factors for performance prediction.
● Conducted A/B testing on a cohort of 1000 high school students to assess the impact of different educational
interventions and refine performance prediction models. 
Sales analysis : 
This project analyzes sales data with 3 interactive and clear dashboards. It summarizes sales indicators, highlights customer preferences, and details product categories/specifications driving valuable insights to inform business decisions. 
Virtual accounting firm : 
The platform aims to automate a workflow using angular. Additionally, it allows for better tracking of the accounting firm's resources and ensures more accurate and transparent billing for clients. It operates as a virtual office. Therefore, the principle is to automate various tasks that are typically performed in a "physical" office."""

In [19]:
representative_skills = get_representative_skills(rayen, vectorizer, nmf_model, topics, top_n=3, top_k_words=20)


In [20]:
print(f"Representative skills for a data analyst: {', '.join(representative_skills)}")


Representative skills for a data analyst: online, learning, platform, requirements, inspection, subjectmatter, collaboration, system, agile, small, science, application, trend, unit, expertise, verification, cause, root, cost, forecasting, reporting, large, user, validation, personality, variance, pricing, personalization, instant, testing, research, acceptance, group, adaptive, strategic, process, api, documentation, analysis, statistical, market, assessments, ai, test, integration, personalized, analytical, lesson, live, automated, data, troubleshooting, gathering, risk, automation, functional, presentation, classes, quantitative, tutoring


This example is a combination between data analysis and machine learning and resulted in a more general topic representation. Not much noise was introduced, but as mentioned before, there is a lack of hard technical keyword capturing, this may be due to how tf-idf penalizes frequent elements.

In [21]:
graphic_designer = """ Collaborating with multiple teams across the company, the graphic designer should be able to take written or spoken ideas and convert them into a design that connects. The successful candidate will have a thorough understanding of branding and marketing, and be able to find the right style and layout for every project.

Objectives of this role
Work on a wide range of projects and media, using various software programs to visualize and develop innovative graphic designs that meet business goals
Obtain input from managers to ensure that designs meet organizational standards and brand expectations, express ideas accurately, and represent the company or client appropriately
Work independently as well as cooperatively with marketing team to meet deadlines, stay within budget, and schedule project implementation based on workload, which may include five or more simultaneous projects
Examine existing processes, identify flaws, and create solutions that improve design capabilities
Update and maintain internal databases for designs, photography, and video
Responsibilities
Collaborate, brainstorm, and strategize with multiple teams or clients on a wide range of materials that may include web pages, presentations, programming collateral, signage, internal communications, newsletters, and marketing materials
Translate strategic direction into high-quality design within an established brand identity
Develop concepts by hand or with software, and execute original content by determining the ideal usage of color, text, font style, imagery, and layout
Manage the design and uploading process for all project materials, based on best practices for using a content management system
Use trend intelligence and knowledge of historical and current markets when designing and executing specific classifications
Required skills and qualifications
Exceptional creativity and innovative design skills
Five or more years of experience (academic and professional) with design software, including Illustrator, InDesign, Photoshop, Dreamweaver""" 

In [22]:
representative_skills = get_representative_skills(graphic_designer, vectorizer, nmf_model, topics, top_n=3, top_k_words=20)


In [25]:
print(f"Representative skills for a graphic designer: {', '.join(representative_skills)}")


Representative skills for a graphic designer: layout, modeling, creative, processing, content, marketing, architecture, agile, interior, cad, ux, public, property, autocad, applications, c, product, seo, relations, prototyping, media, social, creation, embedded, standards, photoshop, user, graphic, design, adobe, digital, editing, event, spreadsheet, database, languages, software, coding, objectoriented, crm, campaign, architectural, integration, programming, hardware, configuration, debugging, advertising, google, revit, communications, strategy, specifications, structural, analytics, tools, brand


For the graphic designer post, the model captured many important keywords, but it introduced some confusion with analytical topics, this might be due to the confusion with presentation skills.

In [26]:
Agriculture = """Here are some of the main activities and tasks that Agricultural representatives, consultants and specialists have to perform, and some of the physical demands they involve:

Provide counselling and advisory services to farmers on crop cultivation and fertilization, harvesting, animal and poultry care, disease prevention, farm management, farm financing, marketing and other agricultural subjects
Prepare and conduct advisory information sessions and lectures for farmers and other groups
Conduct research, analyze agricultural data and prepare research reports
Liaise with researchers, educators and government or business managers on matters pertaining to farming and agriculture
Maintain records of services provided and the effects of advice given
May operate unmanned aerial vehicle/drone to map drainage, plant seeds, analyze crop health and identify areas of stress and evenly spray fertilizers or pesticides.""" 


In [27]:
representative_skills = get_representative_skills(Agriculture, vectorizer, nmf_model, topics, top_n=3, top_k_words=20)


In [28]:
print(f"Representative skills for an agriculture worker: {', '.join(representative_skills)}")


Representative skills for an agriculture worker: acute, requirements, case, law, jd, industry, acumen, finance, intensive, doctor, consulting, commercial, patientcentered, compassion, clinical, juris, unit, administration, primary, interdisciplinary, intelligence, chain, business, research, technology, home, compassionate, real, contract, strategic, litigation, process, legal, market, wound, writing, client, analytical, coordination, negotiation, estate, trial, hospice, strategy, care, advocacy, palliative, urgent, critical, court, drafting, analytics, presentation, bar, plans, services, supply


## Testing using coherence score

In [30]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.3.2-cp311-cp311-win_amd64.whl (24.0 MB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7.0.4-py3-none-any.whl (61 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Using cached wrapt-1.16.0-cp311-cp311-win_amd64.whl (37 kB)
Installing collected packages: wrapt, smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-7.0.4 wrapt-1.16.0



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
from gensim.models.coherencemodel import CoherenceModel


ImportError: cannot import name 'triu' from 'scipy.linalg' (C:\Users\rayen\Desktop\programming\big_data\torch_env\Lib\site-packages\scipy\linalg\__init__.py)