In [1]:
import requests
from bs4 import BeautifulSoup

# Function to get HTML content of a LinkedIn job search page
def get_html_content(url, timeout=10):
    """Download a page with browser-like headers and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {
        "User-Agent": "Your User Agent",
        "Accept-Language": "en-US,en;q=0.5",
        # Only advertise encodings that can always be decoded; "br" needs an
        # optional brotli package and otherwise yields unreadable text.
        "Accept-Encoding": "gzip, deflate",
        "DNT": "1",  # Do Not Track Request Header
        "Connection": "close"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the HTML parser.
    response.raise_for_status()
    return response.text

# Function to extract job roles and descriptions from HTML content
def _find_text(parent, tag, class_name):
    """Return the stripped text of the first matching descendant, or '' if absent."""
    element = parent.find(tag, class_=class_name)
    return element.text.strip() if element is not None else ''


def extract_jobs_and_descriptions(html_content):
    """Parse job cards out of a job-search HTML page.

    Args:
        html_content: HTML markup of the search results page.

    Returns:
        A list with one dict per job card, holding the keys ``title``,
        ``company``, ``location`` and ``description``. A field whose element
        is not present in the card is returned as an empty string, so a
        single incomplete card no longer aborts the whole extraction.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # LinkedIn job listing class names may change over time. This is a placeholder example.
    job_cards = soup.find_all('div', class_='job-card-container')

    return [
        {
            'title': _find_text(job_card, 'h3', 'job-card-list__title'),
            'company': _find_text(job_card, 'h4', 'job-card-container__company-name'),
            'location': _find_text(job_card, 'span', 'job-card-container__metadata-item'),
            'description': _find_text(job_card, 'p', 'job-card-container__description'),
        }
        for job_card in job_cards
    ]

# Sample LinkedIn search address (real results require an authenticated session)
url = "https://www.linkedin.com/jobs/search/?keywords=software%20engineer"

# Download the page, then pull the job cards out of the markup.
html_content = get_html_content(url)
jobs = extract_jobs_and_descriptions(html_content)

# Show every extracted job as a small block followed by a separator line.
for job in jobs:
    print(
        f"Job Title: {job['title']}",
        f"Company: {job['company']}",
        f"Location: {job['location']}",
        f"Description: {job['description']}",
        '-' * 20,
        sep="\n",
    )


In [3]:
import pandas as pd

# Step 1: Read the Excel file
file_path = 'C:/Users/rsrsp/Downloads/jobs.xlsx'
output_file = 'C:/Users/rsrsp/Downloads/output.xlsx'  # Change this to your desired output file name
df = pd.read_excel(file_path)

# Step 2: Merge the first 10 columns into one column.
# Columns are selected by position so duplicated header names cannot return a
# whole Series for one cell, and missing cells are skipped so the merged text
# does not contain the literal word "nan".
columns_to_merge = df.iloc[:, :10]
df['Merged'] = [
    ' '.join(str(value) for value in row if pd.notna(value))
    for row in columns_to_merge.itertuples(index=False, name=None)
]

# Step 3: Save to a new Excel file
df.to_excel(output_file, index=False)

print(f'Merged data saved to {output_file}')

Merged data saved to C:/Users/rsrsp/Downloads/output.xlsx


In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Load the data
# The CSV is expected to provide 'Job Title' and 'Job Description' columns
data = pd.read_csv('C:/Users/rsrsp/Downloads/jobs.csv')

# Combine the title and description into a single text column.
# Missing cells are replaced with '' first: string + NaN yields NaN, which
# would otherwise blank the whole row and break the later text preprocessing.
data['Text'] = (
    data['Job Title'].fillna('').astype(str)
    + ' '
    + data['Job Description'].fillna('').astype(str)
)

# Step 2: Text Preprocessing
def preprocess_text(text):
    """Lower-case, tokenize and filter a piece of text.

    Args:
        text: Raw text. Anything that is not a string (for example a missing
            value read from a file) is treated as empty text.

    Returns:
        The purely alphabetic, non-stop-word tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Build the stop-word collection once per call as a set; looking it up
    # inside the filter re-created the full list for every single token.
    stop_words = set(stopwords.words('english'))

    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words and anything that is not purely alphabetic
    return ' '.join(
        token for token in tokens
        if token.isalpha() and token not in stop_words
    )

data['Text'] = data['Text'].apply(preprocess_text)

# Step 3: Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features
X = vectorizer.fit_transform(data['Text']).toarray()

# Step 4: Model Training with KMeans for clustering
# n_init is pinned explicitly: the library default for it changes between
# scikit-learn releases, which would silently change the clusters (and emits
# a warning, as seen in the recorded output of this cell).
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)  # Adjust the number of clusters as needed
data['Cluster'] = kmeans.fit_predict(X)

# Output the clustered data
print(data[['Job Title', 'Job Description', 'Cluster']].head())

# Step 5: Prediction (example with new data)
# New text must go through the same preprocessing and the already-fitted
# vectorizer so that its features line up with the training matrix.
new_data = ["Software Engineer developing AI models"]
new_data = [preprocess_text(text) for text in new_data]
new_X = vectorizer.transform(new_data).toarray()
cluster_prediction = kmeans.predict(new_X)
print("Cluster predictions for new data:", cluster_prediction)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rsrsp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rsrsp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  super()._check_params_vs_input(X, default_n_init=10)


                   Job Title  \
0             Cloud Engineer   
1       Full Stack Developer   
2      Cybersecurity Analyst   
3               Data Analyst   
4  Machine Learning Engineer   

                                     Job Description  Cluster  
0  Implement and manage cloud infrastructure. Opt...        1  
1  Develop front-end and back-end components of w...        1  
2  Monitor and protect against cyber threats. Con...        1  
3  Analyze data for insights. Create visualizatio...        1  
4  Develop and implement machine learning models....        1  
Cluster predictions for new data: [1]


In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# Step 1: Load and Prepare the Data
data = pd.read_csv('C:/Users/rsrsp/Downloads/jobs.csv')

# A row without a title or a description cannot form a (prompt, answer) pair.
data = data.dropna(subset=['Job Title', 'Job Description'])

# The model is prompted with the title only and has to produce the description,
# which is exactly how generate_description queries it after training.
data['source_text'] = "title: " + data['Job Title'].astype(str)
data['target_text'] = data['Job Description'].astype(str)
# Combined text, kept because evaluate_accuracy recovers both halves from it.
data['input_text'] = data['source_text'] + " description: " + data['target_text']

# Split the data into training and test sets
train_df, test_df = train_test_split(
    data[['input_text', 'source_text', 'target_text']],
    test_size=0.2,
    random_state=42,
)

# Convert the dataframes to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Step 2: Fine-Tune a Pre-Trained Model
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def preprocess_data(examples):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        examples: Batch mapping with the columns ``source_text`` (the
            "title: ..." prompt) and ``target_text`` (the description).

    Returns:
        The tokenized prompts with an added ``labels`` entry holding the
        tokenized descriptions. Padding positions in the labels are set to
        -100 so they are ignored by the loss.
    """
    model_inputs = tokenizer(
        examples['source_text'], max_length=512, truncation=True, padding='max_length'
    )
    # text_target replaces the deprecated as_target_tokenizer() context manager.
    targets = tokenizer(
        text_target=examples['target_text'], max_length=512, truncation=True, padding='max_length'
    )
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets['input_ids']
    ]
    return model_inputs

train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = test_dataset.map(preprocess_data, batched=True)

# Hyperparameters of the fine-tuning run, collected in one mapping.
# NOTE(review): the recorded run of this cell stopped with an ImportError
# asking for accelerate>=0.20.1 - install it before re-running.
training_config = {
    'output_dir': './results',
    'evaluation_strategy': 'epoch',
    'learning_rate': 5e-5,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'num_train_epochs': 3,
    'weight_decay': 0.01,
    'logging_dir': './logs',
}
training_args = TrainingArguments(**training_config)

# Wire the model, the settings and both dataset splits together.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)

# Run the fine-tuning loop.
trainer.train()

# Step 3: Generate Descriptions for New Job Titles
def generate_description(job_title):
    """Generate a job description for a job title with the module-level model.

    Args:
        job_title: Title to describe; it is sent to the model as
            ``"title: <job_title>"``.

    Returns:
        The decoded model output with special tokens removed.
    """
    input_text = "title: " + job_title
    encoded = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)
    # The Trainer may have moved the model to an accelerator; inputs have to
    # live on the same device or generate() fails with a device mismatch.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=512,
        num_beams=5,
        early_stopping=True,
    )
    description = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return description

new_job_title = "Software Engineer"
generated_description = generate_description(new_job_title)
print(f"Generated description for '{new_job_title}': {generated_description}")

# Step 4: Evaluate the Model
# For simplicity, using a basic accuracy metric by checking if the generated description contains relevant words from the actual description

def evaluate_accuracy(dataset, tokenizer, model, generate_fn=None):
    """Share of examples whose generated description reuses a reference word.

    This is a coarse heuristic, not a quality metric: an example counts as
    correct as soon as one whole word (case-insensitive, punctuation ignored)
    of the reference description also occurs in the generated text.

    Args:
        dataset: Iterable of mappings with an ``input_text`` entry shaped like
            ``"title: <title> description: <description>"``.
        tokenizer: Unused; kept so existing calls keep working.
        model: Unused; kept so existing calls keep working.
        generate_fn: Callable turning a job title into generated text.
            Defaults to the module-level ``generate_description``.

    Returns:
        A float between 0.0 and 1.0; 0.0 for an empty dataset.
    """
    import re

    if generate_fn is None:
        generate_fn = generate_description

    def _words(text):
        # Whole-word tokens; plain substring matching would let "a" match "data".
        return set(re.findall(r"\w+", text.lower()))

    correct = 0
    total = 0
    for example in dataset:
        # Split on the first marker only so a description that itself contains
        # "description:" is not truncated, and a missing marker does not raise.
        title_part, _, actual_description = example['input_text'].partition('description:')
        job_title = title_part.strip()
        if job_title.startswith('title:'):
            job_title = job_title[len('title:'):].strip()

        generated_description = generate_fn(job_title)

        if _words(actual_description) & _words(generated_description):
            correct += 1
        total += 1

    # An empty dataset has no accuracy to speak of; report 0.0 instead of
    # failing with a division by zero.
    return correct / total if total else 0.0

# Score the training split first, then the held-out split.
train_accuracy, test_accuracy = [
    evaluate_accuracy(split, tokenizer, model)
    for split in (train_dataset, test_dataset)
]

# Report both figures as percentages with two decimals, one per line.
print(
    f"Train Accuracy: {train_accuracy * 100:.2f}%",
    f"Test Accuracy: {test_accuracy * 100:.2f}%",
    sep="\n",
)


  torch.utils._pytree._register_pytree_node(





Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/1884 [00:00<?, ? examples/s]



Map:   0%|          | 0/471 [00:00<?, ? examples/s]

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`