In [34]:
import pandas as pd

data = {
    'gender': ['Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'SeniorCitizen': [0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
    'Partner': ['Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No'],
    'Dependents': ['No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes'],
    'tenure': [1, 34, 2, 45, 2, 8, 22, 10, 28, 62],
    'PhoneService': ['No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes'],
    'MultipleLines': ['No phone service', 'No', 'No', 'No phone service', 'No', 'Yes', 'No', 'No phone service', 'Yes', 'No'],
    'InternetService': ['DSL', 'DSL', 'Fiber optic', 'DSL', 'Fiber optic', 'Fiber optic', 'DSL', 'DSL', 'Fiber optic', 'No'],
    'OnlineSecurity': ['No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No internet service'],
    'MonthlyCharges': [29.85, 56.95, 90.05, 42.30, 70.70, 99.65, 55.20, 20.05, 104.80, 20.90],
    'TotalCharges': ['29.85', '1889.5', '180.1', '1840.75', '151.65', '820.5', '1250.75', '190.5', '3046.05', '1237.5'], # Note: string type
    'Contract': ['Month-to-month', 'One year', 'Month-to-month', 'One year', 'Month-to-month', 'Month-to-month', 'Month-to-month', 'Month-to-month', 'Two year', 'Two year'],
    'PaymentMethod': ['Electronic check', 'Mailed check', 'Electronic check', 'Bank transfer (automatic)', 'Electronic check', 'Credit card (automatic)', 'Mailed check', 'Mailed check', 'Electronic check', 'Bank transfer (automatic)'],
    'Churn': ['No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No']
}

data = pd.DataFrame(data)
data.head(10)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,MonthlyCharges,TotalCharges,Contract,PaymentMethod,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,29.85,29.85,Month-to-month,Electronic check,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,56.95,1889.5,One year,Mailed check,No
2,Male,1,No,No,2,Yes,No,Fiber optic,No,90.05,180.1,Month-to-month,Electronic check,Yes
3,Female,0,Yes,Yes,45,No,No phone service,DSL,Yes,42.3,1840.75,One year,Bank transfer (automatic),No
4,Male,0,No,Yes,2,Yes,No,Fiber optic,No,70.7,151.65,Month-to-month,Electronic check,Yes
5,Female,1,Yes,No,8,Yes,Yes,Fiber optic,No,99.65,820.5,Month-to-month,Credit card (automatic),Yes
6,Male,0,No,Yes,22,Yes,No,DSL,Yes,55.2,1250.75,Month-to-month,Mailed check,No
7,Female,0,Yes,No,10,No,No phone service,DSL,Yes,20.05,190.5,Month-to-month,Mailed check,Yes
8,Male,1,Yes,No,28,Yes,Yes,Fiber optic,No,104.8,3046.05,Two year,Electronic check,Yes
9,Female,0,No,Yes,62,Yes,No,No,No internet service,20.9,1237.5,Two year,Bank transfer (automatic),No


In [None]:
cat_columns = []
num_columns = []
for column in data.columns:
    if column != "Churn":
        if data[column].nunique() <= 4:
            cat_columns.append(column)
        else:
            num_columns.append(column)

data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")

if data["TotalCharges"].isnull().any():
    mean = data["TotalCharges"].mean()
    data["TotalCharges"] = data["TotalCharges"].fillna(mean)

data["Churn"] = data["Churn"].apply(lambda row: 1 if row == "Yes" else 0)

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_columns),
    ('cat', OneHotEncoder(handle_unknown="ignore", drop="first"), cat_columns),
], remainder="passthrough")

Y = data["Churn"]
X = data.drop("Churn", axis= 1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y)

model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))])
                                 # class_weight='balanced' can help with imbalanced target variable

model_pipeline.fit(X_train, y_train)
print("\nModel training completed.")


Model training completed.


In [38]:
y_pred = model_pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

[[2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import re

# --- 1. Data Engineering: Simulate Data and Basic Preprocessing ---

def simulate_data():
    """Simulates data for profiles, applications, preferences, and job postings."""
    user_profiles_data = {
        'UserID': [1, 2, 3, 4, 5],
        'Skills': [
            ['Python', 'Data Analysis', 'SQL', 'Pandas'],
            ['Java', 'Spring Boot', 'Microservices', 'Kubernetes'],
            ['JavaScript', 'React', 'Node.js', 'HTML', 'CSS'],
            ['Python', 'Machine Learning', 'TensorFlow', 'NLP'],
            ['Project Management', 'Agile', 'Scrum', 'Communication']
        ],
        'Experience': [
            'Data Analyst at BizCorp; Python scripting for data cleaning.',
            'Senior Java Developer at FinTech Solutions; Built scalable microservices.',
            'Frontend Developer at WebDesign LLC; Developed responsive UIs with React.',
            'AI Researcher at InnovateAI; Focused on NLP models with Python.',
            'Scrum Master at AgilePro; Led multiple project teams.'
        ],
        'LocationPref': ['New York', 'Remote', 'San Francisco', 'Remote', 'London']
    }
    df_user_profiles = pd.DataFrame(user_profiles_data)

    user_applications_data = {
        'UserID': [1, 1, 2, 3, 3, 4, 4, 4, 5],
        'JobID': [101, 102, 201, 301, 302, 101, 401, 402, 501]
    }
    df_user_applications = pd.DataFrame(user_applications_data)

    user_preferences_data = {
        'UserID': [1, 2, 3, 4, 5],
        'DesiredRole': ['Data Scientist', 'Backend Engineer', 'Frontend Developer', 'Machine Learning Engineer', 'Product Manager'],
        'DesiredIndustry': [['Technology', 'Finance'], ['Finance', 'E-commerce'], ['Web Development', 'Media'], ['AI', 'Research'], ['Technology', 'SaaS']],
        'ExperienceLevelSought': ['Mid-level', 'Senior', 'Mid-level', 'Senior', 'Lead'],
        'OpenToRemote': [True, True, False, True, False]
    }
    df_user_preferences = pd.DataFrame(user_preferences_data)

    job_postings_data = {
        'JobID': [101, 102, 201, 202, 301, 302, 401, 402, 501, 502, 601, 602],
        'Title': [
            'Data Analyst Python', 'Senior Data Analyst', 'Java Backend Developer', 'Lead Java Engineer',
            'React Frontend Developer', 'UI/UX Designer', 'Machine Learning Scientist - NLP', 'AI Engineer - Computer Vision',
            'Agile Project Lead', 'Technical Product Owner', 'Entry Level Python Developer', 'Remote Python Data Engineer'
        ],
        'Description': [
            'Analyze large datasets using Python and SQL. Experience with Pandas and NumPy required. Tableau for visualization.',
            'Seeking an experienced data analyst for leading projects. Strong SQL and Python skills. Leadership experience.',
            'Develop robust backend systems using Java and Spring Boot. Knowledge of microservices and cloud platforms.',
            'Lead a team of Java developers. Architect and implement scalable solutions. Strong experience with Java, Spring.',
            'Build modern web interfaces with React and JavaScript. Collaborate with UX designers. CSS and HTML proficiency.',
            'Design user-friendly interfaces and experiences. Proficiency in Figma, Sketch. Understanding of user research.',
            'Research and develop NLP models. Python, TensorFlow, PyTorch. Publications in NLP conferences a plus.',
            'Develop computer vision algorithms. Experience with OpenCV, PyTorch. C++ and Python.',
            'Lead agile projects, facilitate scrum ceremonies, and manage project timelines. CSM certification preferred.',
            'Define product roadmap and features for a technical product. Work closely with engineering teams.',
            'Junior role for Python developer. Focus on data processing tasks. SQL knowledge needed.',
            'Fully remote role for a data engineer. Python, Spark, Airflow, and AWS cloud services experience.'
        ],
        'RequiredSkills': [
            ['Python', 'SQL', 'Pandas', 'Tableau'], ['SQL', 'Python', 'Leadership', 'Communication'],
            ['Java', 'Spring Boot', 'Microservices', 'REST APIs'], ['Java', 'Spring', 'Architecture', 'Leadership'],
            ['React', 'JavaScript', 'HTML', 'CSS', 'Git'], ['Figma', 'Sketch', 'User Research', 'UI Design'],
            ['Python', 'TensorFlow', 'PyTorch', 'NLP'], ['Python', 'OpenCV', 'PyTorch', 'C++'],
            ['Agile', 'Scrum', 'JIRA', 'Communication'], ['Product Management', 'Agile', 'Roadmap'],
            ['Python', 'SQL', 'Data Processing'], ['Python', 'Spark', 'Airflow', 'AWS', 'Data Engineering']
        ],
        'Location': ['New York', 'New York', 'Remote', 'San Francisco', 'San Francisco', 'Remote', 'Remote', 'Boston', 'London', 'New York', 'New York', 'Remote'],
        'Industry': ['Technology', 'Finance', 'Finance', 'Technology', 'Web Development', 'Design', 'AI', 'AI', 'Technology', 'SaaS', 'Finance', 'Technology'],
        'RequiredExperienceLevel': ['2+ years', '5+ years', '3+ years', '7+ years', '2+ years', '3+ years', 'PhD or 5+ years', '3+ years', '5+ years', '4+ years', 'Entry-level', '3+ years']
    }
    df_job_postings = pd.DataFrame(job_postings_data)
    return df_user_profiles, df_user_applications, df_user_preferences, df_job_postings

df_user_profiles, df_user_applications, df_user_preferences, df_job_postings = simulate_data()
df_user_profiles = pd.merge(df_user_profiles, df_user_preferences, on='UserID', how='left')


In [2]:
df_job_postings.head(10)

Unnamed: 0,JobID,Title,Description,RequiredSkills,Location,Industry,RequiredExperienceLevel
0,101,Data Analyst Python,Analyze large datasets using Python and SQL. E...,"[Python, SQL, Pandas, Tableau]",New York,Technology,2+ years
1,102,Senior Data Analyst,Seeking an experienced data analyst for leadin...,"[SQL, Python, Leadership, Communication]",New York,Finance,5+ years
2,201,Java Backend Developer,Develop robust backend systems using Java and ...,"[Java, Spring Boot, Microservices, REST APIs]",Remote,Finance,3+ years
3,202,Lead Java Engineer,Lead a team of Java developers. Architect and ...,"[Java, Spring, Architecture, Leadership]",San Francisco,Technology,7+ years
4,301,React Frontend Developer,Build modern web interfaces with React and Jav...,"[React, JavaScript, HTML, CSS, Git]",San Francisco,Web Development,2+ years
5,302,UI/UX Designer,Design user-friendly interfaces and experience...,"[Figma, Sketch, User Research, UI Design]",Remote,Design,3+ years
6,401,Machine Learning Scientist - NLP,"Research and develop NLP models. Python, Tenso...","[Python, TensorFlow, PyTorch, NLP]",Remote,AI,PhD or 5+ years
7,402,AI Engineer - Computer Vision,Develop computer vision algorithms. Experience...,"[Python, OpenCV, PyTorch, C++]",Boston,AI,3+ years
8,501,Agile Project Lead,"Lead agile projects, facilitate scrum ceremoni...","[Agile, Scrum, JIRA, Communication]",London,Technology,5+ years
9,502,Technical Product Owner,Define product roadmap and features for a tech...,"[Product Management, Agile, Roadmap]",New York,SaaS,4+ years


In [3]:


def user_profile_to_text(row):
    skills = ', '.join(row['Skills'])
    experience = row['Experience']
    location_pref = row['LocationPref']
    desired_role = row['DesiredRole']
    desired_industry = ', '.join(row['DesiredIndustry'])
    experience_level = row['ExperienceLevelSought']
    open_to_remote = 'Open to remote' if row['OpenToRemote'] else 'Not open to remote'
    return (f"Seeking a {experience_level} {desired_role} role in the {desired_industry} industry. "
            f"Preferred location is {location_pref}. "
            f"Key skills include {skills}. "
            f"Past experience: {experience}.")

def job_posting_to_text(row):
    title = row['Title']
    description = row['Description']
    required_skills = ', '.join(row['RequiredSkills'])
    location = row['Location']
    industry = row['Industry']
    required_experience = row['RequiredExperienceLevel']
    return (f"Job Title: {row['Title']}. Industry: {row['Industry']}. Location: {row['Location']}. "
            f"Experience Required: {row['RequiredExperienceLevel']}. "
            f"Job Description: {row['Description']}. "
            f"Required Skills: {required_skills}.")

df_user_profiles['ProfileText'] = df_user_profiles.apply(user_profile_to_text, axis=1)
df_job_postings["JobText"] = df_job_postings.apply(job_posting_to_text, axis=1)
df_user_profiles.head(10)


Unnamed: 0,UserID,Skills,Experience,LocationPref,DesiredRole,DesiredIndustry,ExperienceLevelSought,OpenToRemote,ProfileText
0,1,"[Python, Data Analysis, SQL, Pandas]",Data Analyst at BizCorp; Python scripting for ...,New York,Data Scientist,"[Technology, Finance]",Mid-level,True,Seeking a Mid-level Data Scientist role in the...
1,2,"[Java, Spring Boot, Microservices, Kubernetes]",Senior Java Developer at FinTech Solutions; Bu...,Remote,Backend Engineer,"[Finance, E-commerce]",Senior,True,Seeking a Senior Backend Engineer role in the ...
2,3,"[JavaScript, React, Node.js, HTML, CSS]",Frontend Developer at WebDesign LLC; Developed...,San Francisco,Frontend Developer,"[Web Development, Media]",Mid-level,False,Seeking a Mid-level Frontend Developer role in...
3,4,"[Python, Machine Learning, TensorFlow, NLP]",AI Researcher at InnovateAI; Focused on NLP mo...,Remote,Machine Learning Engineer,"[AI, Research]",Senior,True,Seeking a Senior Machine Learning Engineer rol...
4,5,"[Project Management, Agile, Scrum, Communication]",Scrum Master at AgilePro; Led multiple project...,London,Product Manager,"[Technology, SaaS]",Lead,False,Seeking a Lead Product Manager role in the Tec...


In [11]:
# --- Create Positive pairs of user profiles and job postings ---
from sentence_transformers import SentenceTransformer, losses, util
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

def create_positive_pairs(user_profiles, job_postings):
    role_advantage = 10
    location_advantage = 5
    industry_advantage = 3
    skill_advantage = 2


    positive_pairs = []
    for _, user in user_profiles.iterrows():
        for _, job in job_postings.iterrows():
            score = 0
            if user['DesiredRole'] in job['Title']:
                score += role_advantage
            if any(skill in job['RequiredSkills'] for skill in user['Skills']):
                score += skill_advantage
            if user['LocationPref'] == job['Location']:
                score += location_advantage
            if job['Industry'] in user['DesiredIndustry']:
                score += industry_advantage
            if score > 12:
                positive_pairs.append(InputExample(texts=[user['ProfileText'], job['JobText']], label=1.0))
    return positive_pairs

train_samples = create_positive_pairs(df_user_profiles, df_job_postings)


In [12]:
BATCH_SIZE = 32
EPOCH = 10

model = SentenceTransformer('all-MiniLM-L6-v2')
train_examples = DataLoader(dataset=train_samples, shuffle=True, batch_size=BATCH_SIZE)

loss = losses.MultipleNegativesRankingLoss(model=model, )

warmup_steps = int(len(train_examples)* EPOCH * 0.1)

model.fit(train_objectives=[(train_examples, loss)],
            epochs=EPOCH,
            warmup_steps=warmup_steps,
            output_path='job_matching_model',
            use_amp=True,
            show_progress_bar=True)

                                                                     

Step,Training Loss


In [28]:
user_embedding = model.encode(df_user_profiles["ProfileText"].tolist(), convert_to_tensor=True)
job_embedding = model.encode(df_job_postings["JobText"].tolist(), convert_to_tensor=True)

print("\n--- Top Job Matches (Fine-Tuned Approach) ---")
for i in range(len(df_user_profiles)):
    user_doc = df_user_profiles.iloc[i]
    print(f"\nUser {user_doc['UserID']} (Desired Role: {user_doc['DesiredRole']}), Skills: {', '.join(user_doc['Skills'])}, Location Preference: {user_doc['LocationPref']})")
    
    # Calculate cosine similarity
    cosine_scores = util.cos_sim(user_embedding[i], job_embedding)[0]
    
    # Get the top 3 best matching jobs
    top_results = cosine_scores.topk(k=3)

    for score, idx in zip(top_results[0], top_results[1]):
        id_int = int(idx)
        job_doc = df_job_postings.iloc[id_int]
        print(f"Job ID: {job_doc['JobID']}, Title: {job_doc['Title']}, Score: {score.item():.4f}")


--- Top Job Matches (Fine-Tuned Approach) ---

User 1 (Desired Role: Data Scientist), Skills: Python, Data Analysis, SQL, Pandas, Location Preference: New York)
Job ID: 101, Title: Data Analyst Python, Score: 0.7240
Job ID: 102, Title: Senior Data Analyst, Score: 0.7059
Job ID: 601, Title: Entry Level Python Developer, Score: 0.6564

User 2 (Desired Role: Backend Engineer), Skills: Java, Spring Boot, Microservices, Kubernetes, Location Preference: Remote)
Job ID: 201, Title: Java Backend Developer, Score: 0.7484
Job ID: 202, Title: Lead Java Engineer, Score: 0.6158
Job ID: 301, Title: React Frontend Developer, Score: 0.5837

User 3 (Desired Role: Frontend Developer), Skills: JavaScript, React, Node.js, HTML, CSS, Location Preference: San Francisco)
Job ID: 301, Title: React Frontend Developer, Score: 0.7663
Job ID: 201, Title: Java Backend Developer, Score: 0.5482
Job ID: 202, Title: Lead Java Engineer, Score: 0.5251

User 4 (Desired Role: Machine Learning Engineer), Skills: Python, M

In [None]:
import torch.nn as nn
import torch.optim as optim

class jobEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(jobEncoder, self). __init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, 128)
    
    def forward(self, X)