<a href="https://colab.research.google.com/github/satyam26en/JOB/blob/main/Untitled54.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Fetch the cleaned job-postings CSV from the project's GitHub repository
file_url = "https://raw.githubusercontent.com/satyam26en/JOB/main/Clean_Job_File.csv"
df = pd.read_csv(file_url)

# Replace missing responsibilities with empty strings before vectorizing
df['responsibilities'] = df['responsibilities'].fillna('')

# Turn each responsibilities text into a TF-IDF vector
# (English stop words removed, at most 1000 features kept)
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['responsibilities'])

# Fit K-Means on the TF-IDF matrix and record every row's cluster label
num_clusters = 10
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=10,
    max_iter=2500,
    random_state=42,
)
kmeans.fit(X)
df['Cluster'] = kmeans.labels_

# Feature names, plus per-cluster feature indices ordered from the
# largest centroid weight down to the smallest
terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Look up the highest-weighted vocabulary terms of a single cluster
def get_top_terms(cluster_num, n_terms=10):
    """Return the first ``n_terms`` terms listed for cluster ``cluster_num``.

    Reads the module-level ``order_centroids`` (feature indices per cluster)
    and ``terms`` (feature names) to translate indices into words.
    """
    leading_indices = order_centroids[cluster_num, :n_terms]
    return [terms[idx] for idx in leading_indices]

# Print every cluster's top terms so a job role can be chosen for it by hand
for cluster_id in range(num_clusters):
    cluster_terms = get_top_terms(cluster_id)
    print(f"Cluster {cluster_id} top terms: {cluster_terms}")

# Hand-picked job role for each cluster id (list position == cluster id).
# NOTE(review): in the output recorded for this cell, cluster 5's top terms are
# marketing/media/seo yet it is labelled 'Customer Service', and cluster 2's are
# agency/channel/bfsi yet it is labelled 'HR Manager' - confirm these labels
# against the printed terms.
cluster_to_job_role = dict(enumerate([
    'Software Developer',
    'Sales Executive',
    'HR Manager',
    'Operations Manager',
    'Marketing Specialist',
    'Customer Service',
    'Project Manager',
    'Customer Service',
    'Sales Executive',
    'Business Analyst',
]))

# Attach the role label that belongs to each row's cluster
df['Job Role'] = df['Cluster'].map(cluster_to_job_role)

# Provided job openings data, one row per role; the counts in each row follow
# the order of _OPENING_FIELDS.
# NOTE(review): the 'Software Developer' level counts add up to 38044, not the
# stated total of 37944 - confirm which figure is correct.
_OPENING_FIELDS = ('Total', 'Fresher', 'Junior', 'Mid-Level', 'Senior', 'Expert')
_OPENING_ROWS = (
    ('Software Developer', (37944, 1253, 4013, 8332, 19075, 5371)),
    ('Project Manager', (6528, 185, 926, 1790, 2838, 789)),
    ('Data Scientist', (2882, 0, 1, 7, 2852, 22)),
    ('Finance Manager', (11491, 245, 1085, 2088, 5762, 2311)),
    ('Business Analyst', (3275, 346, 770, 1477, 486, 196)),
    ('Sales Executive', (2916, 332, 720, 777, 806, 281)),
    ('Customer Service', (2743, 75, 366, 615, 1174, 513)),
    ('HR Manager', (2685, 165, 528, 698, 1028, 266)),
    ('Marketing Specialist', (741, 5, 7, 15, 691, 23)),
    ('Operations Manager', (1803, 0, 0, 0, 1800, 3)),
)
job_openings_data = {
    role: dict(zip(_OPENING_FIELDS, counts))
    for role, counts in _OPENING_ROWS
}

# Build the multi-line hover label shown for one job role's bar
def create_hover_text(job_role):
    """Return the hover text for ``job_role``.

    The text lists the role name, its total openings and the openings per
    experience level, joined with ``<br>`` line breaks. Counts are read from
    the module-level ``job_openings_data``; an unknown role raises ``KeyError``.
    """
    counts = job_openings_data[job_role]
    hover_lines = [
        f"Job Role: {job_role}",
        f"Total Openings: {counts['Total']}",
        f"Fresher Openings: {counts['Fresher']}",
        f"Junior Openings: {counts['Junior']}",
        f"Mid-Level Openings: {counts['Mid-Level']}",
        f"Senior Openings: {counts['Senior']}",
        f"Expert Openings: {counts['Expert']}",
    ]
    return "<br>".join(hover_lines)

# One row per job role (total openings plus its pre-rendered hover text),
# ordered from the smallest total to the largest
total_openings_by_role = pd.DataFrame(
    {
        'Job Role': list(job_openings_data),
        'Number of Openings': [counts['Total'] for counts in job_openings_data.values()],
        'Hover Text': [create_hover_text(role) for role in job_openings_data],
    }
).sort_values(by='Number of Openings')

# Bar chart of total openings per role; the hover text travels as custom data
chart_title = 'Top Job Openings Based on Responsibilities and Experience Category'
fig = px.bar(
    total_openings_by_role,
    x='Job Role',
    y='Number of Openings',
    title=chart_title,
    labels={'Job Role': 'Job Role', 'Number of Openings': 'Number of Openings'},
    color='Job Role',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    custom_data=['Hover Text'],
)

# Black, mirrored axis lines act as a border around the plotting area
axis_border = {'linecolor': 'black', 'linewidth': 2, 'mirror': True}

fig.update_layout(
    xaxis_title='Job Role',
    yaxis_title='Number of Openings',
    # Centre the title above the plot
    title={
        'text': chart_title,
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    showlegend=False,
    # Fixed y-range with a tick every 3000 openings, from 0 to 42000
    yaxis={
        'tickmode': 'array',
        'tickvals': list(range(0, 42001, 3000)),
        'range': [0, 42000],
        'title': 'Number of Openings',
        **axis_border,
    },
    plot_bgcolor='white',
    paper_bgcolor='white',
    margin={'l': 50, 'r': 50, 'b': 100, 't': 100, 'pad': 4},
    xaxis=dict(axis_border),
    width=1000,
    height=600,
)

# Hovering a bar shows only the detailed text prepared for that role
fig.update_traces(patch={'hovertemplate': '%{customdata[0]}'})

# Render the interactive chart
fig.show()


Cluster 0 top terms: ['java', 'sql', 'design', 'development', 'data', 'testing', 'javascript', 'engineering', 'net', 'communication']
Cluster 1 top terms: ['sales', 'development', 'business', 'management', 'marketing', 'insurance', 'b2b', 'generation', 'channel', 'lead']
Cluster 2 top terms: ['agency', 'channel', 'team', 'training', 'bfsi', 'tied', 'handling', 'advisors', 'branch', 'management']
Cluster 3 top terms: ['operations', 'management', 'hr', 'office', 'customer', 'service', 'banking', 'business', 'data', 'store']
Cluster 4 top terms: ['skills', 'communication', 'good', 'english', 'interpersonal', 'management', 'sales', 'presentation', 'analytical', 'office']
Cluster 5 top terms: ['marketing', 'media', 'content', 'digital', 'social', 'writing', 'seo', 'management', 'sales', 'google']
Cluster 6 top terms: ['training', 'learning', 'development', 'partnership', 'distribution', 'manager', 'management', 'soft', 'analysis', 'skills']
Cluster 7 top terms: ['customer', 'voice', 'bpo', 

In [36]:
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Fetch the cleaned job-postings CSV from the project's GitHub repository
file_url = "https://raw.githubusercontent.com/satyam26en/JOB/main/Clean_Job_File.csv"
df = pd.read_csv(file_url)

# Replace missing responsibilities with empty strings before vectorizing
df['responsibilities'] = df['responsibilities'].fillna('')

# Turn each responsibilities text into a TF-IDF vector
# (English stop words removed, at most 1000 features kept)
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['responsibilities'])

# Fit K-Means on the TF-IDF matrix and record every row's cluster label
num_clusters = 10
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=10,
    max_iter=2500,
    random_state=42,
)
kmeans.fit(X)
df['Cluster'] = kmeans.labels_

# Feature names, plus per-cluster feature indices ordered from the
# largest centroid weight down to the smallest
terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Look up the highest-weighted vocabulary terms of a single cluster
def get_top_terms(cluster_num, n_terms=10):
    """Return the first ``n_terms`` terms listed for cluster ``cluster_num``.

    Reads the module-level ``order_centroids`` (feature indices per cluster)
    and ``terms`` (feature names) to translate indices into words.
    """
    leading_indices = order_centroids[cluster_num, :n_terms]
    return [terms[idx] for idx in leading_indices]

# Print every cluster's top terms so a job role can be chosen for it by hand
for cluster_id in range(num_clusters):
    cluster_terms = get_top_terms(cluster_id)
    print(f"Cluster {cluster_id} top terms: {cluster_terms}")

# Hand-picked job role for each cluster id (list position == cluster id).
# NOTE(review): in the output recorded for this cell, cluster 5's top terms are
# marketing/media/seo yet it is labelled 'Customer Service', and cluster 2's are
# agency/channel/bfsi yet it is labelled 'HR Manager' - confirm these labels
# against the printed terms.
cluster_to_job_role = dict(enumerate([
    'Software Developer',
    'Sales Executive',
    'HR Manager',
    'Operations Manager',
    'Marketing Specialist',
    'Customer Service',
    'Project Manager',
    'Customer Service',
    'Sales Executive',
    'Business Analyst',
]))

# Attach the role label that belongs to each row's cluster
df['Job Role'] = df['Cluster'].map(cluster_to_job_role)

# Provided job openings data, one row per role; the counts in each row follow
# the order of _OPENING_FIELDS.
# NOTE(review): the 'Software Developer' level counts add up to 38044, not the
# stated total of 37944 - confirm which figure is correct.
_OPENING_FIELDS = ('Total', 'Fresher', 'Junior', 'Mid-Level', 'Senior', 'Expert')
_OPENING_ROWS = (
    ('Software Developer', (37944, 1253, 4013, 8332, 19075, 5371)),
    ('Project Manager', (6528, 185, 926, 1790, 2838, 789)),
    ('Data Scientist', (2882, 0, 1, 7, 2852, 22)),
    ('Finance Manager', (11491, 245, 1085, 2088, 5762, 2311)),
    ('Business Analyst', (3275, 346, 770, 1477, 486, 196)),
    ('Sales Executive', (2916, 332, 720, 777, 806, 281)),
    ('Customer Service', (2743, 75, 366, 615, 1174, 513)),
    ('HR Manager', (2685, 165, 528, 698, 1028, 266)),
    ('Marketing Specialist', (741, 5, 7, 15, 691, 23)),
    ('Operations Manager', (1803, 0, 0, 0, 1800, 3)),
)
job_openings_data = {
    role: dict(zip(_OPENING_FIELDS, counts))
    for role, counts in _OPENING_ROWS
}

# Build the multi-line hover label shown for one job role's bar
def create_hover_text(job_role):
    """Return the hover text for ``job_role``.

    The text lists the role name, its total openings and the openings per
    experience level, joined with ``<br>`` line breaks. Counts are read from
    the module-level ``job_openings_data``; an unknown role raises ``KeyError``.
    """
    counts = job_openings_data[job_role]
    hover_lines = [
        f"Job Role: {job_role}",
        f"Total Openings: {counts['Total']}",
        f"Fresher Openings: {counts['Fresher']}",
        f"Junior Openings: {counts['Junior']}",
        f"Mid-Level Openings: {counts['Mid-Level']}",
        f"Senior Openings: {counts['Senior']}",
        f"Expert Openings: {counts['Expert']}",
    ]
    return "<br>".join(hover_lines)

# One row per job role (total openings plus its pre-rendered hover text),
# ordered from the smallest total to the largest
total_openings_by_role = pd.DataFrame(
    {
        'Job Role': list(job_openings_data),
        'Number of Openings': [counts['Total'] for counts in job_openings_data.values()],
        'Hover Text': [create_hover_text(role) for role in job_openings_data],
    }
).sort_values(by='Number of Openings')

# Bar chart of total openings per role; the hover text travels as custom data
chart_title = 'Top Job Openings Based on Responsibilities and Experience Category'
fig = px.bar(
    total_openings_by_role,
    x='Job Role',
    y='Number of Openings',
    title=chart_title,
    labels={'Job Role': 'Job Role', 'Number of Openings': 'Number of Openings'},
    color='Job Role',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    custom_data=['Hover Text'],
)

# Black, mirrored axis lines act as a border around the plotting area
axis_border = {'linecolor': 'black', 'linewidth': 2, 'mirror': True}

fig.update_layout(
    xaxis_title='Job Role',
    yaxis_title='Number of Openings',
    # Centre the title above the plot
    title={
        'text': chart_title,
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    showlegend=False,
    # Fixed y-range with a tick every 3000 openings, from 0 to 42000
    yaxis={
        'tickmode': 'array',
        'tickvals': list(range(0, 42001, 3000)),
        'range': [0, 42000],
        'title': 'Number of Openings',
        **axis_border,
    },
    plot_bgcolor='white',
    paper_bgcolor='white',
    margin={'l': 50, 'r': 50, 'b': 100, 't': 100, 'pad': 4},
    xaxis=dict(axis_border),
    width=1000,
    height=600,
)

# Hovering a bar shows only the detailed text prepared for that role
fig.update_traces(patch={'hovertemplate': '%{customdata[0]}'})

# Add a dropdown that keeps only the roles with at least one opening in the
# selected experience category.
#
# A button's 'visible' list is applied to the figure's traces by position, so
# it has to follow the order of fig.data. The chart has one trace per job role
# (color='Job Role') built from the sorted total_openings_by_role frame, which
# is NOT the insertion order of job_openings_data; each mask is therefore
# derived from the trace names instead of from iterating the dict.
experience_levels = ['Fresher', 'Junior', 'Mid-Level', 'Senior', 'Expert']
trace_roles = [trace.name for trace in fig.data]

# 'title.text' changes only the title string, so the centred title position
# configured on the layout survives a dropdown selection.
filter_buttons = [
    {
        'label': 'All',
        'method': 'update',
        'args': [
            {'visible': [True] * len(trace_roles)},
            {'title.text': 'All Experience Categories'},
        ],
    }
]
for level in experience_levels:
    filter_buttons.append({
        'label': level,
        'method': 'update',
        'args': [
            {'visible': [job_openings_data[role][level] > 0 for role in trace_roles]},
            {'title.text': f'{level} Experience Category'},
        ],
    })

fig.update_layout(
    updatemenus=[
        {
            'buttons': filter_buttons,
            'direction': 'down',
            'showactive': True,
        }
    ]
)

# Show the interactive bar chart
fig.show()


Cluster 0 top terms: ['java', 'sql', 'design', 'development', 'data', 'testing', 'javascript', 'engineering', 'net', 'communication']
Cluster 1 top terms: ['sales', 'development', 'business', 'management', 'marketing', 'insurance', 'b2b', 'generation', 'channel', 'lead']
Cluster 2 top terms: ['agency', 'channel', 'team', 'training', 'bfsi', 'tied', 'handling', 'advisors', 'branch', 'management']
Cluster 3 top terms: ['operations', 'management', 'hr', 'office', 'customer', 'service', 'banking', 'business', 'data', 'store']
Cluster 4 top terms: ['skills', 'communication', 'good', 'english', 'interpersonal', 'management', 'sales', 'presentation', 'analytical', 'office']
Cluster 5 top terms: ['marketing', 'media', 'content', 'digital', 'social', 'writing', 'seo', 'management', 'sales', 'google']
Cluster 6 top terms: ['training', 'learning', 'development', 'partnership', 'distribution', 'manager', 'management', 'soft', 'analysis', 'skills']
Cluster 7 top terms: ['customer', 'voice', 'bpo', 