In [None]:
!pip install streamlit





In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
import nltk
import streamlit as st

# Fetch the NLTK resources used by this script. Recent NLTK releases load the
# 'punkt_tab' tables for word_tokenize instead of the pickled 'punkt' models,
# so both are requested to keep tokenization working across versions.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)


def clean_text(text):
    """Normalize raw text to lower-case ASCII letters separated by single spaces.

    Args:
        text: The raw string to clean. Non-string values (for example ``None``
            or the float NaN pandas uses for missing cells) are treated as
            empty text.

    Returns:
        The cleaned string: lower-cased, with every character that is not an
        ASCII letter or whitespace removed, whitespace runs collapsed to one
        space, and leading/trailing whitespace stripped.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Dropping every non-letter also removes digits, so no separate digit pass
    # is needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse whitespace last: removing punctuation such as " - " would
    # otherwise leave double spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Skills the labeller looks for in each job description.
KNOWN_SKILLS = (
    'python', 'java', 'c++', 'javascript', 'sql', 'html', 'css',
    'project management', 'communication', 'teamwork', 'leadership',
    'agile', 'devops', 'data analysis', 'machine learning', 'cloud',
    'marketing', 'finance', 'sales', 'ui/ux', 'customer service',
)


def _compile_skill_pattern(skill):
    """Build a regex that matches *skill* as a whole word or phrase."""
    # Words of a multi-word skill may be separated by any whitespace run.
    phrase = r'\s+'.join(re.escape(part) for part in skill.split())
    # The lookarounds stop 'java' matching inside 'javascript' and 'sql'
    # matching inside 'mysql', while still allowing skills that end in
    # punctuation such as 'c++'.
    return re.compile(r'(?<!\w)' + phrase + r'(?!\w)')


_SKILL_PATTERNS = {skill: _compile_skill_pattern(skill) for skill in KNOWN_SKILLS}


def extract_skills(text):
    """Return the sorted list of KNOWN_SKILLS mentioned in *text*.

    Matching runs on the lower-cased text with digits removed (so 'HTML5'
    counts as 'html') but with punctuation kept, which lets multi-word skills
    ('machine learning') and punctuated ones ('c++', 'ui/ux') be detected.
    Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []
    normalized = re.sub(r'\d+', '', text.lower())
    return sorted(
        skill for skill, pattern in _SKILL_PATTERNS.items()
        if pattern.search(normalized)
    )


# Function to train and evaluate the model
def train_skill_prediction_model(data):
    """
    Trains the XGBoost skill prediction model, evaluates performance, and returns results.

    Args:
        data: DataFrame with a 'job_description' column. Two columns are added
            to it in place: 'cleaned_job_description' and 'extracted_skills'.

    Returns:
        A dict with the keys "classification_report" (text report),
        "accuracy" (exact-match accuracy over all labels of a sample),
        "hamming_loss" and "misclassification_rate" (1 - accuracy).

    Raises:
        ValueError: If none of the known skills occurs in any description, so
            there is nothing to train on.
    """
    # Missing descriptions are treated as empty text instead of crashing the
    # cleaning step.
    descriptions = data['job_description'].fillna('').astype(str)
    data['cleaned_job_description'] = descriptions.apply(clean_text)

    # Labels come from the raw description: the cleaned text has punctuation
    # stripped, which makes skills such as 'c++' or 'ui/ux' unrecognizable.
    data['extracted_skills'] = descriptions.apply(extract_skills)

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(data['extracted_skills'])
    if len(mlb.classes_) == 0:
        raise ValueError(
            "No known skills were found in the job descriptions; cannot train a model."
        )
    X = data['cleaned_job_description']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # MultiOutputClassifier fits one binary classifier per skill, so the
    # binary 'logloss' metric applies rather than the multiclass 'mlogloss'.
    xgb_model = make_pipeline(
        TfidfVectorizer(tokenizer=word_tokenize, stop_words='english', ngram_range=(1, 2)),
        MultiOutputClassifier(XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42))
    )

    xgb_model.fit(X_train, y_train)

    y_pred_xgb = xgb_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred_xgb)
    hamming = hamming_loss(y_test, y_pred_xgb)
    misclassification_rate = 1 - accuracy

    report = classification_report(y_test, y_pred_xgb, target_names=mlb.classes_)

    return {
        "classification_report": report,
        "accuracy": accuracy,
        "hamming_loss": hamming,
        "misclassification_rate": misclassification_rate
    }

def run_model():
    """Render the Streamlit page: upload a CSV, train the model, show metrics.

    Nothing beyond the title and the uploader is rendered until a file has
    been uploaded. The uploaded CSV must contain a 'job_description' column;
    otherwise an error message is shown and no training takes place.
    """
    st.title("Job Description Skill Extraction Model")

    uploaded_file = st.file_uploader("CSV file", type="csv")
    if uploaded_file is None:
        return

    # Read the file the user actually uploaded rather than a fixed path on
    # disk.
    data = pd.read_csv(uploaded_file)

    st.subheader("Uploaded Data")
    st.write(data.head())

    if 'job_description' not in data.columns:
        st.error("The uploaded CSV must contain a 'job_description' column.")
        return

    st.subheader("Model Training and Evaluation")
    model_results = train_skill_prediction_model(data)

    st.write("**Classification Report**")
    st.text(model_results["classification_report"])

    st.write(f"**Accuracy**: {model_results['accuracy']:.4f}")
    st.write(f"**Hamming Loss**: {model_results['hamming_loss']:.4f}")
    st.write(f"**Misclassification Rate**: {model_results['misclassification_rate']:.4f}")

# Entry point: build the app page when this file is executed as a script.
if __name__ == '__main__':
    run_model()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
