In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from wordcloud import WordCloud
from langdetect import detect
sns.set()

# Work With 'Job Description'
Explore the data and select the features to use for modeling

In [11]:
# Load the raw job postings table and preview its first row.
# NOTE(review): the path is absolute and machine-specific, and the recorded output of this
# cell is a pandas ParserError ("EOF inside string starting at row 1"). The file at this
# location may be truncated or contain an unterminated quoted field — TODO confirm before
# relying on any cell below.
jobpostDF = pd.read_csv('C:/zshahpouri/data/postings/postings.csv')
jobpostDF.head(1)

ParserError: Error tokenizing data. C error: EOF inside string starting at row 1

In [None]:
# Per-column null counts (computed but not rendered: only a cell's last expression is shown).
jobpostDF.isna().sum()
# Keep rows that have a description, and only the columns used by the rest of the notebook.
kept_columns = ['job_id', 'title', 'description']
jobpostDF = jobpostDF.dropna(subset='description')[kept_columns]
jobpostDF.head(2)

Build a Text Cleaner from Several Cleaning Methods

In [None]:
# Lookup table of English contractions (plus the abbreviations "u.s" and "e.g") and the
# expansion each one is rewritten to. Read by clean_contractions().
# NOTE(review): clean_text() lowercases its input before calling clean_contractions(), so the
# capitalised "I'..." keys can only match when clean_contractions() is called directly on
# mixed-case text.
contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
    "u.s": "america",
    "e.g": "for example",
}


# Clean contraction
def clean_contractions(text, mapping=None):
    """Expand contractions in ``text``.

    Typographic apostrophes and backticks are first normalised to a plain ``'`` so that
    they match the table keys. Every key of the table is then replaced by its expansion
    in a single pass.

    Args:
        text: Input string. Matching is case-sensitive.
        mapping: Optional ``{contraction: expansion}`` table. Defaults to the module-level
            ``contraction_mapping``.

    Returns:
        The text with every whole-token occurrence of a key replaced.
    """
    table = contraction_mapping if mapping is None else mapping

    for quote in ("’", "‘", "´", "`"):
        text = text.replace(quote, "'")
    if not table:
        return text

    # Longest keys first: otherwise "mustn't" fires inside "mustn't've" and leaves the
    # broken fragment "must not've" behind (the same happens for "how'd'y", "shan't've", ...).
    ordered_keys = sorted(table, key=len, reverse=True)
    # The \w look-arounds keep short keys such as "u.s" from rewriting the middle of an
    # unrelated token ("menu.settings"). re caches compiled patterns, so rebuilding the
    # same pattern string on every call stays cheap.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(key) for key in ordered_keys) + r")(?!\w)"
    )
    return pattern.sub(lambda match: table[match.group(0)], text)


# Remove Url Pattern
def remove_urls(text):
    """Delete ``http://`` / ``https://`` links and bare ``www.`` links (up to the next whitespace)."""
    return re.compile(r"https?://\S+|www\.\S+").sub("", text)


# Remove HTML Tag
def remove_html(text):
    """Delete every ``<...>`` span; a span ends at the first ``>`` and cannot cross a newline."""
    return re.sub("<.*?>", "", text)


# Remove special character "!"#$%&'()*+, -./:;<=>?@[\]^_`{|}~"
def remove_punctuation(text):
    """Drop every ASCII punctuation character and the en dash; nothing is inserted in their place."""
    deletion_table = str.maketrans("", "", string.punctuation + "–")
    return text.translate(deletion_table)


# Remove E-mail pattern
def remove_emails(text):
    """Delete whitespace-delimited tokens that have characters on both sides of an ``@``."""
    email_like = re.compile(r"\S+@\S+")
    return email_like.sub("", text)


# Remove New Line Code Snippet
def remove_code_snippet(text):
    """Flatten ``text`` onto one line by turning each line break into a single space.

    A space (rather than nothing) is substituted so that the last word of one line is not
    fused with the first word of the next. ``\\r\\n`` and a lone ``\\r`` are treated as line
    breaks as well.
    """
    return re.sub(r"\r\n|\r|\n", " ", text)


# Remove Emoji
def remove_emoji(string):
    """Delete every run of characters that falls inside one of the code-point ranges below."""
    code_point_ranges = (
        ("\U0001F600", "\U0001F64F"),  # emoticons
        ("\U0001F300", "\U0001F5FF"),  # symbols & pictographs
        ("\U0001F680", "\U0001F6FF"),  # transport & map symbols
        ("\U0001F1E0", "\U0001F1FF"),  # flags (iOS)
        ("\U00002702", "\U000027B0"),
        # Very wide range: everything from U+24C2 up to U+1F251 is removed.
        ("\U000024C2", "\U0001F251"),
    )
    character_class = "[" + "".join(f"{low}-{high}" for low, high in code_point_ranges) + "]+"
    return re.sub(character_class, r"", string, flags=re.UNICODE)


# Remove Non-English Text
def remove_non_english_text(text):
    """Return ``text`` unchanged when ``detect`` labels it ``"en"``; otherwise return ``""``.

    Text for which ``detect`` raises is treated as non-English.
    """
    try:
        is_english = detect(text) == "en"
    except Exception:
        # Deliberately best-effort, but narrower than a bare ``except:``, which would also
        # swallow KeyboardInterrupt / SystemExit and make a long ``.apply`` over the corpus
        # impossible to interrupt.
        is_english = False
    return text if is_english else ""


# Remove Digits
def remove_digits(text):
    """Drop every character for which ``str.isdigit()`` is true."""
    return "".join(character for character in text if not character.isdigit())


# Remove Stop Words
# Job-posting boilerplate words removed in addition to the NLTK English stop words.
_CUSTOM_STOP_WORDS = frozenset(
    [
        "job",
        "role",
        "position",
        "responsibility",
        "responsibilities",
        "duties",
        "duty",
        "requirement",
        "requirements",
        "qualification",
        "qualifications",
        "description",
        "descriptions",
        "candidate",
        "candidates",
        "applicant",
        "applicants",
        "opportunity",
        "opportunities",
        "team",
        "teams",
        "work",
        "working",
        "employee",
        "employees",
        "employer",
        "employers",
        "company",
        "companies",
        "location",
        "locations",
        "department",
        "departments",
        "report",
        "reports",
        "reporting",
        "benefit",
        "benefits",
        "compensation",
        "salary",
        "experience",
        "experienced",
        "year",
        "years",
        "gender",
        "race",
        "color",
        "sex",
        "orientation",
        "sexual",
        "religion",
        "national",
        "identify",
        "veteran",
        "nation",
        "including",
        "required",
        "disability",
        "regard",
    ]
)

# Combined stop-word set, built on first use (see _all_stop_words).
_STOP_WORD_CACHE = None


def _all_stop_words():
    """Return the NLTK English stop words merged with the custom list, built only once."""
    global _STOP_WORD_CACHE
    if _STOP_WORD_CACHE is None:
        _STOP_WORD_CACHE = frozenset(stopwords.words("english")) | _CUSTOM_STOP_WORDS
    return _STOP_WORD_CACHE


def remove_stopwords(text):
    """Remove stop words from ``text``.

    The text is split on whitespace; a token is dropped when its lowercase form is an NLTK
    English stop word or one of the custom job-posting words. Surviving tokens keep their
    original casing and are re-joined with single spaces.

    The stop-word set is cached because this function is applied to every document;
    reloading the NLTK list and scanning a Python list per token on each call is wasted work.
    """
    blocked = _all_stop_words()
    return " ".join(word for word in text.split() if word.lower() not in blocked)


# Lemmatization
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token with WordNet and re-join with single spaces."""
    to_lemma = WordNetLemmatizer().lemmatize
    return " ".join(to_lemma(token) for token in text.split())

In [None]:
def clean_text(text,remove_stop_words=True):
    """Run the full cleaning pipeline on one document.

    Steps, in order: lowercase, expand contractions, strip HTML tags, strip URLs, strip
    e-mail addresses, flatten line breaks, strip punctuation, strip emoji, blank out
    non-English text, strip digits, lemmatize, and (optionally) remove stop words.

    Args:
        text: Raw document string.
        remove_stop_words: When False the stop-word step is skipped.

    Returns:
        The cleaned string (empty when the text is not detected as English).
    """
    text = text.lower()
    text = clean_contractions(text)
    # HTML goes first: the URL pattern consumes everything up to the next whitespace, so
    # running it on '<a href="http://x">label</a>' would eat the closing of the tag.
    text = remove_html(text)
    text = remove_urls(text)
    # E-mails must be removed while "@" is still present, i.e. before punctuation is
    # stripped, and before line breaks are flattened so an address ending a line is not
    # joined to the next line's first word.
    text = remove_emails(text)
    text = remove_code_snippet(text)
    text = remove_punctuation(text)
    text = remove_emoji(text)
    text = remove_non_english_text(text)
    text = remove_digits(text)
    text = lemmatize_words(text)
    if remove_stop_words:
        text = remove_stopwords(text)
    return text

Let's Clean 'Job Description'

In [None]:
# Clean every description twice: once with stop words removed, once keeping them
# (the "_st" column feeds the before/after n-gram comparison).
raw_descriptions = jobpostDF['description'].astype(str)
jobpostDF['description_cleaned'] = raw_descriptions.apply(clean_text)
jobpostDF['description_cleaned_st'] = raw_descriptions.apply(clean_text, remove_stop_words=False)

Collect the **most common** words, which can occur in every job description, and the **rare words**, then remove both

In [None]:
# Corpus-wide token frequencies over the stop-word-free descriptions.
cnt = Counter(
    token
    for document in jobpostDF["description_cleaned"].values
    for token in document.split()
)
# The ten most frequent tokens, and every token that occurs exactly once.
freqWords = {token for token, _count in cnt.most_common(10)}
rareWords = {token for token, occurrences in cnt.items() if occurrences == 1}

def remove_freq_rare_words(text):
    """Drop tokens found in ``freqWords`` or ``rareWords``, except a few always-kept words."""
    preserved_words = {'management', 'product', 'project'}
    kept = []
    for token in str(text).split():
        # A preserved word always survives; any other token survives only when it is
        # neither among the most frequent nor among the single-occurrence tokens.
        if token in preserved_words or not (token in freqWords or token in rareWords):
            kept.append(token)
    return " ".join(kept)
jobpostDF['description_cleaned'] = jobpostDF['description_cleaned'].apply(remove_freq_rare_words)

Count the length of each 'Job Description' and remove texts of length 0, 1, or 2, because such short texts carry no meaningful content and may be empty

In [None]:
# Word counts of the raw and of the cleaned description.
for length_column, text_column in (
    ('original_length', 'description'),
    ('cleaned_length', 'description_cleaned'),
):
    jobpostDF[length_column] = jobpostDF[text_column].str.split().apply(len)
# Drop descriptions whose cleaned text has 0, 1 or 2 words.
jobpostDF = jobpostDF[~jobpostDF['cleaned_length'].isin([0,1,2])]

# Visualization of Job Description Text

##### Distribution of Text Length for Job Description

In [None]:
def _attach_describe_table(axis, summary, bbox):
    """Draw ``summary`` (a one-column describe() frame) as a table inside ``axis``."""
    table = axis.table(
        cellText=summary.values,
        rowLabels=summary.index,
        bbox=bbox,
        colLabels=summary.columns,
        zorder=2,
    )
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    for cell in table.get_celld().values():
        cell.set_text_props(ha='center', va='center')
        cell.set_height(0.2)
    return table


fig = plt.figure(figsize=(16,7))

# Left panel: raw descriptions; right panel: cleaned descriptions.
ax1 = fig.add_subplot(121)
sns.histplot(jobpostDF['original_length'], ax=ax1, color='blue', bins=30, zorder=1)
ax1.set_title('Original Descriptions')

ax2 = fig.add_subplot(122)
sns.histplot(jobpostDF['cleaned_length'], ax=ax2, color='green', bins=30, zorder=1)
ax2.set_title('Cleaned Descriptions')

# Summary statistics rendered as a table in the upper-right area of each panel.
describe_original = jobpostDF.original_length.describe().to_frame().round(2)
bbox_original = [0.65, 0.55, 0.3, 0.4]
table_original = _attach_describe_table(ax1, describe_original, bbox_original)

describe_cleaned = jobpostDF.cleaned_length.describe().to_frame().round(2)
bbox_cleaned = [0.65, 0.55, 0.3, 0.4]
table_cleaned = _attach_describe_table(ax2, describe_cleaned, bbox_cleaned)

fig.suptitle('Distribution of Text Length for Job Description: Before vs. After Cleaning', fontsize=16)
plt.tight_layout()
plt.show()


##### Word Cloud Before and After the Cleaning Process

In [None]:
# One long string per corpus: raw descriptions vs. cleaned descriptions.
unclean = ' '.join(jobpostDF['description'])
clean = ' '.join(jobpostDF['description_cleaned'])

# Generate word clouds (identical settings for both so they are comparable)
cloud_options = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud_unclean = WordCloud(**cloud_options).generate(unclean)
wordcloud_clean = WordCloud(**cloud_options).generate(clean)

plt.figure(figsize=(24, 9))
cloud_panels = [(wordcloud_unclean, "Unclean Text"), (wordcloud_clean, "Cleaned Text")]
for position, (cloud, heading) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation="bilinear")
    plt.title(heading, fontsize = 20)
    plt.axis('off')

plt.tight_layout()
plt.show()

##### Top 10 Words in Job Descriptions

In [None]:
# Recount tokens now that the most frequent and single-occurrence words have been removed.
cnt = Counter(
    token
    for document in jobpostDF['description_cleaned'].values
    for token in document.split()
)

mostCommon = cnt.most_common(10)

# Split the (word, count) pairs into the two parallel lists the bar plot expects.
words = [token for token, _count in mostCommon]
freq = [occurrences for _token, occurrences in mostCommon]

sns.barplot(x=freq, y=words)
plt.title('Top 10 Most Frequently Occuring Words')
plt.show()

##### Plot Unigrams, Bigrams and Trigrams in 'Job Description' before and after removing 'Stop Words'

In [None]:
def get_top_ngrams(corpus, ngram_range, n=None):
    """Return the ``n`` most frequent n-grams of ``corpus`` as a ``Word`` / ``Freq`` DataFrame.

    Args:
        corpus: Iterable of documents passed to ``CountVectorizer``.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
        n: Number of rows to keep; ``None`` keeps every n-gram.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    term_matrix = vectorizer.fit_transform(corpus)

    # Column sums give the corpus-wide count of each n-gram.
    totals = term_matrix.sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:n], columns=['Word', 'Freq'])
# Top-20 1-, 2- and 3-grams of the descriptions with stop words removed
unigrams, bigrams, trigrams = [
    get_top_ngrams(jobpostDF['description_cleaned'], (size, size), 20) for size in (1, 2, 3)
]
# Same, for the descriptions that still contain stop words
unigrams_st, bigrams_st, trigrams_st = [
    get_top_ngrams(jobpostDF['description_cleaned_st'], (size, size), 20) for size in (1, 2, 3)
]

N-Gram before remove Stop Words

In [None]:
plt.figure(figsize=(24, 12))

# One panel per n-gram size, left to right.
panels_with_stopwords = [
    (unigrams_st, 'Top 20 Unigrams before removing stopwords'),
    (bigrams_st, 'Top 20 Bigrams before removing stopwords'),
    (trigrams_st, 'Top 20 Trigrams before removing stopwords'),
]
for position, (frame, heading) in enumerate(panels_with_stopwords, start=1):
    plt.subplot(1,3,position)
    sns.barplot(x='Freq', y='Word', data=frame)
    plt.title(heading, size=15)

plt.tight_layout()
plt.show()

N-Grams after removing Stop Words

In [None]:
plt.figure(figsize=(24, 12))

# One panel per n-gram size, left to right.
panels_without_stopwords = [
    (unigrams, 'Top 20 Unigrams after removing stopwords'),
    (bigrams, 'Top 20 Bigrams after removing stopwords'),
    (trigrams, 'Top 20 Trigrams after removing stopwords'),
]
for position, (frame, heading) in enumerate(panels_without_stopwords, start=1):
    plt.subplot(1,3,position)
    sns.barplot(x='Freq', y='Word', data=frame)
    plt.title(heading, size=15)

plt.tight_layout()
plt.show()

N-Gram Count in Job Description

In [None]:
# Count how many distinct word 1-, 2- and 3-grams the cleaned descriptions contain.
#
# The documents are plain strings, so the tokenizer has to split them into words. An
# identity tokenizer (``lambda doc: doc``) hands the string itself back, and the vectorizer
# then iterates over it character by character, counting characters and character n-grams.
# NOTE: ``bigrams`` / ``trigrams`` are rebound here from the top-20 DataFrames built earlier
# to vocabulary arrays; re-run the earlier cell before re-plotting those DataFrames.
bow_converter = CountVectorizer(tokenizer=str.split, lowercase=False, token_pattern=None)
x = bow_converter.fit_transform(jobpostDF['description_cleaned'])
words = bow_converter.get_feature_names_out()

bigram_converter = CountVectorizer(tokenizer=str.split, ngram_range=(2,2), lowercase=False, token_pattern=None)
x2 = bigram_converter.fit_transform(jobpostDF['description_cleaned'])
bigrams = bigram_converter.get_feature_names_out()

trigram_converter = CountVectorizer(tokenizer=str.split, ngram_range=(3,3), lowercase=False, token_pattern=None)
x3 = trigram_converter.fit_transform(jobpostDF['description_cleaned'])
trigrams = trigram_converter.get_feature_names_out()

sns.set_style("white")
counts = [len(words), len(bigrams), len(trigrams)]
plt.plot(counts, color='blue')
plt.plot(counts, 'bo')
plt.ticklabel_format(style = 'plain')
plt.xticks(range(3), ['unigram', 'bigram', 'trigram'])
plt.tick_params(labelsize=14)
plt.title('Number of ngrams in Job Description', {'fontsize':16})
plt.show()


# Work with Job Skill Data

In [None]:
# Load the job -> skill table (path is relative to the notebook's working directory)
# and preview its first rows.
jobskillsDF = pd.read_csv('job_skills.csv')
jobskillsDF.head()

In [None]:
# Per-column null counts (computed but not rendered: only a cell's last expression is shown).
jobskillsDF.isna().sum()
# Keep only the rows that carry a skill code.
jobskillsDF = jobskillsDF[jobskillsDF['skill_abr'].notna()]
jobskillsDF.shape

Take a Closer Look for How Many 'Job Skill' In This Dataset

In [None]:
plt.figure(figsize=(10, 6))
ax = sns.countplot(x=jobskillsDF['skill_abr'], width=0.6)

# Give every bar its own colour and write its count just above it.
palette = sns.color_palette("deep", len(ax.patches))
for bar, color in zip(ax.patches, palette):
    bar.set_color(color)
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_top)}', (bar_center, bar_top),
                ha='center', va='center', fontsize=8, color='black', xytext=(0, 5),
                textcoords='offset points')

# Dataset totals shown in the top-right corner of the axes.
unique_skills = jobskillsDF['skill_abr'].nunique()
unique_jobs = jobskillsDF['job_id'].nunique()
corner_notes = [(0.95, f'Unique Skills : {unique_skills}'), (0.85, f'Jobs : {unique_jobs}')]
for y_position, note in corner_notes:
    ax.text(0.95, y_position, note, transform=ax.transAxes,
            verticalalignment='top', horizontalalignment='right', fontsize=20, color='Black')

plt.title('Count of Job Skills', fontsize=16)
plt.xticks(rotation=45, fontsize=8)
ax.set_xlabel("Skills")
plt.tight_layout()
plt.show()

Regroup related skills into only 10 skill groups + 1 'Other' group

In [None]:
# Group the raw skill codes into 10 broader groups. Codes that are not listed are left
# unchanged by ``replace``. Group labels in the comments are the notebook author's.
skill_groups = {
    'ADM': ['ADM', 'CNSL', 'HR', 'LGL', 'MGMT', 'PRJM'],  # 1. Administration
    'FIN': ['ACCT', 'CUST', 'DIST', 'FIN', 'PRCH', 'SALE', 'STRA', 'SUPL', 'BD', 'GENB'],  # 2. Business and Finance
    'DSGN': ['ART', 'DSGN', 'WRT'],  # 3. Creative and Design
    'EDU': ['EDU', 'TRNG'],  # 4. Education
    'ENG': ['ENG', 'IT', 'MNFC'],  # 5. Engineering
    'HCPR': ['HCPR'],  # 6. Healthcare
    'MRKT': ['ADVR', 'MRKT', 'PR'],  # 7. Marketing and Advertising
    'PRDM': ['PRDM'],  # 8. Product Development
    'RSCH': ['ANLS', 'SCI', 'RSCH', 'QA'],  # 9. Research and Science
    # 10. labelled "Project Management" by the author.
    # NOTE(review): PRJM is grouped under ADM above; confirm what PROD denotes.
    'PROD': ['PROD'],
}
# Invert to the flat {raw code: group code} form used for the replacement.
skill_mapping = {code: group for group, codes in skill_groups.items() for code in codes}
jobskillsDF['skill_abr_regroup'] = jobskillsDF['skill_abr'].replace(skill_mapping)

In [None]:
# A job can list several raw skills that collapse into the same group after regrouping;
# keep only the first row of every (job, group) pair.
dedupe_keys = ['job_id', 'skill_abr_regroup']
jobskillsDF = jobskillsDF[~jobskillsDF.duplicated(subset=dedupe_keys)]
print(jobskillsDF.shape)

In [None]:
#Plot Skills after Grouping
plt.figure(figsize=(10, 6))
ax = sns.countplot(x=jobskillsDF['skill_abr_regroup'], width=0.6)

# One colour per bar.
palette = sns.color_palette("deep", len(ax.patches))
for bar, color in zip(ax.patches, palette):
    bar.set_color(color)

# Write each bar's count just above it.
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=8, color='black', xytext=(0, 5),
                textcoords='offset points')

unique_skills_regroup = jobskillsDF['skill_abr_regroup'].nunique()
ax.text(0.95, 0.95, f'Unique Skills : {unique_skills_regroup}', transform=ax.transAxes, 
        verticalalignment='top', horizontalalignment='right', fontsize=20, color='Black')

# Report the job count computed from the regrouped frame in this cell, not the
# ``unique_jobs`` value left over from the earlier plotting cell.
unique_jobs_regroup = jobskillsDF['job_id'].nunique()
ax.text(0.95, 0.85, f'Jobs : {unique_jobs_regroup}', transform=ax.transAxes, 
        verticalalignment='top', horizontalalignment='right', fontsize=20, color='Black')

plt.title('Count of Job Skills', fontsize=16)
plt.xticks(rotation=45, fontsize=8)
ax.set_xlabel("Skills")
plt.tight_layout()
plt.show()

# Job Description VS Job Skills

- Create Multi-Label for Job Skill (1 Job : Multi Skill)

In [None]:
# Attach the regrouped skills to the cleaned descriptions (inner join on job_id), then
# collapse to one row per job with its skills joined by commas.
multiskillDF = (
    pd.merge(jobpostDF, jobskillsDF, on='job_id', how='inner')
    .groupby('job_id')
    .agg({'title': 'first', 'description_cleaned': 'first', 'skill_abr_regroup': ','.join})
    .reset_index()
)
# Split the joined string once: its length is the skill count, the list is the label set.
skills_per_job = multiskillDF['skill_abr_regroup'].str.split(',')
multiskillDF['skill_count'] = skills_per_job.apply(len)
multiskillDF['skill_abr_regroup'] = skills_per_job.tolist()
multiskillDF.head()

In [None]:
plt.figure(figsize=(10, 6))
# Fix the bar order explicitly so the tick labels below can be derived from the actual
# skill counts. Labelling by "tick position + 1" is only right when the observed counts
# happen to be exactly 1, 2, 3, ... with no gaps.
skill_count_order = sorted(multiskillDF['skill_count'].unique())
ax = sns.countplot(x=multiskillDF['skill_count'], order=skill_count_order, width=0.6)

# One colour per bar.
palette = sns.color_palette("pastel", len(ax.patches))
for bar, color in zip(ax.patches, palette):
    bar.set_color(color)

# Write each bar's count just above it.
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=14, color='black', xytext=(0, 5),
                textcoords='offset points')

labels = [f"{int(count)} Skills" for count in skill_count_order]
# Pin the tick positions before relabelling them (avoids matplotlib's warning about
# setting labels on unfixed ticks).
ax.set_xticks(range(len(skill_count_order)))
ax.set_xticklabels(labels)

plt.title('Count of Job Skills', fontsize=16)
plt.xticks(rotation=0, fontsize=12)
ax.set_xlabel("Number of Skills required")
plt.tight_layout()
plt.show()

- Create a Single-Label Job Skill DataFrame to Measure the TF-IDF Score of Each Job Skill

In [None]:
# Jobs with exactly one skill group; unwrap the one-element list into a plain string label.
has_single_skill = multiskillDF['skill_count'] == 1
oneskillDF = multiskillDF.loc[has_single_skill].copy()
oneskillDF['skill_abr_regroup'] = oneskillDF['skill_abr_regroup'].str[0]
print(oneskillDF.shape)

In [None]:
plt.figure(figsize=(10, 6))
ax = sns.countplot(x=oneskillDF['skill_abr_regroup'], width=0.6)

# Give every bar its own colour and write its count just above it.
palette = sns.color_palette("deep", len(ax.patches))
for bar, color in zip(ax.patches, palette):
    bar.set_color(color)
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_top)}', (bar_center, bar_top),
                ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                textcoords='offset points')

# Totals for the single-skill subset, shown in the top-right corner of the axes.
nu_skill = oneskillDF['skill_abr_regroup'].nunique()
jobs = oneskillDF['job_id'].nunique()
for y_position, note in [(0.95, f'Unique Skills : {nu_skill}'), (0.85, f'Jobs : {jobs}')]:
    ax.text(0.95, y_position, note, transform=ax.transAxes,
            verticalalignment='top', horizontalalignment='right', fontsize=20, color='Black')

plt.title('Count of Job Skills', fontsize=16)
plt.xticks(rotation=45, fontsize=8)
ax.set_xlabel("Skills")
plt.tight_layout()
plt.show()

### TF-IDF Score of Job Description and Skills

In [None]:
# TF-IDF over word 1- to 3-grams; terms must appear in at least 5 documents and in at most
# half of them. Term frequency is log-scaled (sublinear_tf) and idf is smoothed.
tfidf_settings = dict(
    analyzer='word',
    strip_accents='unicode',
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.5,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf_vectorizor = TfidfVectorizer(**tfidf_settings)
def top_tfidf_feats(row, features, top_n=20):
    """Return the ``top_n`` highest-scoring features of one score vector.

    Args:
        row: 1-D array of scores, aligned with ``features``.
        features: Sequence of feature names indexable by integer position.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``feature`` and ``tfidf``, sorted by descending score.
        When ``row`` is empty or ``top_n`` is 0 the frame is empty but still has both columns.
    """
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns at construction: assigning ``df.columns`` afterwards raises a
    # length-mismatch ValueError when there are no rows (the frame then has no columns).
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=20):
    """Return the ``top_n`` highest-scoring features of document ``row_id`` in sparse matrix ``Xtr``."""
    dense_row = Xtr[row_id].toarray()
    return top_tfidf_feats(np.squeeze(dense_row), features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=10):
    """Return the ``top_n`` features with the highest mean score over a set of rows.

    Rows are selected with ``grp_ids`` when it is truthy, otherwise all rows are used.
    Scores below ``min_tfidf`` are zeroed before averaging.
    """
    selected = Xtr[grp_ids] if grp_ids else Xtr
    dense = selected.toarray()

    dense[dense < min_tfidf] = 0
    return top_tfidf_feats(np.mean(dense, axis=0), features, top_n)

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=16):
    """Return one top-features DataFrame per distinct value of ``y``.

    Each frame comes from ``top_mean_feats`` over the rows where ``y`` equals the label,
    and carries that label as a ``label`` attribute set on the DataFrame object.
    """
    per_class_frames = []
    for label in np.unique(y):
        member_rows = np.where(y==label)
        class_frame = top_mean_feats(Xtr, features, member_rows, min_tfidf=min_tfidf, top_n=top_n)
        class_frame.label = label
        per_class_frames.append(class_frame)
    return per_class_frames

def plot_tfidf_classfeats_h(dfs, num_class=9):
    """Draw one horizontal bar chart of mean TF-IDF scores per frame in ``dfs``, stacked vertically.

    Each frame needs ``feature`` and ``tfidf`` columns and a ``label`` attribute.
    ``num_class`` is ignored: the number of panels is always ``len(dfs)``.
    """
    num_class = len(dfs)
    fig = plt.figure(figsize=(12, num_class*10), facecolor="w")
    bar_positions = np.arange(len(dfs[0]))
    for panel_index, class_frame in enumerate(dfs, start=1):
        ax = fig.add_subplot(num_class, 1, panel_index)
        for side in ("top", "right"):
            ax.spines[side].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=16)
        ax.set_ylabel("Word", labelpad=16, fontsize=16)
        ax.set_title(str(class_frame.label) + ' Skill', fontsize=25)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(bar_positions, class_frame.tfidf, align='center')
        ax.set_yticks(bar_positions)
        ax.set_ylim([-1, bar_positions[-1]+1])
        # Highest score at the top of each panel.
        ax.invert_yaxis()
        ax.set_yticklabels(class_frame.feature)

        for tick in ax.yaxis.get_major_ticks():
            tick.label1.set_fontsize(20)
    # Same margins for the whole figure; applying them once after the loop gives the
    # same final layout as re-applying them per panel.
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()

# Fit the vectorizer on the single-skill descriptions and score the same documents.
class_Xtr = tfidf_vectorizor.fit_transform(oneskillDF['description_cleaned'])
class_features = tfidf_vectorizor.get_feature_names_out()
class_y = oneskillDF['skill_abr_regroup']

# Top features per skill group, one bar chart each.
class_top_dfs = top_feats_by_class(class_Xtr, class_y, class_features)
plot_tfidf_classfeats_h(class_top_dfs, 10)


# Train and Test data set and applying N-gram

In [None]:
# Label encoder and feature extractors shared by the train/test cells below:
# a one-hot encoder for the skill lists, unigram+bigram counts, and an L2-normalised
# TF-IDF re-weighting with log-scaled term frequency.
mlb = MultiLabelBinarizer()
count_vect = CountVectorizer(ngram_range=(1, 2))
transformer = TfidfTransformer(sublinear_tf=True, norm='l2')

In [None]:
# 80/20 split of cleaned descriptions (inputs) and skill-group lists (labels).
x_train, x_test, y_train, y_test = train_test_split(
    multiskillDF['description_cleaned'],
    multiskillDF['skill_abr_regroup'],
    test_size=0.20,
    random_state=40,
)

# Encoders are fitted on the training split only and then applied to the test split.
y_train, y_test = mlb.fit_transform(y_train), mlb.transform(y_test)

x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)

x_train_tfidf = transformer.fit_transform(x_train_counts)
x_test_tfidf = transformer.transform(x_test_counts)

print (x_train_tfidf.shape,x_test_tfidf.shape, y_train.shape, y_test.shape)



## Building and evaluating a model

Logistic Regression

In [None]:
def _fit_and_report_logreg(train_features, test_features, headline):
    """Fit a per-label logistic regression, print accuracy and the class report, return (model, predictions)."""
    model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    print(headline.format(accuracy_score(y_test, predictions)*100))
    print(classification_report(y_test, predictions, target_names=list(mlb.classes_)))
    return model, predictions


#Count Vectorization
lr, y_pred1 = _fit_and_report_logreg(
    x_train_counts,
    x_test_counts,
    "Accuracy for Logistic Regression with Count Vectorization (Bag of Word): {:.2f}%",
)

#TF-IDF
lr_tfidf, y_pred2 = _fit_and_report_logreg(
    x_train_tfidf,
    x_test_tfidf,
    "Accuracy for Logistic Regression with TF-IDF: {:.2f}%",
)

Naive Bayes(Multinomial)

In [None]:
def _fit_and_report_multinomial_nb(train_features, test_features, headline):
    """Fit a per-label multinomial Naive Bayes, print accuracy and the class report, return (model, predictions)."""
    model = MultiOutputClassifier(MultinomialNB())
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    print(headline.format(accuracy_score(y_test, predictions)*100))
    print(classification_report(y_test, predictions, target_names=list(mlb.classes_)))
    return model, predictions


#Count Vectorization
mnb, y_pred3 = _fit_and_report_multinomial_nb(
    x_train_counts,
    x_test_counts,
    "Accuracy for Navie Bayes with Count Vectorization (Bag of Word): {:.2f}%",
)

#TF-IDF
mnb_tfidf, y_pred4 = _fit_and_report_multinomial_nb(
    x_train_tfidf,
    x_test_tfidf,
    "Accuracy for Navie Bayes with TF-IDF: {:.2f}%",
)


Tuning the Hyperparameter for ***Naive Bayes with the TF-IDF method***

In [None]:
# 5-fold grid search over the smoothing parameter of the per-label Naive Bayes models.
alpha = [0.1,0.3,0.5]
paramgrid = {'estimator__alpha':alpha}
mnb_model = MultiOutputClassifier(MultinomialNB())
gsearch_cv = GridSearchCV(mnb_model, param_grid=paramgrid, cv=5).fit(x_train_tfidf, y_train)

best_alpha = gsearch_cv.best_params_['estimator__alpha']
print(f"Best alpha: {best_alpha}")

# Mean cross-validated score for each candidate alpha, in grid order.
mean_test_scores = gsearch_cv.cv_results_['mean_test_score']
plt.plot(alpha, mean_test_scores, marker='o')
plt.gca().set(
    xlabel='Alpha',
    ylabel='Mean Test Score (Accuracy)',
    title='Alpha vs. Mean Test Score',
)
plt.grid(True)
plt.show()

After Tuning HyperParameter

In [None]:
def LR_classify(X_tr, y_tr, X_test, y_test, description, multilabel=True):
    """Fit a per-label logistic regression, plot its evaluation and return the fitted model.

    Two figures are shown: a heatmap of the classification report (accuracy in the title)
    and the confusion matrices: one 2x2 matrix per label when ``multilabel`` is True,
    otherwise a single row-normalised matrix over the argmax class of each row.

    Args:
        X_tr, y_tr: Training features and binary label-indicator matrix.
        X_test, y_test: Test features and binary label-indicator matrix (2-D array).
        description: Text inserted into the figure titles.
        multilabel: Selects the confusion-matrix layout (and ``max_iter`` 1000 vs. 500).

    Returns:
        The fitted ``MultiOutputClassifier``.

    Label names are read from the module-level ``mlb``.
    """
    if multilabel:
        model = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(X_tr, y_tr)
        labeltype = 'Multi-Label'
    else:
        model = MultiOutputClassifier(LogisticRegression(max_iter=500)).fit(X_tr, y_tr)
        labeltype = 'Single-Label'
    y_pred = model.predict(X_test)

    #Classification Report
    clf_report = classification_report(y_test, y_pred, target_names=list(mlb.classes_), output_dict=True, zero_division=1)
    plt.figure(figsize=(10, 6))
    # iloc[:-1] drops the last report row before plotting.
    sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)
    plt.title("{} Logistic Regression Classification ({}) Accuracy Rate: {:.2f}%".format(labeltype,description, accuracy_score(y_test, y_pred)*100))
    plt.show()

    n_labels = y_test.shape[1]
    #Confusesion Matrix (MultiLabel)
    if multilabel:
        # Keep the 3x4 grid for up to 12 labels; add rows instead of silently dropping
        # labels when there are more.
        n_cols = 4
        n_rows = max(3, -(-n_labels // n_cols))
        fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 4 * n_rows))
        fig.suptitle('{} Logistic Regression Classification ({}) Confusion Matrix'.format(labeltype,description), fontsize=16)

        for idx, ax in enumerate(axes.ravel()):
            if idx < n_labels:
                cm = confusion_matrix(y_test[:, idx], y_pred[:, idx])
                disp = ConfusionMatrixDisplay(confusion_matrix=cm)
                disp.plot(cmap=plt.cm.Blues, ax=ax)
                ax.set_title(f'Label: {mlb.classes_[idx]}')
            else:
                ax.axis('off')
        plt.tight_layout()
        plt.subplots_adjust(top=0.90)
        plt.show()
    #Confusesion Matrix (Single-Label)
    else:
        y_test_labels = y_test.argmax(axis=1)
        y_pred_labels = y_pred.argmax(axis=1)

        # Pass the full label set so the matrix always has one row/column per class, even
        # for classes absent from this test split; otherwise its shape can disagree with
        # ``display_labels``.
        cm = confusion_matrix(y_test_labels, y_pred_labels, labels=np.arange(n_labels))
        # Row-normalise; rows with no true samples stay 0 instead of becoming NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_normalized = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

        # Draw into an explicit axes: without ``ax=`` the display opens its own figure and
        # the 15x10 one created here would be shown empty.
        fig, ax = plt.subplots(figsize=(15, 10))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=mlb.classes_)
        disp.plot(cmap=plt.cm.Blues, values_format=".2f", ax=ax)
        ax.set_title('{} Logistic Regression Classification ({}) Confusion Matrix'.format(labeltype,description))

        ax.tick_params(axis='both', labelsize=8)
        # The colorbar is the last axes added to the figure.
        fig.axes[-1].tick_params(labelsize=8)

        for cell_text in disp.text_.ravel():
            cell_text.set_fontsize(9)
            cell_text.set_color('black')

        plt.show()

    return model

In [None]:
def NB_classify(X_tr, y_tr, X_test, y_test, description, multilabel=True):
    """Fit a per-label Multinomial Naive Bayes model and plot its evaluation.

    A ``MultiOutputClassifier(MultinomialNB(alpha=0.1))`` is trained on
    ``(X_tr, y_tr)`` and used to predict ``X_test``. Two figures are shown:
    a heatmap of the classification report, and a confusion-matrix view whose
    form depends on ``multilabel``.

    The function reads the notebook-level ``mlb`` for class names, so ``mlb``
    must already be fitted on the label set that produced ``y_tr`` / ``y_test``.

    Parameters
    ----------
    X_tr, X_test
        Feature matrices handed unchanged to ``fit`` / ``predict``.
    y_tr, y_test
        2-D binary indicator arrays with one column per entry of
        ``mlb.classes_`` (``y_test`` is sliced with ``[:, idx]``).
    description : str
        Name of the feature representation; only used in plot titles.
    multilabel : bool, default True
        ``True``  -> one 2x2 confusion matrix per label column.
        ``False`` -> a single row-normalised matrix over the argmax class of
        each row. The trained estimator is the same in both modes.

    Returns
    -------
    The fitted ``MultiOutputClassifier``.
    """
    labeltype = 'Multi-Label' if multilabel else 'Single-Label'
    model = MultiOutputClassifier(MultinomialNB(alpha=0.1)).fit(X_tr, y_tr)

    y_pred = model.predict(X_test)
    class_names = list(mlb.classes_)

    # Classification report
    clf_report = classification_report(y_test, y_pred, target_names=class_names, output_dict=True, zero_division=1)
    fig, ax = plt.subplots(figsize=(10, 6))
    # iloc[:-1] drops the last report row ("support") so only score rows are drawn.
    sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True, ax=ax)
    ax.set_title("{} NaiveBayes Classification ({}) Accuracy Rate: {:.2f}%".format(labeltype,description, accuracy_score(y_test, y_pred)*100))
    plt.show()

    if multilabel:
        # Confusion matrix (multi-label): one 2x2 matrix per label column.
        n_labels = y_test.shape[1]
        n_cols = 4
        # Keep the original 3x4 layout for up to 12 labels, and add rows beyond
        # that so no label is silently left out of the figure.
        n_rows = max(3, int(np.ceil(n_labels / n_cols)))
        fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 4 * n_rows))
        fig.suptitle('{} NaiveBayes Classification ({}) Confusion Matrix'.format(labeltype,description), fontsize=16)

        for idx, ax in enumerate(axes.ravel()):
            if idx < n_labels:
                # labels=[0, 1] forces a 2x2 matrix even when a label never
                # occurs in either the truth or the predictions.
                cm = confusion_matrix(y_test[:, idx], y_pred[:, idx], labels=[0, 1])
                ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap=plt.cm.Blues, ax=ax)
                ax.set_title(f'Label: {mlb.classes_[idx]}')
            else:
                ax.axis('off')
        plt.tight_layout()
        plt.subplots_adjust(top=0.90)
        plt.show()
    else:
        # Confusion matrix (single-label): collapse each indicator row to the
        # index of its largest entry (an all-zero prediction row maps to 0).
        y_test_labels = y_test.argmax(axis=1)
        y_pred_labels = y_pred.argmax(axis=1)

        # Fixing `labels` keeps the matrix aligned with mlb.classes_ when a
        # class is missing from the test split; normalize='true' divides each
        # row by its total and yields 0 (not NaN) for empty rows.
        cm_normalized = confusion_matrix(
            y_test_labels,
            y_pred_labels,
            labels=np.arange(len(class_names)),
            normalize='true',
        )

        # Pass the axes explicitly: without `ax`, ConfusionMatrixDisplay.plot
        # opens its own figure and the sized one would be left empty.
        fig, ax = plt.subplots(figsize=(15, 10))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=mlb.classes_)
        disp.plot(cmap=plt.cm.Blues, values_format=".2f", ax=ax)
        ax.set_title('{} NaiveBayes Classification ({}) Confusion Matrix'.format(labeltype,description))

        ax.tick_params(axis='both', labelsize=8)
        disp.im_.colorbar.ax.tick_params(labelsize=8)

        for cell_text in disp.text_.ravel():
            cell_text.set_fontsize(9)
            cell_text.set_color('black')

        plt.show()

    return model

### Multi Label - Skill Classification

In [None]:
# Logistic Regression on both feature representations.
# No `multilabel` argument is passed, so LR_classify runs in its default mode.
multi_LR_model_bow = LR_classify(
    x_train_counts, y_train, x_test_counts, y_test, 'Bag Of Words'
)
multi_LR_model_tfidf = LR_classify(
    x_train_tfidf, y_train, x_test_tfidf, y_test, 'TF-IDF'
)

In [None]:
# Naive Bayes on both feature representations (multilabel defaults to True).
multi_NB_model_bow = NB_classify(
    x_train_counts, y_train, x_test_counts, y_test, 'Bag Of Words'
)
multi_NB_model_tfidf = NB_classify(
    x_train_tfidf, y_train, x_test_tfidf, y_test, 'TF-IDF'
)

### Multi Skill Classification (One Label)

In [None]:
# Wrap each skill label in a one-element list (the list form is what the
# MultiLabelBinarizer calls further down are fed).
# The isinstance guard makes the cell idempotent: re-running it would otherwise
# nest the labels one level deeper each time ([['X']], [[['X']]], ...).
oneskillDF['skill_abr_regroup'] = oneskillDF['skill_abr_regroup'].apply(
    lambda label: label if isinstance(label, list) else [label]
)


In [None]:
# Feature extractors and label binarizer for the one-label experiment.
# count_vect counts word 1- to 3-grams; transformer turns counts into
# L2-normalised TF-IDF weights with sublinear term frequency.
count_vect = CountVectorizer(ngram_range=(1, 3))
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)
# Re-created unfitted here: it has no classes_ until fit / fit_transform runs.
mlb = MultiLabelBinarizer()

In [None]:
# 80/20 split of the one-label dataset with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    oneskillDF['description_cleaned'],
    oneskillDF['skill_abr_regroup'],
    test_size=0.20,
    random_state=60,
)

# NOTE(review): label binarisation is disabled in this cell, so Y_train / Y_test
# stay as pandas Series and `mlb` is still unfitted once it has run.
# Y_train = mlb.fit_transform(Y_train)
# Y_test = mlb.transform(Y_test)

# Vocabulary and IDF weights are learned from the training text only.
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = transformer.fit_transform(X_train_counts)

X_test_counts = count_vect.transform(X_test)
X_test_tfidf = transformer.transform(X_test_counts)

# Report the shapes of THIS split. The previous version printed the stale
# y_train / y_test from the multi-label section and, on the TF-IDF line,
# the bag-of-words test matrix instead of the TF-IDF training matrix.
print ("Bag of Word Shape :",X_train_counts.shape,X_test_counts.shape, Y_train.shape, Y_test.shape)
print ("TF-IDF Shape :",X_train_tfidf.shape,X_test_tfidf.shape, Y_train.shape, Y_test.shape)


In [None]:
# Baseline: one plain multi-class MultinomialNB on bag-of-words counts.
# The labels are stored as one-element lists (e.g. ['ENG']); scikit-learn
# rejects list-valued targets, so unwrap them to scalar class labels first.
# Values that are not lists/tuples are passed through untouched.
Y_train_flat = np.array([lbl[0] if isinstance(lbl, (list, tuple)) else lbl for lbl in Y_train])
Y_test_flat = np.array([lbl[0] if isinstance(lbl, (list, tuple)) else lbl for lbl in Y_test])

mnb_ = MultinomialNB(alpha=0.1)
mnb_.fit(X_train_counts, Y_train_flat)
y_pred_ = mnb_.predict(X_test_counts)
print("Accuracy for Navie Bayes with Count Vectorization (Bag of Word): {:.2f}%".format(accuracy_score(Y_test_flat, y_pred_)*100))
print(classification_report(Y_test_flat, y_pred_))

In [None]:
#Logistic Regression
# LR_classify treats the labels as a 2-D indicator matrix (argmax over axis 1)
# and reads mlb.classes_, so the list labels must be binarised and `mlb`
# fitted before calling it. Labels that are already 2-D are used as they are.
if Y_train.ndim == 1:
    Y_train_bin = mlb.fit_transform(Y_train)
    Y_test_bin = mlb.transform(Y_test)
else:
    Y_train_bin, Y_test_bin = Y_train, Y_test

one_LR_model_bow = LR_classify(X_train_counts, Y_train_bin, X_test_counts, Y_test_bin, 'Bag Of Words',multilabel=False)
one_LR_model_tfidf = LR_classify(X_train_tfidf, Y_train_bin, X_test_tfidf, Y_test_bin, 'TF-IDF',multilabel=False)

In [None]:
#Naive Bayes
# NB_classify slices the labels as a 2-D indicator matrix and reads
# mlb.classes_, so the list labels must be binarised and `mlb` fitted before
# calling it. Labels that are already 2-D are used as they are.
if Y_train.ndim == 1:
    Y_train_bin = mlb.fit_transform(Y_train)
    Y_test_bin = mlb.transform(Y_test)
else:
    Y_train_bin, Y_test_bin = Y_train, Y_test

one_NB_model_bow = NB_classify(X_train_counts, Y_train_bin, X_test_counts, Y_test_bin, 'Bag Of Words',multilabel=False)
one_NB_model_tfidf = NB_classify(X_train_tfidf, Y_train_bin, X_test_tfidf, Y_test_bin, 'TF-IDF',multilabel=False)

### Multi Skill Classification (One Label and Remove Unpredictable Skill)

We decided to remove the skills that got a low F1 score in the previous model.

In [None]:
# Skills removed from the dataset, written in the same one-element-list form
# as the values of the 'skill_abr_regroup' column.
LOW_F1_SKILLS = [['PRDM'], ['PROD'], ['RSCH'], ['DSGN'], ['MRKT'], ['EDU'], ['OTHR']]

# Series.isin hashes its values and lists are unhashable, so membership is
# tested with plain equality (`in` on a list of lists) instead.
is_low_f1 = oneskillDF['skill_abr_regroup'].apply(lambda labels: labels in LOW_F1_SKILLS).astype(bool)
cutskillDF = oneskillDF[~is_low_f1].copy()
print(cutskillDF.shape)
# Preview only; the shape printed above already gives the size.
cutskillDF.head()

In [None]:
# Fresh, unfitted extractors and binarizer for the reduced-skill experiment,
# replacing the objects of the same names used by the previous section.
count_vect = CountVectorizer(ngram_range=(1, 3))
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)
mlb = MultiLabelBinarizer()

In [None]:
# 80/20 split of the reduced dataset with a fixed seed.
# The spelling `cY_trian` is kept because the following cells use that name.
cXtrain, cXtest, cY_trian, cY_test = train_test_split(
    cutskillDF['description_cleaned'],
    cutskillDF['skill_abr_regroup'],
    test_size=0.20,
    random_state=60,
)

# Learn the label set from the training split, then encode both splits.
cY_trian, cY_test = mlb.fit_transform(cY_trian), mlb.transform(cY_test)

# Bag-of-words counts: vocabulary comes from the training text only.
cXtrain_counts = count_vect.fit_transform(cXtrain)
cXtest_counts = count_vect.transform(cXtest)

# TF-IDF weights: IDF statistics come from the training counts only.
cXtrain_tfidf = transformer.fit_transform(cXtrain_counts)
cXtest_tfidf = transformer.transform(cXtest_counts)

In [None]:
#Logistic Regression
# Reduced label set, single-label evaluation, on both feature representations.
cut_LR_model_bow = LR_classify(
    cXtrain_counts, cY_trian, cXtest_counts, cY_test, 'Bag Of Words',
    multilabel=False,
)
cut_LR_model_tfidf = LR_classify(
    cXtrain_tfidf, cY_trian, cXtest_tfidf, cY_test, 'TF-IDF',
    multilabel=False,
)

In [None]:
#Naive Bayes
# Reduced label set, single-label evaluation, on both feature representations.
cut_NB_model_bow = NB_classify(
    cXtrain_counts, cY_trian, cXtest_counts, cY_test, 'Bag Of Words',
    multilabel=False,
)
cut_NB_model_tfidf = NB_classify(
    cXtrain_tfidf, cY_trian, cXtest_tfidf, cY_test, 'TF-IDF',
    multilabel=False,
)

In [None]:
###################

In [None]:
# Preview the multi-label frame used by the resampling experiment below.
multiskillDF.head()

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import RandomOverSampler
from skmultilearn.problem_transform import LabelPowerset
# Encode the skill lists as a binary indicator matrix Y (one column per skill).
multilabel_binarizer = MultiLabelBinarizer().fit(multiskillDF['skill_abr_regroup'])
Y = multilabel_binarizer.transform(multiskillDF['skill_abr_regroup'])

# Unigram + bigram counts over the cleaned descriptions (whole dataset).
count_vect = CountVectorizer(ngram_range=(1, 2))
X_counts = count_vect.fit_transform(multiskillDF['description_cleaned'])

# Re-weight the counts as TF-IDF using the transformer's default settings.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [None]:

# Collapse each row of the indicator matrix Y into a single class id via
# LabelPowerset, so the single-target RandomOverSampler can be applied to it.
lp = LabelPowerset()
Y_lp = lp.transform(Y)

# NOTE(review): resampling is applied to the full X_tfidf / Y_lp here, and the
# train/test split only happens in a later cell. Rows duplicated by
# oversampling can therefore appear in both train and test, which would
# inflate the reported scores - consider splitting first and resampling only
# the training part.
ros = RandomOverSampler(random_state=9000)
X_tfidf_resampled, Y_tfidf_resampled_lp = ros.fit_resample(X_tfidf, Y_lp)

# Map the resampled class ids back through the same LabelPowerset instance.
Y_tfidf_resampled = lp.inverse_transform(Y_tfidf_resampled_lp)

In [None]:
# Decode the resampled indicator rows into per-sample label collections.
y_train_tfidf_resampled_multilabel = multilabel_binarizer.inverse_transform(Y_tfidf_resampled)

# 80/20 split of the resampled data with a fixed seed.
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf_resampled,
    y_train_tfidf_resampled_multilabel,
    test_size=0.2,
    random_state=9000,
)

# Re-encode the training labels as an indicator matrix; column sums give the
# number of training samples carrying each class.
y_train_tfidf_array = multilabel_binarizer.transform(y_train_tfidf)
y_train_tfidf_sum = y_train_tfidf_array.sum(axis=0)

# Side-by-side class counts: original data (left) vs. resampled training set (right).
fig = plt.figure(figsize=(20, 20))
ax_test, ax_train = fig.subplots(nrows=1, ncols=2)

g1 = sns.barplot(x=Y.sum(axis=0), y=multilabel_binarizer.classes_, ax=ax_test)
g1.set_title("class distribution before resampling")

g2 = sns.barplot(x=y_train_tfidf_sum, y=multilabel_binarizer.classes_, ax=ax_train)
g2.set_title("class distribution in training set after resampling")


In [None]:

# Binarise the held-out labels so they have the same indicator-matrix form as
# the predictions returned by clf.predict.
y_test_tfidf_array = multilabel_binarizer.transform(y_test_tfidf)
clf = OneVsRestClassifier(MultinomialNB())
# Train the classifier
clf.fit(x_train_tfidf, y_train_tfidf_array)
# Predict on the test set
y_pred_tfidf = clf.predict(x_test_tfidf)
# Score against the binarised labels: the raw y_test_tfidf is not in indicator
# form and cannot be compared with y_pred_tfidf. The message now names the
# model that is actually trained here (Naive Bayes, not Logistic Regression).
print("Accuracy for Naive Bayes with TF-IDF: {:.2f}%".format(accuracy_score(y_test_tfidf_array, y_pred_tfidf)*100))
print(classification_report(y_test_tfidf_array, y_pred_tfidf, target_names=multilabel_binarizer.classes_))