In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter


In [2]:
df = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Fake-Job-Posts/Data/fake_job_postings.csv')

# Same Cleaning Techniques

In [3]:
# Drop location, job id, and salary range - they are either unnecessary or have too many missing values

df.drop(columns=['job_id', 'location', 'salary_range'], inplace=True)

In [4]:
# Replace missing values in categorical columns with empty strings
columns_with_text = ['title', 'department', 'company_profile', 'description', 'requirements',
       'benefits', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function']
df[columns_with_text] = df[columns_with_text].replace(np.nan, '')
df.isnull().sum()


title                  0
department             0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
dtype: int64

In [5]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4') 

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sabrinasayed/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sabrinasayed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sabrinasayed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sabrinasayed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sabrinasayed/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into text
    return ' '.join(tokens)


In [7]:

# Apply preprocessing to relevant text columns
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
for column in text_columns:
    df[f'{column}_processed'] = df[column].apply(preprocess_text)

df.head()

Unnamed: 0,title,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,title_processed,company_profile_processed,description_processed,requirements_processed,benefits_processed
0,Marketing Intern,Marketing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,marketing intern,food created groundbreaking award winning cook...,food fast growing james beard award winning on...,experience content management system major plu...,
1,Customer Service - Cloud Video Production,Success,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,customer service cloud video production,second world cloud video production service se...,organised focused vibrant awesome passion cust...,expect key responsibility communicate client s...,get usthrough part second team gain experience...
2,Commissioning Machinery Assistant (CMA),,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,commissioning machinery assistant cma,valor service provides workforce solution meet...,client located houston actively seeking experi...,implement pre commissioning commissioning proc...,
3,Account Executive - Washington DC,Sales,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,account executive washington dc,passion improving quality life geography heart...,company esri environmental system research ins...,education bachelor master gi business administ...,culture anything corporate collaborative creat...
4,Bill Review Manager,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,bill review manager,spotsource solution llc global human capital m...,job title itemization review managerlocation f...,qualification rn license state texasdiploma ba...,full benefit offered


In [8]:
import unicodedata

def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text)
    
    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into text
    return ' '.join(tokens)

# Reapply the improved preprocessing
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
for column in text_columns:
    df[f'{column}_processed'] = df[column].apply(preprocess_text)

# Verify the cleaning worked
def check_cleaning(text_series):
    all_text = ' '.join(text_series)
    special_chars = re.findall(r'[^a-zA-Z\s]', all_text)
    remaining = Counter(special_chars).most_common()
    
    if remaining:
        print("Remaining special characters:")
        print(remaining)
    else:
        print("No special characters remaining!")

# Check the results
for column in text_columns:
    print(f"\nChecking {column}_processed:")
    check_cleaning(df[f'{column}_processed'])


Checking title_processed:
Remaining special characters:
[('_', 1)]

Checking company_profile_processed:
Remaining special characters:
[('_', 3827)]

Checking description_processed:
Remaining special characters:
[('_', 8945)]

Checking requirements_processed:
Remaining special characters:
[('_', 2901)]

Checking benefits_processed:
Remaining special characters:
[('_', 3683)]


# Text Vectorization

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine relevant text columns for vectorization
df['combined_text']= (df['title_processed'] + ' ' + 
                      df['company_profile_processed'] + ' ' + 
                      df['description_processed'] + ' ' + 
                      df['requirements_processed'] + ' ' + 
                      df['benefits_processed'])

vectorizer = TfidfVectorizer(min_df = 0.01, 
                             max_df = 0.95, 
                             ngram_range=(1,3), 
                             stop_words='english')

X_tfidf = vectorizer.fit_transform(df['combined_text'])

vec_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
vec_tfidf.head()

Unnamed: 0,ability,ability build,ability communicate,ability effectively,ability learn,ability manage,ability multi,ability multi task,ability prioritize,ability work,...,youll,youll need,youll work,young,youre,youre looking,youve,yr,zealand,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.011413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.019763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Topic Modeling

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Document-term matrix using CountVectorizer
count_vectorizer = CountVectorizer(min_df=0.01, max_df=0.95)
doc_term_matrix = count_vectorizer.fit_transform(df['combined_text'])

# FLDA model
# n_components represents the number of topics you want to extract
n_topics = 10
lda_model = LatentDirichletAllocation(n_components=n_topics, 
                                     random_state=42,
                                     learning_method='batch')
lda_output = lda_model.fit_transform(doc_term_matrix)

# Display the top words for each topic
def print_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]
        print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

# Display top 10 words for each topic
print_topics(lda_model, count_vectorizer.get_feature_names_out(), 10)

# Add topic distributions to original dataframe
topic_names = [f"Topic {i+1}" for i in range(n_topics)]
doc_topics = pd.DataFrame(lda_output, columns=topic_names)

# Get dominant topic for each document
df['dominant_topic'] = doc_topics.idxmax(axis=1)


Topic 1: company, finance, financial, service, benefit, employment, position, year, well, credit
Topic 2: marketing, medium, digital, social, content, brand, campaign, online, experience, new
Topic 3: business, management, client, project, team, experience, skill, work, service, company
Topic 4: experience, job, technical, system, engineering, year, website, amp, data, manufacturing
Topic 5: experience, product, design, development, software, technology, service, application, team, user
Topic 6: student, job, teacher, abroad, get, loan, teaching, required, experience, title
Topic 7: customer, service, work, process, business, document, communication, solution, required, mail
Topic 8: sale, customer, product, service, business, work, career, role, candidate, training
Topic 9: service, work, home, time, care, must, experience, position, customer, hour
Topic 10: team, work, experience, people, new, working, company, looking, product, want


# New Advanced Features

In [12]:
# Add new sophisticated NLP features
def extract_advanced_text_features(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return {
            # Basic features (from original)
            'avg_word_length': 0,
            'caps_ratio': 0,
            'url_count': 0,
            'email_pattern': 0,
            'money_pattern': 0,
            # New advanced features
            'num_sentences': 0,
            'avg_sentence_length': 0,
            'num_entities': 0,
            'sentiment_score': 0
        }
    
    nlp = spacy.load('en_core_web_md')
    doc = nlp(text)

    # Basic features (keep original calculations)
    words = text.split()
    avg_word_length = np.mean([len(word) for word in words]) if words else 0
    text_length = len(text)
    caps_ratio = sum(1 for c in text if c.isupper()) / text_length if text_length > 0 else 0
    
    # New advanced features
    sentences = list(doc.sents)
    sentiment = TextBlob(text).sentiment
    
    return {
        # Original features
        'avg_word_length': avg_word_length,
        'caps_ratio': caps_ratio,
        'url_count': text.count('http'),
        'email_pattern': len(re.findall(r'[\w\.-]+@[\w\.-]+', text)),
        'money_pattern': len(re.findall(r'[\$£€]\d+', text)),
        # New features
        'num_sentences': len(sentences),
        'avg_sentence_length': np.mean([len(str(sent).split()) for sent in sentences]) if sentences else 0,
        'num_entities': len(doc.ents),
        'sentiment_score': sentiment.polarity
    }

In [13]:
# Create bigram/trigram features
def extract_ngram_features(text_series, min_df=5):
    ngram_vectorizer = CountVectorizer(
        ngram_range=(2, 3),
        min_df=min_df,
        stop_words='english'
    )
    
    ngrams = ngram_vectorizer.fit_transform(text_series)
    return pd.DataFrame(
        ngrams.toarray(),
        columns=[f'ngram_{feat}' for feat in ngram_vectorizer.get_feature_names_out()]
    )

In [15]:
# Apply advanced features to each text column
for column in text_columns:
    print(f"\nProcessing {column} with advanced features...")
    
    # Extract advanced features
    advanced_features = df[f'{column}_processed'].apply(extract_advanced_text_features)
    advanced_df = pd.DataFrame(advanced_features.tolist())
    advanced_df.columns = [f'{column}_{col}' for col in advanced_df.columns]
    
    # Extract n-gram features
    ngram_df = extract_ngram_features(df[f'{column}_processed'])
    


Processing title with advanced features...




KeyboardInterrupt: 

In [None]:
# Put it all together

cleaned_df = pd.concat([df, advanced_df, ngram_df, vec_tfidf], axis=1)
cleaned_df.head()