In [1]:
import csv
import json
import os
import re

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy 'en_core_web_sm' pipeline once for the whole notebook; `nlp`
# is read as a global by clean_preprocess_export_json_to_csv for lemmatization.
nlp = spacy.load('en_core_web_sm')
# Download the NLTK resources. The 'stopwords' corpus is the one read by the
# text-preprocessing step in this notebook.
# NOTE(review): 'punkt' and 'wordnet' are not referenced by the code visible
# here (word_tokenize / WordNetLemmatizer are imported but unused) -- confirm
# before removing.
nltk.download('punkt')
nltk.download('stopwords')
# The return value of this last call is what the cell displays as its output.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Columns of the raw posts that the pipeline keeps (order defines the CSV layout).
_COLUMNS_TO_KEEP = [
    'author', 'created_utc', 'edited', 'id',
    'link_flair_text', 'name', 'num_comments', 'permalink',
    'score', 'text', 'title', 'upvote_ratio', 'url'
]

# Texts that stand in for content that is no longer available.
_PLACEHOLDER_TEXTS = ['[deleted]', '[removed]']

_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_PATTERN = re.compile(r'<.*?>')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def _add_time_features(posts):
    """Return a copy of `posts` with creation/edit features and without the raw columns."""
    posts = posts.copy()

    # `created_utc` is interpreted as seconds since the Unix epoch.
    posts['created_datetime'] = pd.to_datetime(posts['created_utc'], unit='s')
    created = posts['created_datetime']
    posts['day_of_week'] = created.dt.day_name()
    posts['hour_of_day'] = created.dt.hour
    posts['month'] = created.dt.month_name()
    posts['year'] = created.dt.year

    # `edited` is False for untouched posts; any other value marks an edit and
    # is parsed as epoch seconds (unparseable values become NaT).
    posts['was_edited'] = posts['edited'].apply(lambda value: value != False)
    edit_seconds = posts['edited'].apply(
        lambda value: value if value != False else None
    )
    posts['edit_datetime'] = pd.to_datetime(edit_seconds, unit='s', errors='coerce')

    return posts.drop(columns=['created_utc', 'edited'])


def _clean_text(value):
    """Strip URLs and HTML tags from one text and collapse runs of whitespace."""
    text = _URL_PATTERN.sub('', str(value))
    text = _HTML_PATTERN.sub('', text)
    return ' '.join(text.split())


def _basic_text_cleaning(posts):
    """Drop placeholder posts, lightly clean `text`, and drop texts of 3 characters or fewer."""
    # .copy() so the column assignment below writes to an independent frame
    # instead of a filtered view (avoids SettingWithCopyWarning).
    kept = posts.loc[~posts['text'].isin(_PLACEHOLDER_TEXTS)].copy()
    kept['text'] = kept['text'].apply(_clean_text)
    return kept.loc[kept['text'].map(len) > 3]


def _preprocess_text(text, stop_words):
    """Keep letters and whitespace only, lemmatize with spaCy, and drop stop words."""
    letters_only = _NON_LETTER_PATTERN.sub('', text)
    doc = nlp(letters_only)
    # The NLTK stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words such as "The" survive the filter.
    return ' '.join(
        token.lemma_ for token in doc if token.text.lower() not in stop_words
    )


def clean_preprocess_export_json_to_csv(json_file_path, output_dir='Job_Datasets'):
    """Clean a JSON dump of posts and export cleaned CSVs plus a TF-IDF matrix.

    Steps: load the JSON into a DataFrame, keep a fixed set of columns, drop
    rows with missing values and duplicate rows, derive date/edit features,
    lightly clean the `text` column, lemmatize it, and vectorize it with TF-IDF.

    Parameters
    ----------
    json_file_path : str
        Path to a UTF-8 JSON file that `pandas.DataFrame` can be built from.
    output_dir : str, optional
        Directory the outputs are written to; created if it does not exist.
        Defaults to 'Job_Datasets'.

    Returns
    -------
    None
        Results are written to disk only:
        `df_minimal_cleaning.csv` (after basic text cleaning),
        `df_nlp_processing.csv` (after lemmatization / stop-word removal) and
        `tfidf_matrix.npz` (sparse TF-IDF matrix of the processed text).

    Raises
    ------
    KeyError
        If any of the expected columns is missing from the input data.
    """
    # Load JSON into a DataFrame.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    raw_posts = pd.DataFrame(data)

    missing = [col for col in _COLUMNS_TO_KEEP if col not in raw_posts.columns]
    if missing:
        raise KeyError(f"Input data is missing expected columns: {missing}")

    # Keep only the attributes we need, then remove incomplete and duplicate rows.
    # NOTE(review): dropna() discards a post if ANY kept column is missing --
    # confirm that is intended for optional fields such as link_flair_text.
    posts_filtered = (
        raw_posts.loc[:, _COLUMNS_TO_KEEP]
        .dropna()
        .drop_duplicates()
    )

    posts_featured = _add_time_features(posts_filtered)
    posts_clean = _basic_text_cleaning(posts_featured)

    # Make sure the target directory exists before the first write.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    posts_clean.to_csv(os.path.join(output_dir, 'df_minimal_cleaning.csv'), index=False)

    # Build the stop-word set once; looking the list up per token re-evaluates
    # stopwords.words() for every token of every post.
    stop_words = frozenset(stopwords.words('english'))
    posts_nlp = posts_clean.assign(
        text=posts_clean['text'].apply(lambda text: _preprocess_text(text, stop_words))
    )
    posts_nlp.to_csv(os.path.join(output_dir, 'df_nlp_processing.csv'), index=False)

    # Vectorize the preprocessed text.
    # NOTE(review): only the matrix is saved; the fitted vocabulary is discarded,
    # so column meanings cannot be recovered from the .npz alone.
    tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
    tfidf_matrix = tfidf_vect.fit_transform(posts_nlp['text'])

    sparse.save_npz(os.path.join(output_dir, 'tfidf_matrix.npz'), tfidf_matrix)
    
    

In [3]:
# Path to the raw JSON export read by the pipeline (relative to the working directory).
json_file_path = 'api/jobs.json'
# Run the full pipeline; outputs are written under Job_Datasets/ and nothing is returned.
clean_preprocess_export_json_to_csv(json_file_path)