In [1]:
import json
import csv
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

nlp = spacy.load('en_core_web_sm')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv('Job_Datasets//all_posts_(1).csv')

In [3]:
df.head()

Unnamed: 0,id,text,title,author,created_utc,num_comments,post_id,upvote_ratio,score,url,subreddit,link_flair_text,link_flair_template_id
0,43580,"""""",Roast my resume,Anonymous_0427,1710585856,24,1bg3af6,0.7,22,https://i.redd.it/hijunkmzdooc1.jpeg,resumes,Review my resume • I'm in Asia,3d212a9a-f5cb-11ec-acef-1adb3b338801
1,44017,"""""",Roast my resume as hard as you can. Hold no pu...,Holiday-Penalty5389,1710197769,2,1bch6zz,1.0,2,https://i.redd.it/0m75bqb0csnc1.jpeg,resumes,Review my resume • I'm in North America,c292b8e0-28b9-11ec-874c-325b17e851a3
2,41034,"""Hi all, for context, 29 year old guy from Ams...",Lazy job or Hard job?,Weak_Assumption_6889,1710540442,8,1bfpxll,0.33,0,https://www.reddit.com/r/careeradvice/comments...,careeradvice,,
3,43519,"""Looking for a new role and haven\u2019t had m...",Roast my Resume Pls,Neither_Trash,1710709540,1,1bh8md2,0.99,1,https://i.redd.it/n918fjprlyoc1.jpeg,resumes,Review my resume • I'm in North America,c292b8e0-28b9-11ec-874c-325b17e851a3
4,41067,"""I am a 24M, from west bengal with a bachelor'...",Is Jadavpur University good for MCA?,grvx_rdt,1710515520,0,1bfg926,0.66,1,https://www.reddit.com/r/careeradvice/comments...,careeradvice,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8460 entries, 0 to 8459
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      8460 non-null   int64  
 1   text                    8460 non-null   object 
 2   title                   8460 non-null   object 
 3   author                  8410 non-null   object 
 4   created_utc             8460 non-null   int64  
 5   num_comments            8460 non-null   int64  
 6   post_id                 8460 non-null   object 
 7   upvote_ratio            8460 non-null   float64
 8   score                   8460 non-null   int64  
 9   url                     8460 non-null   object 
 10  subreddit               8460 non-null   object 
 11  link_flair_text         4047 non-null   object 
 12  link_flair_template_id  4047 non-null   object 
dtypes: float64(1), int64(4), object(8)
memory usage: 859.3+ KB


## Cleaning & Preprocessing

In [5]:
df = df.drop_duplicates()

In [6]:
##handling missing value

## filling missing values in the link_flair_text and link_flair_template_id  with placeholder


df['link_flair_text'] = df['link_flair_text'].fillna('Unknown')
df['link_flair_template_id'] = df['link_flair_template_id'].fillna('Unknown')


In [7]:
#Convert created_utc

df['created_datetime'] = pd.to_datetime(df['created_utc'], unit='s')
 # "created_utc" feature engineering
df['day_of_week'] = df['created_datetime'].dt.day_name()
df['hour_of_day'] = df['created_datetime'].dt.hour
df['month'] = df['created_datetime'].dt.month_name()
df['year'] = df['created_datetime'].dt.year

df.drop(columns=['created_utc'], inplace=True)

In [8]:
time_features = ['day_of_week', 'hour_of_day', 'month', 'year']


for feature in time_features:
    print(f"Value counts for {feature}:")
    print(df[feature].value_counts())
    print("\n") 


Value counts for day_of_week:
day_of_week
Thursday     1522
Friday       1447
Sunday       1281
Wednesday    1232
Saturday     1201
Tuesday       980
Monday        797
Name: count, dtype: int64


Value counts for hour_of_day:
hour_of_day
21    536
18    526
19    525
17    514
16    497
20    480
15    453
22    431
1     427
23    407
2     392
0     388
14    369
3     330
13    322
4     299
5     260
12    211
11    211
6     203
10    181
7     180
8     177
9     141
Name: count, dtype: int64


Value counts for month:
month
March        7696
February      510
January       178
November       13
September      11
August         11
December        8
May             8
October         8
June            7
July            7
April           3
Name: count, dtype: int64


Value counts for year:
year
2024    8366
2021      24
2022      23
2020      18
2023       8
2019       8
2018       7
2017       4
2016       2
Name: count, dtype: int64




In [1]:
def basic_text_cleaning(df):
  
    
        url_pattern = r'https?://\S+|www\.\S+'
        html_pattern = r'<.*?>'
    
   
        df = df[~df['text'].isin(['[deleted]', '[removed]'])]
        
    # remove URLs
        df['text'] = df['text'].apply(lambda x: re.sub(url_pattern, '', str(x)))
        
    # remove HTML tags
        df['text'] = df['text'].apply(lambda x: re.sub(html_pattern, '', str(x)))
        
    #whitespace
        df['text'] = df['text'].apply(lambda x: ' '.join(str(x).split()))
        
    # drop rows with very short responses
        df = df[df['text'].apply(lambda x: len(str(x)) > 3)]
        
        return df

In [19]:
df = basic_text_cleaning(df)

In [25]:
df = df.reset_index(drop=True)

In [26]:
df.shape

(7897, 17)

In [30]:
#explore author feature
author_counts = df['author'].value_counts()

# check if one author post only once
multiple_posts = author_counts[author_counts > 1]

if multiple_posts.empty:
    print("Each author has posted only once.")
else:
    print("Some authors have posted more than once:")
    print(multiple_posts)

Some authors have posted more than once:
author
AutoModerator         24
CSCQMods              19
carpet222             17
Odd_Spread_8332       15
savant78              13
                      ..
Character_Log_2657     2
Nighthawk1458          2
BoomBowBoom            2
FinanceMan231          2
channelblonded99       2
Name: count, Length: 589, dtype: int64


In [32]:
df[["num_comments", "score", "upvote_ratio"]].describe()

Unnamed: 0,num_comments,score,upvote_ratio
count,7897.0,7897.0,7897.0
mean,16.176269,29.05787,0.786072
std,51.21553,205.067047,0.233239
min,0.0,0.0,0.05
25%,1.0,1.0,0.62
50%,3.0,1.0,0.86
75%,9.0,3.0,1.0
max,1043.0,8597.0,1.0


In [33]:
df['subreddit'].value_counts()

subreddit
careeradvice           973
LegalAdviceOffTopic    908
ITCareerQuestions      898
teachers               875
FinancialCareers       824
careerguidance         689
resumes                640
jobs                   616
cscareerquestions      522
sales                  448
AskHR                  421
EngineeringCareers      83
Name: count, dtype: int64

In [34]:
df.to_csv('Job_Datasets/cleaned_whole_data.csv', index=False)

In [9]:
df2 = pd.read_csv('Job_Datasets/cleaned_whole_data.csv')

In [5]:
df3 = pd.read_csv('Job_Datasets/historical_data.csv')

In [6]:
df3 = basic_text_cleaning(df3)

In [7]:
df3 = df3.reset_index(drop=True)

In [10]:
combined_df = pd.concat([df2, df3], axis=0, ignore_index=True)


In [11]:
combined_df.shape

(8758, 18)

In [12]:
columns_to_check = ['month', 'year']
for column in columns_to_check:
    print(f"Value counts for {column}:")
    print(combined_df[column].value_counts())
    print("\n")

Value counts for month:
month
March        7226
February      545
January       228
August        114
July          108
May           104
June           97
December       85
April          66
November       65
September      62
October        58
Name: count, dtype: int64


Value counts for year:
year
2024    7888
2023     440
2022     145
2021     125
2020      92
2019      38
2018      18
2017      10
2016       2
Name: count, dtype: int64




In [13]:
combined_df.drop('created_utc', axis = 1, inplace = True)

In [14]:
combined_df.shape

(8758, 17)

In [15]:
combined_df.to_csv('Job_Datasets/final_data.csv', index=False)