In [1]:
# Standard library imports
import itertools
import re
import string
import warnings

# Third-party library imports for general purposes
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.sparse as sp
import statsmodels.api as sm
from scipy.sparse import load_npz
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, mean_absolute_error, mean_squared_error, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, scale
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA, NMF
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from tpot import TPOTClassifier, TPOTRegressor
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import ParameterGrid
import plotly.express as px


# Text processing and NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Gensim for topic modeling and vector space modeling
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Spacy for advanced NLP
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

# Statsmodels for statistical modeling
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

# Auto ARIMA model
from pmdarima.arima import auto_arima

# Plotting and visualization
from pandas.plotting import parallel_coordinates
from tqdm import tqdm

# Miscellaneous
import pickle
import io

# Suppress warnings
warnings.filterwarnings("ignore")

# NLTK downloads for specific functionalities
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import plotly.figure_factory as ff
from sklearn.cluster import KMeans

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dross\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dross\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dross\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dross\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<spacytextblob.spacytextblob.SpacyTextBlob at 0x1c9aeadf650>

GitHub

In [14]:
df = pd.read_csv('cleaned_whole_data.csv')

df.shape
#Removed target variable due to insufficent count 
df = df[df['subreddit']!='EngineeringCareers']

# show remaining target variables and count
df['subreddit'].value_counts()

subreddit
careeradvice           973
LegalAdviceOffTopic    908
ITCareerQuestions      898
teachers               875
FinancialCareers       824
careerguidance         689
resumes                640
jobs                   616
cscareerquestions      522
sales                  448
AskHR                  421
Name: count, dtype: int64

In [15]:
# for feature engineering we will need to convert desired features in to numerical representation 
label_encoder = LabelEncoder()

# Fit and transform the 'subreddit' column to obtain encoded target variable
df['subreddit_encoded'] = label_encoder.fit_transform(df['subreddit'])


Convert to weekday and weekend One-hot encode 'day_of_week' and 'month', obtaining a new DataFrame

In [4]:

day_mapping = {
    "Monday": 1,
    "Tuesday": 2,
    "Wednesday": 3,
    "Thursday": 4,
    "Friday": 5,
    "Saturday": 6,
    "Sunday": 7
}

df['day_of_week_encoded'] = df['day_of_week'].map(day_mapping)

month_mapping = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12
}

df['month_encoded'] = df['month'].map(month_mapping)


df['is_weekday'] = df['day_of_week'].apply(lambda x: 1 if x in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] else 0)
df.head()

Unnamed: 0,id,text,title,author,num_comments,post_id,upvote_ratio,score,url,subreddit,...,link_flair_template_id,created_datetime,day_of_week,hour_of_day,month,year,subreddit_encoded,day_of_week_encoded,month_encoded,is_weekday
0,41034,"Hi all, for context, 29 year old guy from Amst...",Lazy job or Hard job?,Weak_Assumption_6889,8,1bfpxll,0.33,0,https://www.reddit.com/r/careeradvice/comments...,careeradvice,...,Unknown,2024-03-15 22:07:22,Friday,22,March,2024,4,5,3,1
1,43519,Looking for a new role and haven\u2019t had mu...,Roast my Resume Pls,Neither_Trash,1,1bh8md2,0.99,1,https://i.redd.it/n918fjprlyoc1.jpeg,resumes,...,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-17 21:05:40,Sunday,21,March,2024,8,7,3,0
2,41067,"I am a 24M, from west bengal with a bachelor's...",Is Jadavpur University good for MCA?,grvx_rdt,0,1bfg926,0.66,1,https://www.reddit.com/r/careeradvice/comments...,careeradvice,...,Unknown,2024-03-15 15:12:00,Friday,15,March,2024,4,5,3,1
3,43536,It's created in MS word.. I have 3.5 yoe in Ja...,Review my resume please,GroundbreakingZone94,7,1bh3jpg,0.6,1,https://i.redd.it/tjnti8vplxoc1.jpeg,resumes,...,3d212a9a-f5cb-11ec-acef-1adb3b338801,2024-03-17 17:43:39,Sunday,17,March,2024,8,7,3,0
4,43602,I live in Los Angeles and am 27 years old.\n,Thoughts on My Experience?,AshamedJellyfish9197,1,1bgejk2,0.33,0,https://i.redd.it/ifqv15ry3roc1.jpeg,resumes,...,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-16 19:53:24,Saturday,19,March,2024,8,6,3,0


#to provide more context concatenated text and title then preformed tfidf on the Concatenated title and text

In [5]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # text = expand_contractions(text)
    
    # Removing unicode characters
    text = re.sub(r'\\u[0-9A-Fa-f]+', '', text)
    
    # Removing escape sequences
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'\\', '', text)
    
    # Removing non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing stop words and lemmatizing
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Joining back into a string
    return ' '.join(tokens)

df['processed_text'] = df['text'].apply(preprocess_text)
df['combined_text'] = df['title'] + ' ' + df['processed_text']

# Apply TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Using matrices and want to combine with encoded_features, you will need to convert them to a dense format
tfidf_dense = tfidf_matrix.toarray()

We combine the dense array with the array_features for one set of features 

In [6]:
#Inorder to cobine all features will have to convert them to an numpy array 
#this dense matrix gives a 7% higher accuracy score but is expensive computional wise 
#solely due to cost ran sparse matrix  but feel free to run either

array_features = df[['num_comments','upvote_ratio','score','hour_of_day','year','day_of_week_encoded','month_encoded','is_weekday']].values

combined_features = np.hstack((tfidf_dense, array_features))

#target varibale
y = df['subreddit_encoded'].values

In [29]:
#XGB
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=42)
#Linear_SVC
X_trains, X_tests, y_trains, y_tests = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

Our hyper tuning techinques for both the linear svc and the XGB boost where a combination of and exhaustive grid search and Baysiean optimization  

In [56]:
#exhaustive grid XGB
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': hp.choice('max_depth', np.arange(1, 15, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'n_estimators': hp.choice('n_estimators', [100, 200, 300, 400, 500]),
    'verbosity': hp.choice('verbosity', [0]),
    'n_jobs': -1  # Use all available cores
}

In [57]:
# Baysiean optimization XGB
def objective(params):
    xgb_classifier = XGBClassifier(**params)
    score = cross_val_score(xgb_classifier, X_train, y_train, cv=5, scoring='accuracy').mean()
    return {'loss': -score, 'status': STATUS_OK}

# Run the optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

print("Best Hyperparameters for XGBoost:", best)

 45%|████▌     | 45/100 [5:27:06<4:32:00, 296.73s/trial, best loss: -0.6872493685051959] 

In [30]:
XGB = XGBClassifier(learning_rate=0.1132567200427148, max_depth=2, min_child_weight=3.0, n_estimators=100, n_jobs=1, subsample=0.6821295850917053, verbosity=0)
XGB.fit(X_train,y_train)
results = XGB.predict(X_test)


 67% accuracy the dense matrix and pairing of features gave us a 6 percent increase in overall preformance then tthe linearSVC suggested with the Auto ML

In [33]:
#XGBClassifier measurement statistics 
acc = accuracy_score(y_test,results)
print(f'Accuracy: {acc}')

#precision
precision = precision_score(y_test,results,average='weighted')
print(f'Precision: {precision}')
#Recall
recall = recall_score(y_test, results, average='weighted')
print(f'Recall:{recall}')
#f1 Score 
f1 = f1_score(y_test,results,average='weighted')
print(f'F1 Score: {f1}')

Accuracy: 0.6609085092770314
Precision: 0.665885221707135
Recall:0.6609085092770314
F1 Score: 0.6609520380787629


The intent of tpot is to assist with model selection tpot gave us a great bench mark model a Linear_SVC that gave good accuracy but after tuning and a pca analysis the final model choosen was an XGboost classifier 

In [None]:
tpot = TPOTClassifier(generations=10,population_size=100,verbosity=2,random_state=42, config_dict='TPOT sparse')
tpot.fit(X_trains, y_trains)
tpot.export('best_tpot_pipelineV2.py')

In [31]:
exported_pipeline = LinearSVC(C= 0.09695023762026912,dual=False,loss='squared_hinge',penalty='l2',tol=0.000143728759546748,class_weight='balanced') 
#fix random state in exported estimator
if hasattr(exported_pipeline,'random_state'):
    setattr(exported_pipeline,'random_state',42)
    
exported_pipeline.fit(X_trains,y_trains)
result = exported_pipeline.predict(X_tests)

In [19]:
#exhaustive grid LinearSVC
param_grid = {'C':[.01,.1,1,10,100,1000],
              'tol':[.0001,.001,.01,.1],
              'loss':['squared_hinge'],
              'penalty':['l2'],
              'dual':[False]}

grid = list(ParameterGrid(param_grid))

lin_svc =LinearSVC(random_state=42)

best_score = 0
best_params={}

for params in tqdm(grid, desc="GridSearch"):
    
    lin_svc.set_params(**params)
    lin_svc.fit(X_trains,y_trains)
    #Evaluate the model
    current_score = lin_svc.score(X_tests,y_tests)
    
    #update with best score and param
    
    if current_score > best_score:
        best_score = current_score
        best_params = params 
        
print("Best Parameters:",best_params)
print("Best Score:",best_score)


GridSearch: 100%|██████████| 24/24 [00:04<00:00,  5.08it/s]

Best Parameters: {'C': 0.1, 'dual': False, 'loss': 'squared_hinge', 'penalty': 'l2', 'tol': 0.1}
Best Score: 0.6276391554702495





In [20]:
# Baysiean optimization LinearSVC
space = {
    'C': hp.loguniform('C', np.log(0.01), np.log(1000)),
    'tol': hp.loguniform('tol', np.log(0.0001), np.log(0.1)),
    'loss': hp.choice('loss', ['squared_hinge']),
    'penalty': hp.choice('penalty', ['l2']),
    'dual': hp.choice('dual', [False])
}

# Define the objective function
def objective(params):
    lin_svc = LinearSVC(**params, random_state=42)
    score = cross_val_score(lin_svc, X_trains, y_trains, cv=5, scoring='accuracy').mean()
    return {'loss': -score, 'status': STATUS_OK}

# Run the optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

print("Best Hyperparameters:", best)

100%|██████████| 100/100 [01:12<00:00,  1.38trial/s, best loss: -0.6280587050359713]
Best Hyperparameters: {'C': 0.09695023762026912, 'dual': 0, 'loss': 0, 'penalty': 0, 'tol': 0.000143728759546748}


In [32]:
acc = accuracy_score(y_tests,result)
print(f'Accuracy: {acc}')

#precision
precision = precision_score(y_tests,result,average='weighted')
print(f'Precision: {precision}')
#Recall
recall = recall_score(y_tests, result, average='weighted')
print(f'Recall:{recall}')
#f1 Score 
f1 = f1_score(y_tests,result,average='weighted')
print(f'F1 Score: {f1}')

Accuracy: 0.6218809980806143
Precision: 0.6064071587845172
Recall:0.6218809980806143
F1 Score: 0.6054736306825682


In [20]:
dummy_clf = DummyClassifier(strategy = 'most_frequent', random_state=42)

dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
print(f'Baseline Accuracy: {accuracy}')



Baseline Accuracy: 0.12859884836852206


XGBoost in comprasion to the linear svc 

In [34]:
#XGB
predicted_subreddit_names = label_encoder.inverse_transform(results)
true_subreddit_names = label_encoder.inverse_transform(y_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(true_subreddit_names, predicted_subreddit_names)

# Get unique class names in the right order
sorted_unique_names = np.unique(np.concatenate((true_subreddit_names, predicted_subreddit_names)))

# Create the heatmap
fig = ff.create_annotated_heatmap(
    z=conf_matrix,
    x=sorted_unique_names.tolist(),  
    y=sorted_unique_names.tolist(),  
    colorscale='Viridis',
    annotation_text=np.around(conf_matrix.astype(float) / conf_matrix.sum(axis=1)[:, np.newaxis], decimals=2).astype(str),
    hoverinfo="z",
    showscale=True
)

# Update the layout
fig.update_layout(
    title='XGB Predictions',
    xaxis=dict(title='Predicted Subreddit'),
    yaxis=dict(title='True Subreddit', autorange='reversed'),
    width=800,  
    height=800,
)

# Show the figure
fig.show()

The linear svc was choose due to it simplicity fronm there we tune with two techiques the costly yet powerful grid search and Bayesian optimization. Bayesian learns statistically which combinations did better

In [35]:
#linear SVC
predicted_subreddit_names = label_encoder.inverse_transform(result)
true_subreddit_names = label_encoder.inverse_transform(y_tests)
conf_matrix = confusion_matrix(y_tests, result)

# Ensure it's in list format
subreddit_names_list = sorted_unique_names.tolist()

# Re-attempt to create the heatmap
fig = ff.create_annotated_heatmap(
    z=conf_matrix,
    x=subreddit_names_list,  # Ensure this is a list
    y=subreddit_names_list,  # Ensure this is a list too
    colorscale='Viridis',
    annotation_text=np.around(conf_matrix.astype(np.float64) / conf_matrix.sum(axis=1)[:, np.newaxis], decimals=2),
    hoverinfo="z",
    showscale=True
)

fig.update_layout(
    title='Linear SVC Predictions',
    xaxis=dict(title='Predicted Subreddit'),
    yaxis=dict(title='True Subreddit'),
    yaxis_autorange='reversed'
)
fig.show()

Put predictions back into dataframe 

In [13]:
predictions = XGB.predict(combined_features)

# Decode predictions to original labels
decoded_predictions = label_encoder.inverse_transform(predictions)

# Add predictions back to the DataFrame
df['predicted_subreddit'] = decoded_predictions

df.head(5)
# df.to_csv('Final_Predictions.csv', index=False)