In [31]:
# Standard library imports
import itertools
import re
import string
import warnings

# Third-party library imports for general purposes
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.sparse as sp
import statsmodels.api as sm
from scipy.sparse import load_npz
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, mean_absolute_error, mean_squared_error, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, scale
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA, NMF
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import make_hastie_10_2
from xgboost import XGBClassifier
from tpot import TPOTClassifier, TPOTRegressor
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import ParameterGrid


# Text processing and NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Gensim for topic modeling and vector space modeling
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Spacy for advanced NLP
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

# Statsmodels for statistical modeling
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.api import VAR
from statsmodels.tsa.arima.model import ARIMA


# Auto ARIMA model
from pmdarima.arima import auto_arima

# Plotting and visualization
from pandas.plotting import parallel_coordinates
from tqdm import tqdm

# Miscellaneous
import pickle

# Suppress warnings
warnings.filterwarnings("ignore")

# NLTK downloads for specific functionalities
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dross\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dross\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dross\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dross\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x13cd597f010>

LOCALLY

GitHub

In [37]:
df = pd.read_csv('cleaned_whole_data.csv')
#tfidf_matrix = load_npz('C:/Users/dross/Desktop/Project/tfidf_matrix.npz')
# list(df['link_flair_text'].unique())
# labels = df['link_flair_text']
df.head()

Unnamed: 0,id,text,title,author,num_comments,post_id,upvote_ratio,score,url,subreddit,link_flair_text,link_flair_template_id,created_datetime,day_of_week,hour_of_day,month,year
0,41034,"Hi all, for context, 29 year old guy from Amst...",Lazy job or Hard job?,Weak_Assumption_6889,8,1bfpxll,0.33,0,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-15 22:07:22,Friday,22,March,2024
1,43519,Looking for a new role and haven\u2019t had mu...,Roast my Resume Pls,Neither_Trash,1,1bh8md2,0.99,1,https://i.redd.it/n918fjprlyoc1.jpeg,resumes,Review my resume • I'm in North America,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-17 21:05:40,Sunday,21,March,2024
2,41067,"I am a 24M, from west bengal with a bachelor's...",Is Jadavpur University good for MCA?,grvx_rdt,0,1bfg926,0.66,1,https://www.reddit.com/r/careeradvice/comments...,careeradvice,Unknown,Unknown,2024-03-15 15:12:00,Friday,15,March,2024
3,43536,It's created in MS word.. I have 3.5 yoe in Ja...,Review my resume please,GroundbreakingZone94,7,1bh3jpg,0.6,1,https://i.redd.it/tjnti8vplxoc1.jpeg,resumes,Review my resume • I'm in Asia,3d212a9a-f5cb-11ec-acef-1adb3b338801,2024-03-17 17:43:39,Sunday,17,March,2024
4,43602,I live in Los Angeles and am 27 years old.\n,Thoughts on My Experience?,AshamedJellyfish9197,1,1bgejk2,0.33,0,https://i.redd.it/ifqv15ry3roc1.jpeg,resumes,Review my resume • I'm in North America,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-16 19:53:24,Saturday,19,March,2024


In [38]:
df['subreddit'].value_counts()

subreddit
careeradvice           973
LegalAdviceOffTopic    908
ITCareerQuestions      898
teachers               875
FinancialCareers       824
careerguidance         689
resumes                640
jobs                   616
cscareerquestions      522
sales                  448
AskHR                  421
EngineeringCareers      83
Name: count, dtype: int64

In [39]:
label_encoder = LabelEncoder()

# Fit and transform the 'subreddit' column to obtain encoded target variable
df['subreddit_encoded'] = label_encoder.fit_transform(df['subreddit'])


In [40]:
#Convert to weekday and weekend 
# One-hot encode 'day_of_week' and 'month', obtaining a new DataFrame
day_mapping = {
    "Monday": 1,
    "Tuesday": 2,
    "Wednesday": 3,
    "Thursday": 4,
    "Friday": 5,
    "Saturday": 6,
    "Sunday": 7
}

df['day_of_week_encoded'] = df['day_of_week'].map(day_mapping)

month_mapping = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12
}

df['month_encoded'] = df['month'].map(month_mapping)


df['is_weekday'] = df['day_of_week'].apply(lambda x: 1 if x in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] else 0)
df.head()

Unnamed: 0,id,text,title,author,num_comments,post_id,upvote_ratio,score,url,subreddit,...,link_flair_template_id,created_datetime,day_of_week,hour_of_day,month,year,subreddit_encoded,day_of_week_encoded,month_encoded,is_weekday
0,41034,"Hi all, for context, 29 year old guy from Amst...",Lazy job or Hard job?,Weak_Assumption_6889,8,1bfpxll,0.33,0,https://www.reddit.com/r/careeradvice/comments...,careeradvice,...,Unknown,2024-03-15 22:07:22,Friday,22,March,2024,5,5,3,1
1,43519,Looking for a new role and haven\u2019t had mu...,Roast my Resume Pls,Neither_Trash,1,1bh8md2,0.99,1,https://i.redd.it/n918fjprlyoc1.jpeg,resumes,...,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-17 21:05:40,Sunday,21,March,2024,9,7,3,0
2,41067,"I am a 24M, from west bengal with a bachelor's...",Is Jadavpur University good for MCA?,grvx_rdt,0,1bfg926,0.66,1,https://www.reddit.com/r/careeradvice/comments...,careeradvice,...,Unknown,2024-03-15 15:12:00,Friday,15,March,2024,5,5,3,1
3,43536,It's created in MS word.. I have 3.5 yoe in Ja...,Review my resume please,GroundbreakingZone94,7,1bh3jpg,0.6,1,https://i.redd.it/tjnti8vplxoc1.jpeg,resumes,...,3d212a9a-f5cb-11ec-acef-1adb3b338801,2024-03-17 17:43:39,Sunday,17,March,2024,9,7,3,0
4,43602,I live in Los Angeles and am 27 years old.\n,Thoughts on My Experience?,AshamedJellyfish9197,1,1bgejk2,0.33,0,https://i.redd.it/ifqv15ry3roc1.jpeg,resumes,...,c292b8e0-28b9-11ec-874c-325b17e851a3,2024-03-16 19:53:24,Saturday,19,March,2024,9,6,3,0


In [41]:
#to provide more context concatenated text and title then prefrom tfidf
# Concatenate title and text
df['combined_text'] = df['title'] + ' ' + df['text']

# Apply TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# If you're using sparse matrices and want to combine with encoded_features, you might need to convert them to a dense format
tfidf_dense = tfidf_matrix.toarray()

In [42]:
#Inorder to cobine all features will have to convert them to an numpy array 
#this dense matrix gives a 7% higher accuracy score but is expensive computional wise 
#solely due to cost ran sparse matrix  but feel free to run either

array_features = df[['num_comments','upvote_ratio','score','hour_of_day','year','day_of_week_encoded','month_encoded','is_weekday']].values

combined_features = np.hstack((tfidf_dense, array_features))


In [43]:
y = df['subreddit_encoded'].values

In [44]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

In [45]:
tpot = TPOTClassifier(generations=10,population_size=100,verbosity=2,random_state=42, config_dict='TPOT sparse')
#tpot = TPOTClassifier(generations=5, population_size=100, verbosity=2, random_state=42)

This is an expensive run would not run locally you can still use the suggested Linaer SVC on the next comment

In [2]:
tpot.fit(X_train, y_train)

In [14]:
tpot.export('best_tpot_pipelineV2.py')

In [17]:
exported_pipeline = LinearSVC(C=.1,dual=False,loss='squared_hinge',penalty='l2',tol=0.1)
#fix random state in exported estimator
if hasattr(exported_pipeline,'random_state'):
    setattr(exported_pipeline,'random_state',42)
    
exported_pipeline.fit(X_train,y_train)
results = exported_pipeline.predict(X_test)

In [34]:
param_grid = {'C':[.01,.1,1,10,100,1000],
              'tol':[.0001,.001,.01,.1],
              'loss':['squared_hinge'],
              'penalty':['l2'],
              'dual':[False]}

grid = list(ParameterGrid(param_grid))

lin_svc =LinearSVC(random_state=42)

best_score = 0
best_params={}

for params in tqdm(grid, desc="GridSearch"):
    
    lin_svc.set_params(**params)
    lin_svc.fit(X_train,y_train)
    #Evaluate the model
    current_score = lin_svc.score(X_test,y_test)
    
    #update with best score and param
    
    if current_score > best_score:
        best_score = current_score
        best_params = params 
        
print("Best Parameters:",best_params)
print("Best Score:",best_score)


GridSearch: 100%|██████████| 24/24 [00:04<00:00,  5.58it/s]

Best Parameters: {'C': 0.1, 'dual': False, 'loss': 'squared_hinge', 'penalty': 'l2', 'tol': 0.01}
Best Score: 0.6075949367088608





In [23]:
acc = accuracy_score(y_test,results)
print(f'Accuracy: {acc}')

#precision
precision = precision_score(y_test,results,average='weighted')
print(f'Precision: {precision}')
#Recall
recall = recall_score(y_test, results, average='weighted')
print(f'Recall:{recall}')
#f1 Score 
f1 = f1_score(y_test,results,average='weighted')
print(f'F1 Score: {f1}')

Accuracy: 0.6044303797468354
Precision: 0.5805389544523861
Recall:0.6044303797468354
F1 Score: 0.5826799645195494


In [27]:
dummy_clf = DummyClassifier(strategy = 'most_frequent', random_state=42)

dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
print(f'Baseline Accuracy: {accuracy}')

print(classification_report(y_test,y_pred))

Baseline Accuracy: 0.1291139240506329
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        96
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00       169
           3       0.00      0.00      0.00       167
           4       0.00      0.00      0.00       180
           5       0.13      1.00      0.23       204
           6       0.00      0.00      0.00       147
           7       0.00      0.00      0.00       104
           8       0.00      0.00      0.00       126
           9       0.00      0.00      0.00       133
          10       0.00      0.00      0.00        81
          11       0.00      0.00      0.00       160

    accuracy                           0.13      1580
   macro avg       0.01      0.08      0.02      1580
weighted avg       0.02      0.13      0.03      1580

