![](https://data.whicdn.com/images/266750169/original.jpg)

# Acknowledgements

Turcan, E., & McKeown, K. (2019). Dreaddit: A Reddit dataset for stress analysis in social media. arXiv preprint arXiv:1911.00133.

# The relevant research paper can be found here:
https://aclanthology.org/D19-6213.pdf

In [None]:
import pandas as pd
import numpy as np

# Read the Dreaddit train and test splits from the Kaggle input directory.
train = pd.read_csv(
    "../input/stress-analysis-in-social-media/dreaddit-train.csv"
)
test = pd.read_csv(
    "../input/stress-analysis-in-social-media/dreaddit-test.csv"
)

# Preview the first rows of the training split.
train.head(n=5)

In [None]:
# Preview the first rows of the test split.
test.head(n=5)

In [None]:
# Show the training split's column index, then its (rows, columns) shape.
print(train.columns, train.shape, sep="\n")

In [None]:
# Show the test split's column index, then its (rows, columns) shape.
print(test.columns, test.shape, sep="\n")

In [None]:
# Names of the object-dtype (categorical / text) columns in the training split.
list(train.select_dtypes(include='object').columns)


In [None]:
# Names of the object-dtype (categorical / text) columns in the test split.
list(test.select_dtypes(include='object').columns)


In [None]:
# Number of training posts per subreddit, most frequent first.
train['subreddit'].value_counts()

In [None]:
# Number of test posts per subreddit, most frequent first.
test['subreddit'].value_counts()

In [None]:
# Remove the identifier / bookkeeping columns from both splits.
train = train.drop(columns=['post_id', 'sentence_range', 'id'])
test = test.drop(columns=['post_id', 'sentence_range', 'id'])

In [None]:
# Pool the train and test splits into a single frame with a fresh 0..n-1 index.
df = pd.concat([train, test], axis=0, ignore_index=True)
# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the
# same row order (an unseeded shuffle gives different data order on every run).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each subreddit name with an integer code; `le` keeps the mapping.
le = LabelEncoder()
le.fit(df['subreddit'])
df['subreddit'] = le.transform(df['subreddit'])


In [None]:
# Preview the pooled frame after encoding the subreddit column.
df.head(n=5)

Label ---> 1 (Stress)

Label ---> 0 (Not stress)

In [None]:
# Absolute correlation of each numeric column with the stress label, strongest
# first (top 30). Non-numeric columns are dropped up front: since pandas 2.0
# `DataFrame.corr` no longer skips them silently and raises instead.
(
    df.select_dtypes(include=['number', 'bool'])
    .corr()
    .abs()['label']
    .sort_values(ascending=False)
    .head(30)
)

In [None]:
import tensorflow as tf
import transformers
import tqdm
from keras.preprocessing import sequence

# Helper that turns raw documents into lists of token ids.
def func_tokenizer(tokenizer_name, docs):
    """Convert each document into a list of token ids.

    Parameters
    ----------
    tokenizer_name : object
        A tokenizer instance (despite the parameter name, not a string) that
        provides ``tokenize(doc)`` and ``convert_tokens_to_ids(tokens)``.
    docs : iterable
        The documents to convert; iterated once, with a tqdm progress bar.

    Returns
    -------
    list
        One entry per document, in input order, holding whatever
        ``convert_tokens_to_ids`` returned for that document's tokens.
    """
    progress = tqdm.tqdm(docs, desc = 'converting documents to features')
    return [
        tokenizer_name.convert_tokens_to_ids(tokenizer_name.tokenize(doc))
        for doc in progress
    ]
print("The function is created successfully")

# BERT tokenizer

In [None]:
# Load the pretrained 'bert-large-uncased' tokenizer.
bert_tokenizer = transformers.BertTokenizer.from_pretrained('bert-large-uncased')

# Pick out the raw text plus a few lexical columns, and the target label.
X = df[['text', 'lex_liwc_Tone', 'lex_liwc_negemo', 'lex_liwc_Clout', 'lex_liwc_i', 'sentiment']]
y = df['label']

# Only the raw text is tokenized: one list of token ids per post.
bert_features = func_tokenizer(bert_tokenizer, X['text'])

In [None]:
# Pad or truncate every token-id sequence to exactly 500 entries. The padding
# and truncating modes are spelled out here (both are the Keras defaults):
# short sequences are padded at the front, long ones keep their last 500 ids.
bert_trg = sequence.pad_sequences(
    bert_features,
    maxlen=500,
    padding='pre',
    truncating='pre',
)


In [None]:
# Lexical / sentiment columns appended after the token ids, in this order.
lexical_columns = [
    'lex_liwc_negemo',
    'lex_liwc_Tone',
    'lex_liwc_Clout',
    'lex_liwc_i',
    'sentiment',
    'lex_dal_min_pleasantness',
    'lex_liwc_posemo',
    'lex_liwc_anx',
    'lex_liwc_Authentic',
    'lex_liwc_social',
    'lex_liwc_Analytic',
    'lex_liwc_function',
    'lex_liwc_Dic',
]

# Give the token-id columns string names. Leaving them as the default integer
# labels would mix int and str column labels in one frame, which scikit-learn
# (>= 1.2) rejects when fitting on a DataFrame.
token_columns = [f'bert_token_{i}' for i in range(bert_trg.shape[1])]

# Rows are combined by position, so both sources must have the same length.
assert len(bert_trg) == len(df), "padded token matrix and df must have the same number of rows"

X = pd.concat(
    [
        pd.DataFrame(bert_trg, columns=token_columns),
        # Drop the index so the join is positional, as the `.values` copies were.
        df[lexical_columns].reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Display the assembled feature matrix (token-id columns followed by the lexical features).
X

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

# All 9 models to be applied

In [None]:
def get_models(random_state=1):
    """Build the candidate classifiers to compare, keyed by a short name.

    The ``lr``, ``knn``, ``svm`` and ``bayes`` entries are pipelines that
    standardize the features before the classifier; the remaining entries are
    bare tree-based estimators.

    Parameters
    ----------
    random_state : int or None, default 1
        Seed passed to every estimator below that accepts one, so repeated runs
        of the notebook give the same scores. Pass ``None`` to leave the
        estimators unseeded.

    Returns
    -------
    dict
        Mapping of model name to an unfitted estimator, in insertion order.
    """
    models = dict()
    # NOTE(review): max_iter is left at the library default here; confirm that
    # the saga solver actually converges on this data with C = 70.
    models['lr'] = make_pipeline(
        StandardScaler(),
        LogisticRegression(solver = 'saga', C = 70.0, random_state = random_state),
    )
    models['knn'] = make_pipeline(StandardScaler(), KNeighborsClassifier())
    models['cart'] = DecisionTreeClassifier(max_depth = 1, random_state = random_state)
    models['svm'] = make_pipeline(StandardScaler(), SVC())
    models['bayes'] = make_pipeline(StandardScaler(), GaussianNB())
    models['xgboost'] = XGBClassifier(n_estimators = 11, max_depth = 1, random_state = random_state)
    models['GBM'] = GradientBoostingClassifier(n_estimators = 10, random_state = random_state)
    models['rf'] = RandomForestClassifier(n_estimators = 10, random_state = random_state)
    models['adaboost'] = AdaBoostClassifier(n_estimators = 12, random_state = random_state)

    return models

# Models are evaluated with repeated stratified 10-fold cross-validation (4 repeats)

In [None]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, n_splits=10, n_repeats=4, scoring='f1', random_state=1):
    """Score ``model`` on ``(X, y)`` with repeated stratified k-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``cross_val_score``.
    X, y : array-like
        Feature matrix and target labels.
    n_splits : int, default 10
        Number of folds per repeat.
    n_repeats : int, default 4
        Number of times the k-fold split is repeated.
    scoring : str, default 'f1'
        Metric name forwarded to ``cross_val_score``.
    random_state : int or None, default 1
        Seed for the fold assignment.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``
    (``n_splits * n_repeats`` values).

    Raises
    ------
    Any exception raised while fitting or scoring a fold is propagated,
    because ``error_score='raise'`` is used.
    """
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=random_state,
    )
    # n_jobs=-1 runs the folds in parallel on all available cores.
    scores = cross_val_score(
        model, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise'
    )
    return scores

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from numpy import mean
from numpy import std

# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    # mean and (population) standard deviation of the scores over all CV folds
    print('>%s %.2f (%.2f)' % (name, scores.mean(), scores.std()))

# plot model performance for comparison
fig, ax = plt.subplots()
ax.boxplot(results, showmeans=True)
# Name the boxes through the axes instead of boxplot's `labels=` keyword, which
# matplotlib 3.9 deprecated in favour of `tick_labels=`; this works on both.
ax.set_xticklabels(names)
ax.set_xlabel('model')
ax.set_ylabel('cross-validation score')
ax.set_title('Cross-validated score per model')
plt.show()

# Logistic regression, XGBoost and SVM gave the highest F1 scores!

# The relevant research paper can be found here:
https://aclanthology.org/D19-6213.pdf

# According to the research paper too, logistic regression gave the highest accuracy.

# Upvote if you like it.