In [1]:
# Saving training data and testing data filepath to variables for easier access
file_path = r"https://raw.githubusercontent.com/amankharwal/Website-data/master/stress.csv"

In [2]:
# Reading the test data and training data and storing them in DataFrame titled as test_data and training_data respectively
import pandas as pd
df = pd.read_csv(file_path)

In [3]:
df.columns

Index(['subreddit', 'post_id', 'sentence_range', 'text', 'id', 'label',
       'confidence', 'social_timestamp', 'social_karma', 'syntax_ari',
       ...
       'lex_dal_min_pleasantness', 'lex_dal_min_activation',
       'lex_dal_min_imagery', 'lex_dal_avg_activation', 'lex_dal_avg_imagery',
       'lex_dal_avg_pleasantness', 'social_upvote_ratio',
       'social_num_comments', 'syntax_fk_grade', 'sentiment'],
      dtype='object', length=116)

In [4]:
relevant_columns = ['text','label']
# label -> 1 (stressed) and lebel -> 0(not stressed)
data = df[relevant_columns]

In [5]:
data.isnull().sum()

text     0
label    0
dtype: int64

In [6]:
posts = data.text
labels = data.label

In [7]:
labels

0       1
1       0
2       1
3       1
4       1
       ..
2833    0
2834    1
2835    0
2836    0
2837    1
Name: label, Length: 2838, dtype: int64

In [8]:
posts

0       He said he had not felt that way before, sugge...
1       Hey there r/assistance, Not sure if this is th...
2       My mom then hit me with the newspaper and it s...
3       until i met my new boyfriend, he is amazing, h...
4       October is Domestic Violence Awareness Month a...
                              ...                        
2833    * Her, a week ago: Precious, how are you? (I i...
2834    I don't have the ability to cope with it anymo...
2835    In case this is the first time you're reading ...
2836    Do you find this normal? They have a good rela...
2837    I was talking to my mom this morning and she s...
Name: text, Length: 2838, dtype: object

In [9]:
# Creating a lemmatizer to convert a word into base form 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
#function to convert tags given by pos_tag function to the tags accepted by WordNetLemmatizer
from nltk.corpus import wordnet
def get_simple_pos(tag):
  if tag.startswith('J'):
    return wordnet.ADJ
  elif tag.startswith('V'):
    return wordnet.VERB
  elif tag.startswith('N'):
    return wordnet.NOUN
  elif tag.startswith('R'):
    return wordnet.ADV
  else:
    return wordnet.NOUN

In [11]:
import nltk

# Download the required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')  # Required for POS tagging
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string

# Define the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the set of stopwords and punctuations
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

# Define a function to map POS tags to lemmatizer format
def get_simple_pos(tag):
    if tag.startswith('J'):
        return 'a'  # Adjective
    elif tag.startswith('V'):
        return 'v'  # Verb
    elif tag.startswith('N'):
        return 'n'  # Noun
    elif tag.startswith('R'):
        return 'r'  # Adverb
    else:
        return 'n'  # Default to noun

# Define the function to clean the data
def clean_data(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

# Example usage of the function
example_words = ["This", "is", "a", "test", "sentence", "."]
cleaned_words = clean_data(example_words)
print(cleaned_words)


In [12]:
from nltk.tokenize import word_tokenize
tokenized_posts = [word_tokenize(post.lower()) for post in posts]

In [13]:
# Cleaning the data from each post:
clean_posts = [clean_data(post) for post in tokenized_posts]

In [14]:
posts = [" ".join(post) for post in clean_posts]

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(posts, labels,test_size=0.25, random_state=1)

In [17]:
"""from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer(max_features = 3000, ngram_range = (1,2),max_df = 0.8)
X_train_features = count_vec.fit_transform(X_train)
X_train_features.todense()"""

'from sklearn.feature_extraction.text import CountVectorizer\ncount_vec = CountVectorizer(max_features = 3000, ngram_range = (1,2),max_df = 0.8)\nX_train_features = count_vec.fit_transform(X_train)\nX_train_features.todense()'

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
count_vec = TfidfVectorizer(max_features = 3000, ngram_range = (1,3), max_df = 0.8)
X_train_features = count_vec.fit_transform(X_train)
X_train_features.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.21240241, ..., 0.        , 0.        ,
         0.        ]])

In [19]:
# To get the feature names selected
#count_vec.get_feature_names_out()

array(['00', '000', '10', ..., 'zach', 'zero', 'zoloft'], dtype=object)

In [20]:
X_test_features = count_vec.transform(X_test)
X_test_features

<710x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 22872 stored elements in Compressed Sparse Row format>

In [33]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train_features, Y_train)
svc.score(X_test_features, Y_test)

Y_pred = svc.predict(X_test_features)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print other metrics
print("Classification Report:")
print(classification_report(Y_test, Y_pred))

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.74      0.71       319
           1       0.77      0.72      0.75       391

    accuracy                           0.73       710
   macro avg       0.73      0.73      0.73       710
weighted avg       0.73      0.73      0.73       710



In [22]:
"""from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier on the vectorized training data
nb_classifier.fit(X_train_features, Y_train)

# Make predictions on the vectorized test data
Y_pred = nb_classifier.predict(X_test_features)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print other metrics
print("Classification Report:")
print(classification_report(Y_test, Y_pred))"""

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.62      0.67       319
           1       0.72      0.82      0.77       391

    accuracy                           0.73       710
   macro avg       0.73      0.72      0.72       710
weighted avg       0.73      0.73      0.72       710



In [32]:

# Take input text from the user
user_input = input("Enter text to predict its class: ")

# Tokenization of the user input
tokenized_user_input = word_tokenize(user_input.lower())

# stopword removal, lemmatization of tokenized_user_input
clean_user_input = clean_data(tokenized_user_input)
user_input = " ".join(clean_user_input)

# Vectorize the user input using the pre-trained vectorizer
user_input_vectorized = count_vec.transform([user_input])

# Make predictions using the pre-trained classifier
predicted_class = nb_classifier.predict(user_input_vectorized)

if(predicted_class[0] == 0):
    print("Not Stressed")
elif(predicted_class[0] == 1):
   print("Stressed")



Enter text to predict its class: i am feeling great today
Not Stressed
