**Mount Google Drive**

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Statements**

In [10]:
import re
import nltk
import spacy
import numpy as np
import pandas as pd

from collections import defaultdict
from nltk.tokenize import sent_tokenize

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Read Reddit Data**

In [12]:
reddit_df = pd.read_csv('/content/drive/MyDrive/CSE6250/data/500_Reddit_users_posts_labels.csv')
reddit_df.head()

Unnamed: 0,User,Post,Label
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive
1,user-1,['It can be hard to appreciate the notion that...,Ideation
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior
3,user-3,['I tried to kill my self once and failed badl...,Attempt
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation


**Clean String Data**

In [13]:
def clean_string_data(post):
    
    # convert all characters to lowercase
    post = post.lower()

    # remove any URLs
    post = re.sub(r'http\S+', '', post)

    # remove any HTML tags
    post = re.sub('<[^<]+?>', '', post)

    # remove any non-alphanumeric characters and replace them with spaces
    post = re.sub(r'[^a-zA-Z0-9 ]', ' ', post)

    # remove extra whitespace and return the cleaned string
    return ' '.join(post.split())

**AFINN Score**

In [14]:
# read the data
afinn = pd.read_csv('/content/drive/MyDrive/CSE6250/data/AFINN-en-165.txt', 
                    sep="\t", 
                    header=None)

# convert to dictionary
afinn_dict = dict(zip(afinn[0], afinn[1]))

# function to obtain afinn score
def get_afinn_score(post):
  
  # clean the post data
  post = clean_string_data(post)

  # obtain the mean afinn score for the post
  afinn_score = [afinn_dict[p] for p in post.split() if p in afinn_dict.keys()]

  # if no terms match, we just return an afinn score of 0, else return the mean
  if len(afinn_score) > 0:
    return np.mean(afinn_score)
  else:
    return 0

# create a column and store afinn score
reddit_df["afinn_score"] = reddit_df["Post"].apply(get_afinn_score)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963


**Language Assessment by Mechanical Turk - Happiness Rank Score**

In [15]:
# read the data
labMT = pd.read_csv("/content/drive/MyDrive/CSE6250/data/labMT")

# build a dictionary
labMT_happiness_rank_dict = dict(zip(labMT['word'], labMT['happiness_rank']))

def get_happiness_rank_score(post):

  # clean the post data
  post = clean_string_data(post)

  # obtain the mean happiness rank score for the post
  happiness_rank_score = [labMT_happiness_rank_dict[p] for p in post.split() if p in labMT_happiness_rank_dict.keys()]

  # if no terms match, we just return score of 0, else return the mean
  if len(happiness_rank_score) > 0:
    return np.mean(happiness_rank_score)
  else:
    return 0

# create a column and store happiness rank score
reddit_df['happiness_rank_score'] = reddit_df["Post"].apply(get_happiness_rank_score)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204


**Language Assessment by Mechanical Turk - Happiness Average Score**


In [16]:
# build a dictionary
labMT_happiness_avg_dict = dict(zip(labMT['word'], labMT['happiness_average']))

def get_happiness_average_score(post):

  # clean the post data
  post = clean_string_data(post)

  # obtain the mean happiness average score for the post
  happiness_average_score = [labMT_happiness_avg_dict[p] for p in post.split() if p in labMT_happiness_avg_dict.keys()]

  # if no terms match, we just return score of 0, else return the mean
  if len(happiness_average_score) > 0:
    return np.mean(happiness_average_score)
  else:
    return 0

# create a column and store happiness average score
reddit_df['happiness_average_score'] = reddit_df["Post"].apply(get_happiness_average_score)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653


**Language Assessment by Mechanical Turk - Happiness Standard Deviation Score**


In [17]:
# build a dictionary
labMT_happiness_standard_deviation_dict = dict(zip(labMT['word'], 
                                                   labMT['happiness_standard_deviation']))

def get_happiness_standard_deviation_score(post):

  # clean the post data
  post = clean_string_data(post)

  # obtain the mean happiness standard deviation score for the post
  happiness_standard_deviation_score = [labMT_happiness_standard_deviation_dict[p] for p in post.split() if p in labMT_happiness_standard_deviation_dict.keys()]

  # if no terms match, we just return score of 0, else return the mean
  if len(happiness_standard_deviation_score) > 0:
    return np.mean(happiness_standard_deviation_score)
  else:
    return 0

# create a column and store happiness standard deviation score
reddit_df['happiness_standard_deviation_score'] = reddit_df["Post"].apply(get_happiness_standard_deviation_score)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813


**Language Assessment by Mechanical Turk - Twitter Score**

In [18]:
# build a dictionary
labMT_twitter_rank_dict = dict(zip(labMT['word'], labMT['twitter_rank']))

def get_twitter_rank_score(post):

  # clean the post data
  post = clean_string_data(post)

  # obtain the mean twitter rank for the post
  twitter_rank_score = [labMT_twitter_rank_dict[p] for p in post.split() if ( p in labMT_twitter_rank_dict.keys() and not pd.isna(labMT_twitter_rank_dict[p]))]
  
  # if no terms match, we just return a score of 0, else return the mean
  if len(twitter_rank_score) > 0:
    return np.mean(twitter_rank_score)
  else:
    return 0

# create a column and store twitter score
reddit_df['twitter_rank_score'] = reddit_df["Post"].apply(get_twitter_rank_score)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553


**Language Assessment by Mechanical Turk - Google Score**

In [19]:
# build a dictionary
labMT_google_rank_dict = dict(zip(labMT['word'], labMT['google_rank']))

def get_google_rank_score(post):

  # clean the post data
  post = clean_string_data(post)

  # obtain the mean google score for the post
  google_rank_score = [labMT_google_rank_dict[p] for p in post.split() if ( p in labMT_google_rank_dict.keys() and not pd.isna(labMT_google_rank_dict[p]))]
  
  # if no terms match, we just return a score of 0, else return the mean
  if len(google_rank_score) > 0:
    return np.mean(google_rank_score)
  else:
    return 0

# create a column and store google score
reddit_df['google_rank_score'] = reddit_df["Post"].apply(get_google_rank_score)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368


**Language Assessment by Mechanical Turk - NYT Score**

In [20]:
# build a dictionary
labMT_nyt_rank_dict = dict(zip(labMT['word'], labMT['nyt_rank']))

def get_nyt_rank_score(post):

  # clean the post data
  post = clean_string_data(post)

  # obtain the mean nyt score for the post
  nyt_rank_score = [labMT_nyt_rank_dict[p] for p in post.split() if ( p in labMT_nyt_rank_dict.keys() and not pd.isna(labMT_nyt_rank_dict[p]))]
  
  # if no terms match, we just return a score of 0, else return the mean
  if len(nyt_rank_score) > 0:
    return np.mean(nyt_rank_score)
  else:
    return 0
    
# create a column and store nyt score
reddit_df['nyt_rank_score'] = reddit_df["Post"].apply(get_nyt_rank_score)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778


**Language Assessment by Mechanical Turk - Lyrics Score**

In [21]:
# build a dictionary
labMT_lyrics_rank_dict = dict(zip(labMT['word'], labMT['lyrics_rank']))

def get_lyrics_rank_score(post):

  # clean the post data
  post = clean_string_data(post)

  # obtain the mean lyrics score for the post
  lyrics_rank_score = [labMT_lyrics_rank_dict[p] for p in post.split() if ( p in labMT_lyrics_rank_dict.keys() and not pd.isna(labMT_lyrics_rank_dict[p]))]

  # if no terms match, we just return a score of 0, else return the mean
  if len(lyrics_rank_score) > 0:
    return np.mean(lyrics_rank_score)
  else:
    return 0

# create a column and store lyrics score
reddit_df['lyrics_rank_score'] = reddit_df["Post"].apply(get_lyrics_rank_score)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465,370.600907
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622


**First Person Pronoun Ratio**

In [22]:
def get_first_person_pronoun_ratio(post):

  # list of pronouns
  first_person_pronouns = ["I", "me", "my", "mine", "we", "us", "our", "ours"]
  other_pronouns = ["he", "him", "his", "she", "her", "hers", "they", "them", "their", "theirs"]

  # initialize counts
  first_person_count, other_pronoun_count = 0, 0

  # clean the post data
  post = clean_string_data(post)

  # obtain counts
  for p in post.split():
    p_lower = p.lower()
    if p_lower in first_person_pronouns:
      first_person_count += 1
    elif p_lower in other_pronouns:
      other_pronoun_count += 1

  # total count 
  total_pronoun_count = first_person_count + other_pronoun_count
  
  # return first person pronoun ratio
  if total_pronoun_count > 0:
    return (first_person_count/total_pronoun_count)*100
  else:
    return 0

# create a column and store first person pronoun ratio 
reddit_df["FPPR"] = reddit_df["Post"].apply(get_first_person_pronoun_ratio)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score,FPPR
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622,0.0
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555,40.540541
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465,370.600907,80.0
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692,96.428571
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622,55.555556


**Number of Sentences**

In [23]:
def get_number_of_sentences(post):

  # use nltk to obtain number of sentences
  number_of_sentences = sent_tokenize(post)
  return len(number_of_sentences)

# create a column and store number of sentences
reddit_df['number_of_sentences'] = reddit_df['Post'].apply(get_number_of_sentences)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score,FPPR,number_of_sentences
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622,0.0,10
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555,40.540541,138
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465,370.600907,80.0,24
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692,96.428571,54
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622,55.555556,19


**Number of Definite Articles**

In [24]:
def get_number_of_definite_articles(post):
  post_lower = post.lower()
  words = post_lower.split()
  number_of_definite_articles = sum(1 for word in words if word == "the")
  return number_of_definite_articles

# create a column and store number of definite articles
reddit_df['number_of_definite_articles'] = reddit_df['Post'].apply(get_number_of_definite_articles)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score,FPPR,number_of_sentences,number_of_definite_articles
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622,0.0,10,2
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555,40.540541,138,45
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465,370.600907,80.0,24,14
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692,96.428571,54,18
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622,55.555556,19,2


**Maximum Verb Phrase Length**

In [25]:
nlp = spacy.load('en_core_web_sm')

def get_max_verb_phrase(post):
  
  # Process the text
  doc = nlp(post)
  # Initialize variables to store max verb phrase length and current verb phrase length
  max_length = 0
  current_length = 0
  
  # Iterate through the tokens in the processed text
  for token in doc:
      if token.pos_ == 'VERB':
          # If the token is a verb, increment the current verb phrase length
          current_length += 1
      elif current_length > 0:
          # If the token is not a verb and the current verb phrase length is greater than 0,
          # update the max verb phrase length if necessary and reset the current verb phrase length
          max_length = max(max_length, current_length)
          current_length = 0
  
  # Update the max verb phrase length if necessary after processing all tokens
  max_length = max(max_length, current_length)
  
  return max_length

reddit_df['max_verb_phrase_length'] = reddit_df['Post'].apply(get_max_verb_phrase)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score,FPPR,number_of_sentences,number_of_definite_articles,max_verb_phrase_length
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622,0.0,10,2,2
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555,40.540541,138,45,2
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465,370.600907,80.0,24,14,2
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692,96.428571,54,18,2
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622,55.555556,19,2,2


**Height of Dependency Parse Tree**

In [26]:
nlp = spacy.load("en_core_web_sm", disable=['ner'])

def get_avg_tree_height(post):
  # Computes the average height of the dependency parse tree for a given text
  
  def tree_height(root):
      # Recursively computes the height of a dependency parse tree
      if not list(root.children):
          return 0
      else:
          return 1 + max(tree_height(x) for x in root.children)
  
  doc = nlp(post)
  roots = [sent.root for sent in doc.sents]
  heights = [tree_height(root) for root in roots]
  avg_height = sum(heights) / len(heights)
  return avg_height

reddit_df['parse_tree_height'] = reddit_df['Post'].apply(get_avg_tree_height)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score,FPPR,number_of_sentences,number_of_definite_articles,max_verb_phrase_length,parse_tree_height
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622,0.0,10,2,2,3.545455
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555,40.540541,138,45,2,4.042945
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465,370.600907,80.0,24,14,2,5.12
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692,96.428571,54,18,2,4.33871
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622,55.555556,19,2,2,3.65


**First Person Pronoun Count**

In [27]:
def get_first_person_pronoun_count(post):

  post = clean_string_data(post)
  
  fpp = ["i", "me", "my", "mine", "we", "us", "our", "ours"]
  
  first_person = 0

  for word in post.split():
      if word in fpp:
          first_person += 1

  return first_person

reddit_df['fpp_count'] = reddit_df['Post'].apply(get_first_person_pronoun_count)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score,FPPR,number_of_sentences,number_of_definite_articles,max_verb_phrase_length,parse_tree_height,fpp_count
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622,0.0,10,2,2,3.545455,2
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555,40.540541,138,45,2,4.042945,58
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465,370.600907,80.0,24,14,2,5.12,21
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692,96.428571,54,18,2,4.33871,69
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622,55.555556,19,2,2,3.65,16


**Other Person Pronoun Count**

In [28]:
def get_other_person_pronoun_count(post):

  post = clean_string_data(post)
  
  spp = ["you", "your", "yours"]
  tpp = ["he", "him", "his", "she", "her", "hers", "they", "them", "their", "theirs"]
  other_pp = ["it", "its", "who", "whom", "whose", "which", "what", "that"]

  other_person = 0

  for word in post.split():
      if ((word in spp) or (word in tpp) or (word in other_pp)):
          other_person += 1

  return other_person

reddit_df['other_pronoun_count'] = reddit_df['Post'].apply(get_other_person_pronoun_count)

reddit_df.head()

Unnamed: 0,User,Post,Label,afinn_score,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score,FPPR,number_of_sentences,number_of_definite_articles,max_verb_phrase_length,parse_tree_height,fpp_count,other_pronoun_count
0,user-0,"['Its not a viable option, and youll be leavin...",Supportive,-1.181818,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622,0.0,10,2,2,3.545455,2,15
1,user-1,['It can be hard to appreciate the notion that...,Ideation,0.028708,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555,40.540541,138,45,2,4.042945,58,289
2,user-2,"['Hi, so last night i was sitting on the ledge...",Behavior,0.525,5148.436404,5.409605,1.153144,365.233996,432.37587,477.860465,370.600907,80.0,24,14,2,5.12,21,53
3,user-3,['I tried to kill my self once and failed badl...,Attempt,0.160494,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692,96.428571,54,18,2,4.33871,69,84
4,user-4,['Hi NEM3030. What sorts of things do you enjo...,Ideation,0.62963,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622,55.555556,19,2,2,3.65,16,20


**All Columns of the Reddit Data**

In [29]:
reddit_df.columns

Index(['User', 'Post', 'Label', 'afinn_score', 'happiness_rank_score',
       'happiness_average_score', 'happiness_standard_deviation_score',
       'twitter_rank_score', 'google_rank_score', 'nyt_rank_score',
       'lyrics_rank_score', 'FPPR', 'number_of_sentences',
       'number_of_definite_articles', 'max_verb_phrase_length',
       'parse_tree_height', 'fpp_count', 'other_pronoun_count'],
      dtype='object')

**Select only the User and Characteristic External Features**

In [33]:
reddit_df_ef = reddit_df[["User", "afinn_score", "FPPR", "happiness_rank_score", "happiness_average_score",
                          "happiness_standard_deviation_score", "twitter_rank_score", "google_rank_score", "nyt_rank_score",
                          "lyrics_rank_score", "parse_tree_height", "max_verb_phrase_length", "fpp_count", "number_of_sentences", "number_of_definite_articles", "other_pronoun_count"]]
reddit_df_ef.head()

Unnamed: 0,User,afinn_score,FPPR,happiness_rank_score,happiness_average_score,happiness_standard_deviation_score,twitter_rank_score,google_rank_score,nyt_rank_score,lyrics_rank_score,parse_tree_height,max_verb_phrase_length,fpp_count,number_of_sentences,number_of_definite_articles,other_pronoun_count
0,user-0,-1.181818,0.000000,5461.290323,5.288065,1.171006,391.605042,466.862069,544.101695,524.554622,3.545455,2,2,10,2,15
1,user-1,0.028708,40.540541,5118.062681,5.411909,1.165012,411.082669,438.073795,532.416624,403.854555,4.042945,2,58,138,45,289
2,user-2,0.525000,80.000000,5148.436404,5.409605,1.153144,365.233996,432.375870,477.860465,370.600907,5.120000,2,21,24,14,53
3,user-3,0.160494,96.428571,4903.928066,5.472783,1.178218,384.455882,460.693627,541.666252,401.807692,4.338710,2,69,54,18,84
4,user-4,0.629630,55.555556,4587.260204,5.552653,1.180813,427.042553,736.847368,678.727778,426.421622,3.650000,2,16,19,2,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,user-495,0.250000,39.130435,5133.065124,5.415801,1.162185,383.638522,416.846014,489.023593,372.796429,4.846154,2,36,67,34,177
496,user-496,-1.027027,13.888889,5282.018519,5.327500,1.148409,451.843823,443.576142,478.147500,395.847118,4.344828,1,19,33,11,51
497,user-497,0.020000,50.617284,5175.882293,5.392603,1.165938,367.565998,469.390155,491.831430,411.100639,3.545283,2,235,491,135,721
498,user-498,-0.406250,62.500000,5120.764286,5.401714,1.165336,384.026895,450.690909,464.580475,461.857143,5.142857,2,42,19,11,24


**Save the features**

In [35]:
reddit_df_ef.to_csv("/content/drive/MyDrive/CSE6250/data/External_Features.csv", index=False)