## Integrative complexity

See Owens and Wedeking (2011, pp. 1055–1057)

LIWC categories:

1. Sixl (% of words in a text with 6 or more letters) - rule-based
2. Discrepancy - LIWC 134
3. Tentativeness - LIWC 135
4. Inclusiveness - LIWC 138
5. Causation - LIWC 133
6. Insight - LIWC 132 
7. Inhibition - LIWC 137
8. Certainty - LIWC 136 
9. Negations - LIWC 19
10. Exclusiveness - LIWC 139 


### Equation
$IC = Sixl+ Discr + Tent + Incl + Cause + Insig - Inhib - Cert - Negate - Excl$

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

In [2]:
#import LIWC
LIWC_2007 = pd.read_csv("~/Desktop/LIWC2007_English100131.dic.csv", header = None)

#import dataset
full_data = pd.read_csv('data/full_data.csv')
clean_full_data = full_data[['commentText','ID']]

FileNotFoundError: [Errno 2] No such file or directory: '/Users/liaoyufang/Desktop/LIWC2007_English100131.dic.csv'

### Calculate Integrative Complexity

In [5]:
#Calculating sixl (words >= 6 letters)
def sixl(clean_token):
    six_count = 0

    for token in clean_token: #total tokens after pre-processing 
        if len(token) >= 6: #count if token length >= 6
            six_count += 1  

    try: 
        six_per = six_count / len(clean_token) #calculate percentage
    except ZeroDivisionError: #if total token = 0 (e.g., empty comment after pre-processing)
        six_per = 0 #set percentage = 0

    return six_per, six_count

In [31]:
#Calculating integrative complexity
'''
POSITIVE:
    Discrepancy - LIWC 134
    Tentativeness - LIWC 135
    Inclusiveness - LIWC 138
    Causation - LIWC 133
    Insight - LIWC 132 

NEGATIVE:
    Inhibition - LIWC 137
    Certainty - LIWC 136 
    Negations - LIWC 19
    Exclusiveness - LIWC 139 
'''
text = "not not not not not feeling great"


def IC_cal(text):
    post_word = 0 #set ini positive word count as 0
    neg_word = 0 #set ini negative word count as 0 

    #pre-processing
    total_token = nltk.word_tokenize(text) #tokenizing
    lower_tokens = [token.lower() for token in total_token if token.isalpha()] #to lower case and remove non alphabet tokens

    #calculating sixl
    sixl_per, sixl_count = sixl(lower_tokens)

    # calculating categories
    for token in lower_tokens:
        #print(f"Now processing: {token}")
        matching_row = LIWC_2007[LIWC_2007[0].str.contains(fr"^{token}\*?$", regex=True)]

        if matching_row.empty == False:
            #print(f"\n {token} table: \n -------------")
            #display(matching_row)

            for col in range(len(matching_row.columns)):
                if list(matching_row[col].astype(str).str.contains('132' or '133' or '134' or '135' or '138'))[0] == True:
                    post_word += 1
                elif list(matching_row[col].astype(str).str.contains('19' or '136' or '137' or '139'))[0] == True:
                    neg_word += 1
        #else:
            #print("No matching row!\n")

    #Calculating integrative complexity 
    IC_count = post_word+sixl_count-neg_word #only counting words
    IC_count_per = post_word+sixl_per-neg_word #counting positive words and negative words and the % of >six-letter words 

    print("-------------------------------\n")
    print(f"Integrative complexity score based on count:{IC_count_per}")

    return IC_count, IC_count_per

In [40]:
IC_all = clean_full_data.apply(lambda x: IC_cal(x['commentText']), axis = 1) #apply to all data

clean_full_data['IC_score'] = pd.DataFrame.from_records(IC_all)[0] #append to clean_full_data from tuple
clean_full_data['IC_score_per'] = pd.DataFrame.from_records(IC_all)[1] #append to clean_full_data from tuple

clean_full_data.to_csv("Integrative complexity.csv") #save file

clean_full_data.corr()

Unnamed: 0,IC_score,IC_score_per
IC_score,1.0,0.156302
IC_score_per,0.156302,1.0


In [44]:
clean_full_data['text_length'] = clean_full_data.apply(lambda x: len(x['commentText']), axis = 1) #calculate text length

clean_full_data.corr() #show correlation

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_full_data['text_length'] = clean_full_data.apply(lambda x: len(x['commentText']), axis = 1)


Unnamed: 0,IC_score,IC_score_per,text_length
IC_score,1.0,0.156302,0.981979
IC_score_per,0.156302,1.0,0.071115
text_length,0.981979,0.071115,1.0
