In [6]:
import pandas as pd
import nltk

# Load the NRC lexicon
def load_nrc_lexicon(path='NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'):
    """Read an NRC word-level emotion lexicon file into a nested dictionary.

    Every data line is expected to contain three tab-separated fields,
    ``word<TAB>category<TAB>score``. Lines that start with ``#`` are treated
    as comments and skipped. Blank lines and lines that do not split into
    exactly three fields are skipped as well, instead of aborting the whole
    load with an unpacking error.

    Parameters
    ----------
    path : str, optional
        Location of the lexicon file. Defaults to the file name this notebook
        has always used, resolved relative to the current working directory.

    Returns
    -------
    dict
        Mapping ``{word: {category: score}}`` where ``score`` is an ``int``.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    ValueError
        If a three-field line carries a score that is not an integer.
    """
    lexicon = {}
    # Pin the encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) != 3:
                # Blank separator lines or stray text: nothing to record.
                continue
            word, emotion, score = fields
            lexicon.setdefault(word, {})[emotion] = int(score)
    return lexicon

# with an input dataframe and the column name, output a dataframe with the appended sentiment scores
def perform_sentiment_analysis(df, text_column, lexicon=None):
    """Append NRC disposition and emotion flag columns to ``df``.

    For every row, the text in ``text_column`` is tokenized with
    ``nltk.word_tokenize``, lower-cased, and each token is looked up in the
    lexicon. Per-category scores are summed over the tokens of the row and
    then reduced to string flags:

    * ``'NRC Disposition'``: ``'1'`` if the positive total exceeds the
      negative total, ``'-1'`` if it is lower, ``'0'`` if they are equal.
    * ``'NRC Anger'``, ``'NRC Anticipation'``, ``'NRC Disgust'``,
      ``'NRC Fear'``, ``'NRC Joy'``, ``'NRC Sadness'``, ``'NRC Surprise'``,
      ``'NRC Trust'``: ``'1'`` if the total for that emotion is above zero,
      otherwise ``'0'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to score. It is modified in place: the nine
        ``NRC ...`` columns are added to (or overwritten on) this object.
    text_column : str
        Name of the column in ``df`` that contains the text.
    lexicon : dict, optional
        Pre-loaded lexicon shaped ``{word: {category: int}}``. When omitted,
        ``load_nrc_lexicon()`` is called, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new columns.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    """
    emotions = ('anger', 'anticipation', 'disgust', 'fear',
                'joy', 'sadness', 'surprise', 'trust')
    categories = ('positive', 'negative') + emotions

    if lexicon is None:
        lexicon = load_nrc_lexicon()

    disposition_flags = []
    emotion_flags = {emotion: [] for emotion in emotions}

    for text in df[text_column]:
        # The tokenizer only accepts strings; missing cells (NaN/None) would
        # otherwise abort the whole run, so they are scored as empty text.
        tokens = [token.lower()
                  for token in nltk.word_tokenize(_coerce_to_text(text))]

        # Every category starts from a baseline of zero for each row.
        totals = dict.fromkeys(categories, 0)
        for token in tokens:
            entry = lexicon.get(token)
            if entry is None:
                continue
            for category in categories:
                # A category absent from the entry contributes nothing.
                totals[category] += entry.get(category, 0)

        if totals['positive'] > totals['negative']:
            disposition_flags.append('1')
        elif totals['positive'] < totals['negative']:
            disposition_flags.append('-1')
        else:
            disposition_flags.append('0')

        for emotion in emotions:
            emotion_flags[emotion].append('1' if totals[emotion] > 0 else '0')

    # Column order matches the historical output: disposition first, then
    # the eight emotions in the order listed above.
    df['NRC Disposition'] = disposition_flags
    for emotion in emotions:
        df['NRC ' + emotion.capitalize()] = emotion_flags[emotion]

    return df


def _coerce_to_text(value):
    """Return ``value`` as a string, mapping missing values to ``''``."""
    if isinstance(value, str):
        return value
    try:
        missing = bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna on a container yields an array with no single truth value;
        # such a cell is not "missing", so fall through to str().
        missing = False
    return '' if missing else str(value)

In [7]:
# Reading in reviews on Indeed from gig companies

df = pd.read_csv("indeed_reviews_super_cleaned_all.csv")

# Name of the column whose text is scored. Used both for the optional
# build step below and for the analysis call, so the two cannot drift apart.
TEXT_COLUMN = 'combined_text'

# Uncomment to combine different text columns for analysis.
# fillna('') keeps a missing title or body from turning the whole cell into NaN.
# df[TEXT_COLUMN] = df['title'].fillna('').astype(str) + ' ' + df['text'].fillna('').astype(str)

# Fail early with an actionable message instead of a bare KeyError raised
# from inside the analysis function.
if TEXT_COLUMN not in df.columns:
    raise KeyError(
        "column '{}' not found in the reviews file; uncomment the line above "
        "that builds it from 'title' and 'text', or point TEXT_COLUMN at an "
        "existing column".format(TEXT_COLUMN)
    )

df = perform_sentiment_analysis(df, TEXT_COLUMN)
print(df.head())

# If you created the combined text column above, you can use the line below to remove it afterwards
# del df[TEXT_COLUMN]

# NOTE: with the default arguments, to_csv also writes the DataFrame index as
# an unnamed leading column; pass index=False if that column is not wanted.
df.to_csv('indeed_reviews_super_cleaned_all_nrc.csv')

                 id                                  title  \
0  1gk3c8hg5jcaj800                             Very mixed   
1  1ghbs1220je2s802  Lousy compensation. Lousy incentives.   
2  1ggf1cpl3k7r0800                        Keeps you busy!   
3  1ggb74f18k7r0802                                3 stars   
4  1gebd0km6mb24800                                  Awful   

                                                text  rating        reviewer  \
0  I've driven for uber for about 2 years and it'...       2     Uber Driver   
1  I think every Uber driver would agree that the...       2  Uber XL Driver   
2  I’ve been driving for Uber for 4 years now and...       4     Uber Driver   
3  What is the best part of working at the compan...       3          Driver   
4  Don’t do it you will wast your time and your c...       1          Driver   

        location                                               cons  \
0     Boston, MA                                                NaN   
1   