# In [1]:
# this code is taken from the following website
# http://nlpforhackers.io/sentiment-analysis-intro/
# this code uses the unigram approach for SentiWordNet
# based off of this code we wrote a bigram model for SentiWordNet

import pandas as pd

# Load the tab-separated corpus; the file has no header row, so the four
# column names are assigned explicitly below.
data = pd.read_csv("citation sentiment.txt", header=None, delimiter="\t")
data.columns = ['Source', 'target', 'sentiment', 'sentence']


# NOTE(review): an earlier comment here said "25000 movie reviews"; that looks
# like a leftover from the tutorial this script was adapted from. The data
# loaded above comes from "citation sentiment.txt".
import random
# Fixed seed so the shuffle below is reproducible from run to run.
random.seed(5)

# (sentence, sentiment) pairs in shuffled order.
sentiment_data = list(zip(data['sentence'], data['sentiment']))
random.shuffle(sentiment_data)


# X and Y are taken straight from `data`, i.e. in file order; the shuffle
# above only affects `sentiment_data`.
X, Y = data['sentence'], data['sentiment']

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

# Single shared lemmatizer instance; swn_polarity() uses it to lemmatize each
# token before the WordNet synset lookup.
lemmatizer = WordNetLemmatizer()


def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'N' -> noun, 'R' -> adverb, 'V' -> verb. Any other tag yields None.
    """
    prefix_to_pos = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, pos_name in prefix_to_pos:
        if tag.startswith(prefix):
            # Resolve the constant on `wn` only once a prefix has matched.
            return getattr(wn, pos_name)
    return None


def clean_text(text):
    """Return ``text`` with every HTML line break ``<br />`` turned into a space."""
    pieces = text.split("<br />")
    return " ".join(pieces)


from collections import Counter

# Corpus-wide token counts: every sentence is cleaned and tokenized, and each
# resulting token is tallied (the sentiment label of each pair is ignored).
word_frequencies = Counter(
    token
    for sentence_text, _label in sentiment_data
    for token in word_tokenize(clean_text(sentence_text))
)

def swn_polarity(text, freq_threshold=100):
    """
    Return a numeric SentiWordNet polarity score for ``text``.

    The score is the sum, over every scored token, of
    ``pos_score() - neg_score()`` for the first synset WordNet returns for
    the token's lemma. A result above zero leans positive, below zero leans
    negative, and 0.0 means neutral or that nothing could be scored.

    Only tokens tagged as nouns, adjectives or adverbs are considered.
    Tokens whose count in the module-level ``word_frequencies`` is greater
    than ``freq_threshold`` are skipped.

    Parameters:
        text: the text to score. A value that is not a ``str`` (for example
            None, or a NaN standing in for a missing cell) scores 0.0.
        freq_threshold: highest corpus frequency a token may have and still
            be scored. Defaults to 100.

    Returns:
        float: the accumulated polarity score (not a 'p'/'n'/'o' label).
    """
    # Nothing to tokenize when the input is not text; treat it as neutral
    # instead of failing inside clean_text().
    if not isinstance(text, str):
        return 0.0

    sentiment = 0.0

    for raw_sentence in sent_tokenize(clean_text(text)):
        for word, tag in pos_tag(word_tokenize(raw_sentence)):
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            # Cheap frequency filter first, so frequent words never reach
            # the lemmatizer or the WordNet lookups.
            if word_frequencies[word] > freq_threshold:
                continue

            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue

            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue

            # Use only the first sense returned for the lemma.
            swn_synset = swn.senti_synset(synsets[0].name())
            sentiment += swn_synset.pos_score() - swn_synset.neg_score()

    return sentiment

# NOTE(review): an earlier comment here said shuffling gives different
# results each run. In the code visible here the RNG is seeded and X is taken
# from `data` in file order, so the shuffle does not reorder these scores.
# NOTE(review): accuracy_score is imported but not used in this cell.
from sklearn.metrics import accuracy_score

# One polarity score per sentence in X.
pred_y = [swn_polarity(text) for text in X]

# Store the scores next to the original rows and export everything to Excel.
# NOTE(review): the output path is absolute and machine-specific; adjust it
# before running on another machine.
data['p_score'] = pred_y
data.to_excel('C:/Users/HP/Desktop/SentiWordNet_Citation.xlsx')