In [21]:
import nltk
import math
import string

from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import *

In [None]:
output_metric= {
                "Emissions":
                    ["emission", "co2 emission", "ghg emission", "emission reduction", "emission target", "Emissions Policy"],
                "Water":
                    ["Water Withdrawal", "Water Discharged", "Water Recycled", "water risk"],
                "Energy":
                    ["Energy Consumed", "Renewable Energy"],
                "Business Ethics":
                    ["Women Executives", "Women Board Members", "Board Members", "Committee Independence", "ESG Sustainability Reporting"],
                "Labor Practices":
                    ["Trade Union", "CEO Salary", "Average Salary", "Employee Turnover", "Avg Training Hours"],
                "Employee Engagement, Diversity & Inclusion":
                    ["Women Employees", "Women Managers", "Minority Employees"],
                "Employee Health & Safety":
                    ["Lost Time", "Injury Rate"],
                "Waste":
                    ["Waste"]
                }

In [22]:
text1 = "Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof."

text2 = "The Georgetown experiment in 1954 involved fully automatic translation of more than sixty Russian sentences into English. The authors claimed that within three or five years, machine translation would be a solved problem.[2] However, real progress was much slower, and after the ALPAC report in 1966, which found that ten-year-long research had failed to fulfill the expectations, funding for machine translation was dramatically reduced. Little further research in machine translation was conducted until the late 1980s, when the first statistical machine translation systems were developed."

text3 = "During the 1970s, many programmers began to write conceptual ontologies, which structured real-world information into computer-understandable data. Examples are MARGIE (Schank, 1975), SAM (Cullingford, 1978), PAM (Wilensky, 1978), TaleSpin (Meehan, 1976), QUALM (Lehnert, 1977), Politics (Carbonell, 1979), and Plot Units (Lehnert 1981). During this time, many chatterbots were written including PARRY, Racter, and Jabberwacky。"

In [23]:
def get_tokens(text):
    lower = text.lower()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    no_punctuation = lower.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)

    return tokens

In [24]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))

    return stemmed

In [25]:
def tf(word, count):
    return count[word] / sum(count.values())
def n_containing(word, count_list):
    return sum(1 for count in count_list if word in count)
def idf(word, count_list):
    return math.log(len(count_list)) / (1 + n_containing(word, count_list))
def tfidf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)

In [26]:
def count_term(text):
    tokens = get_tokens(text)
    filtered = [w for w in tokens if not w in stopwords.words('english')]
    stemmer = PorterStemmer()
    stemmed = stem_tokens(filtered, stemmer)
    count = Counter(stemmed)
    return count

def main():
    texts = [text1, text2, text3]
    countlist = []
    for text in texts:
        countlist.append(count_term(text))
    for i, count in enumerate(countlist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, count, countlist) for word in count}
        sorted_words = sorted(scores.items(), key = lambda x: x[1], reverse=True)
        for word, score in sorted_words[:5]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

if __name__ == "__main__":
    main()

Top words in document 1
	Word: languag, TF-IDF: 0.07121
	Word: natur, TF-IDF: 0.06103
	Word: comput, TF-IDF: 0.04069
	Word: process, TF-IDF: 0.03052
	Word: concern, TF-IDF: 0.02034
Top words in document 2
	Word: translat, TF-IDF: 0.05086
	Word: machin, TF-IDF: 0.02713
	Word: research, TF-IDF: 0.02034
	Word: georgetown, TF-IDF: 0.01017
	Word: experi, TF-IDF: 0.01017
Top words in document 3
	Word: mani, TF-IDF: 0.02555
	Word: 1978, TF-IDF: 0.02555
	Word: lehnert, TF-IDF: 0.02555
	Word: 1970, TF-IDF: 0.01277
	Word: programm, TF-IDF: 0.01277
