In [1]:
# installation, uncomment to run the first time
# !pip install python-rake

In [2]:
import RAKE
import operator 
import re
from typing import List
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
import csv

import warnings
pd.options.mode.chained_assignment = None

In [3]:
# run RAKE algorithm: https://github.com/fabianvf/python-rake
# rake.run(<text>, minCharacters = 1, maxWords = 5, minFrequency = 1)
def get_keywords(text, rake_object, text_col): 
    """Run RAKE over every row of one text column.

    Parameters
    ----------
    text : pandas.DataFrame
        Frame holding the documents.
    rake_object : object
        Keyword extractor; only its ``run(document, maxWords=30)`` method is used.
    text_col : str
        Name of the column in ``text`` that contains the documents.

    Returns
    -------
    list
        One entry per row, in row order: whatever ``rake_object.run`` returned
        for that document, truncated to its first 3000 items.
    """
    keywords = []
    # Walk the column's values positionally instead of looking rows up by the
    # labels 0..n-1, so frames with a filtered or non-default index also work.
    # For a quick trial run, pass a shortened frame such as text.head(10).
    for i, document in enumerate(text[text_col]):
        kw = rake_object.run(document, maxWords = 30)
        # cap the number of keywords kept per document
        keywords.append(kw[:3000])
        # print progress
        if i % 1000 == 0:
            print(i)
    return keywords

def rake(df, text_col, stop_dir):
    """Extract RAKE keywords for every row and attach them to ``df``.

    Adds three columns to ``df`` in place and returns the same frame:

    * ``kwdf``   - list of keyword strings for the row (first item of every
      pair produced by ``get_keywords``),
    * ``score``  - second item of the row's first keyword pair, or NaN when
      the row produced no keywords,
    * ``length`` - ``len()`` of the row's text.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is modified in place.
    text_col : str
        Name of the column holding the text to analyse.
    stop_dir : str
        Stop-word source handed unchanged to ``RAKE.Rake``. An empty string
        (or ``None``) selects ``RAKE.SmartStopList()`` instead.
    """
    # check if user provided stopword directory
    if stop_dir is not None and stop_dir != "":
        rake_object = RAKE.Rake(stop_dir)
    else:
        rake_object = RAKE.Rake(RAKE.SmartStopList())

    kw = get_keywords(df, rake_object, text_col)

    # Keep only the keyword text of each (keyword, score) pair.
    df["kwdf"] = [[pair[0] for pair in pairs] for pairs in kw]
    # A document can yield no keywords at all; indexing pairs[0] would then
    # raise IndexError, so record NaN for those rows.
    df["score"] = [pairs[0][1] if pairs else float("nan") for pairs in kw]
    df["length"] = [len(document) for document in df[text_col]]

    return df

In [4]:
# HELPER FUNCTIONS FOR hl
# set number of keywords
def get_kwdf_len(df):
    """Return, per row, how many keywords should be highlighted.

    Rows whose ``length`` is below 10000 get one keyword per started block of
    200 (``ceil(length / 200)``); every other row gets a flat 100.
    The result is a plain list of ints in row order.
    """
    return [
        int(np.ceil(size / 200)) if size < 10000 else 100
        for size in df["length"]
    ]

# add markers to highlight keywords
def highlight(text, kwds):
    """Wrap every whole-word, case-insensitive occurrence of a keyword in <mark> tags.

    Parameters
    ----------
    text : str
        Text to annotate.
    kwds : list of str
        Keywords to highlight; they are matched literally.

    Returns
    -------
    str
        ``text`` with each match surrounded by ``<mark>...</mark>``. The text
        is returned unchanged when there is no usable keyword.
    """
    # Blank keywords are dropped: an empty alternative matches at every word
    # boundary and a whitespace-only one marks the gaps between words.
    # re.escape keeps characters such as "." or "?" from acting as regex syntax.
    literals = [re.escape(kwd) for kwd in kwds if kwd.strip()]
    if not literals:
        return text
    hl_str = r"\b(?:" + "|".join(literals) + r")\b"
    # Raw replacement string so that \g<0> (the whole match) is not an invalid
    # Python escape sequence. Other markers (ANSI colour codes, a styled
    # <span>) can be substituted here if a different output format is needed.
    return re.sub(hl_str, r"<mark>\g<0></mark>", text, flags = re.I)

# escape characters that will affect regex 
def escape(text):
    """Strip characters that would otherwise act as regex syntax.

    * a literal ``(s)`` plural marker is removed entirely,
    * ``-`` becomes a space,
    * ``^ + * ( ) [ ] { }`` are deleted.

    Returns the cleaned string.
    """
    # "(s)" has to go first: once the parentheses are stripped only a bare "s"
    # is left behind and "keyword(s)" would turn into "keywords".
    text = re.sub(r"\(s\)", "", text)
    text = text.replace("-", " ")
    return re.sub(r"[\^+*()\[\]{}]", "", text)

# helper function for format_tbl
def format_tag(txt, row = "", tag = "td"):
    """Wrap ``txt`` in an HTML cell tag, optionally opening or closing a table row.

    Parameters
    ----------
    txt : str
        Cell content.
    row : str
        ``"first"`` prefixes ``<tr>``, ``"last"`` appends ``</tr>``; any other
        value adds only the cell tags.
    tag : str
        Cell tag name, ``"td"`` by default.

    Returns
    -------
    str
        ``txt`` surrounded by the opening and closing tags.
    """
    opening = "<" + tag + ">"
    closing = "</" + tag + ">"
    if row == "first":
        opening = "<tr>" + opening
    if row == "last":
        closing = closing + "</tr>"
    # Plain concatenation rather than re.sub("$", ...): "$" also matches just
    # before a trailing newline, which duplicated the closing tag.
    return opening + txt + closing

# add html tags to each line of the table
def format_tbl(df):
    """Turn every cell of ``df`` into an HTML table cell, one ``<tr>`` per row.

    The first column's cells open the row, the last column's cells close it
    and all cells are wrapped with ``format_tag``. ``df`` is modified in place
    and returned; its cells must already be strings.
    """
    labels = list(df.columns)
    last = len(labels) - 1
    for pos, label in enumerate(labels):
        if pos == 0:
            cells = df[label].apply(format_tag, row = "first")
            if last == 0:
                # A single column is both first and last, so the row has to be
                # closed here as well.
                cells = cells.map(lambda cell: cell + "</tr>")
        elif pos == last:
            cells = df[label].apply(format_tag, row = "last")
        else:
            cells = df[label].apply(format_tag)
        df[label] = cells
    return df 

In [5]:
# highlight keywords and format text as html 
def hl(df, cols, text_col = "text"):
    """Highlight each row's top keywords and return the chosen columns as HTML cells.

    Side effects on ``df`` (modified in place):

    * ``kwdf`` is replaced by the string form of the keywords that were
      actually highlighted (escaped and cut to the per-row limit from
      ``get_kwdf_len``),
    * ``hl_text`` is added with the highlighted text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the text column plus the ``kwdf`` and ``length`` columns.
    cols : list
        Columns to include in the result (a single column name is accepted too).
    text_col : str
        Column holding the text to highlight; defaults to ``"text"``.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only ``cols``, every cell converted to ``str``
        and wrapped in table tags by ``format_tbl``.
    """
    kwdf_len = get_kwdf_len(df)
    hl_text = []
    shown_kwds = []
    for i in range(len(df)):
        # highlight
        text = escape(df[text_col].iloc[i])
        kwds = [escape(x) for x in df["kwdf"].iloc[i][:kwdf_len[i]]]
        hl_text.append(highlight(text, kwds))
        # shorten kwdf based on # of kwds highlighted
        shown_kwds.append(str(kwds))
    # Assign whole columns at once; writing through df["kwdf"].iloc[i] is
    # chained assignment and is not guaranteed to reach the frame.
    df["kwdf"] = shown_kwds
    df["hl_text"] = hl_text

    if isinstance(cols, str):
        cols = [cols]
    # Select the output columns BEFORE adding row tags, so that <tr> and </tr>
    # end up on the first and last column that is actually written out.
    selected = df[cols].apply(lambda column: column.map(str))
    return format_tbl(selected)

In [19]:
# save dataframe to html 
def get_header(df):
    """Build the HTML header row for ``df``.

    Every column label is upper-cased and wrapped in ``<th>`` via
    ``format_tag``; the cells are enclosed in a single ``<tr>...</tr>``.
    """
    # str() first: column labels are not necessarily strings (e.g. integers).
    cells = [format_tag(str(label).upper(), tag = "th") for label in df.columns.values]
    return "<tr>" + "".join(cells) + "</tr>"

def save_html(df, filename, css_file = "hl.css"):
    """Write ``df`` as an HTML table, plus a tab-separated copy of its cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells already contain the ``<tr>``/``<td>`` markup.
    filename : str
        Output path; ``.html`` is appended when missing. The tab-separated
        copy is written to ``filename + ".csv"`` (using the name as given).
    css_file : str
        Stylesheet path placed in the page's ``<link>`` element.

    Prints the final HTML path when done; returns ``None``.
    """
    # save csv
    csv_file = filename + ".csv"
    df.to_csv(csv_file, index = False, header = False, sep="\t", quoting=csv.QUOTE_NONE, quotechar="",  escapechar="\\")

    def _cell(value):
        # Mirror to_csv's default of writing missing values as empty text.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # Build the table body straight from the frame. Reading the CSV back would
    # pull in the backslashes the csv writer inserts (QUOTE_NONE + escapechar)
    # in front of tabs, backslashes and line breaks inside a cell.
    body_txt = "".join(
        "\t".join(_cell(value) for value in row) + "\n"
        for row in df.itertuples(index = False, name = None)
    )

    # add header and tail of html document
    head_txt = '<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><title> data</title><link href="{css_file}" rel="stylesheet" type="text/css"></head><body><table>'.format(css_file = css_file) + get_header(df)
    tail_txt = '</table></body></html>'
    html_txt = head_txt + body_txt + tail_txt

    # create, write, and close html file
    if not filename.endswith(".html"):
        filename = filename + ".html"

    # The page declares charset utf-8, so write it as utf-8 regardless of the
    # platform default; the with-block closes the file even if writing fails.
    with open(filename, "w", encoding = "utf-8") as html_f:
        html_f.write(html_txt)

    print("saved as " + filename)
    

In [7]:
# df -> uploaded dataset (pandas DataFrame)
# text_col -> name of the column containing the text you want to analyze and highlight; pass it as a string
# filename -> desired html file path 
# stopwords -> optional, pass in a text file containing a list of stopwords; otherwise the RAKE-provided SmartStopList() is used, please see the RAKE documentation
# cols -> columns to be included in the output, make sure to pass in as a list, e.g. ["hl_text", "kwdf"] (highlighted text, keywords)
# css_file -> desired css file path, default is provided in repo

def run(df, text_col = "text", filename = "kw_hl.html", stopwords = "", cols = None, css_file = "hl.css"):
    """Extract keywords, highlight them and save the result as an HTML table.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; ``rake`` adds its result columns to this frame.
    text_col : str
        Name of the column holding the text to analyse and highlight.
    filename : str
        Path of the HTML file to write.
    stopwords : str
        Stop-word source forwarded to ``rake``; ``""`` uses its default list.
    cols : list, optional
        Columns to include in the output. ``None`` selects
        ``["hl_text", "kwdf"]``; an empty list selects the columns ``df`` has
        when ``run`` is called.
    css_file : str
        Stylesheet path referenced by the generated page.

    Returns
    -------
    pandas.DataFrame
        The formatted frame that was written to ``filename``.
    """
    if cols is None:
        # The keyword column created by rake() is named "kwdf".
        cols = ["hl_text", "kwdf"]
    elif len(cols) == 0:
        cols = list(df.columns)
    kw_df = rake(df, stop_dir = stopwords, text_col = text_col)
    # hl() is called without a text-column argument and reads the column named
    # "text"; expose the chosen column under that name so the highlighted text
    # is the same text the keywords were extracted from.
    if text_col != "text":
        kw_df = kw_df.assign(text = kw_df[text_col])
    kw_hl = hl(kw_df, cols)
    save_html(kw_hl, filename, css_file = css_file)
    return kw_hl

In [9]:
# Example usage: point `path` at your CSV file, e.g. path = "path/to/your/data.csv".
# The text column must not contain NAs: rake() takes len() of every entry in it.

path = ""
# df = pd.read_csv(path)
# run(df = df, text_col = "text", filename = "kw_hl.html", stopwords = "", cols = ["hl_text", "kwdf"], css_file = "hl.css")