In [1]:
import pandas as pd, textacy, pickle, re, spacy, flashtext, contractions, unicodedata
from dateutil import parser
from collections import Counter
from textacy import preprocessing, ke, vsm, tm, text_utils, Corpus, doc
from flashtext import KeywordProcessor

In [2]:
def striplist(text):
    """Return a new list with surrounding whitespace stripped from each item.

    ``text`` is any iterable whose items provide ``.strip()`` (e.g. strings).
    """
    stripped = []
    for entry in text:
        stripped.append(entry.strip())
    return stripped

def strip_accents(s):
    """Remove combining accent marks from ``s`` (for example ``ü`` becomes ``u``).

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is ``Mn`` (non-spacing mark) is discarded.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

def rem_nonasc(text):
    """Replace every non-ASCII character with a space and trim each entry.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned entries. Values that come back as ``float`` (the NaN
        pandas uses for missing text) are dropped, so the list can be shorter
        than ``text``.
    """
    regex = r'[^\x00-\x7f]'
    # regex=True must be explicit: pandas >= 2.0 defaults Series.str.replace
    # to a literal match, which would silently leave the text untouched.
    w = text.str.replace(regex, ' ', regex=True).str.strip()
    return [i for i in w if type(i) is not float]

def rem_jpg(text):
    """Replace image file names (``.jpg``, ``.png``, ``.JPG``, ``.jpeg``) with a space.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned, stripped entries. Values that come back as ``float``
        (NaN for missing text) are dropped.
    """
    regex = r'[a-zA-Z0-9_.+-]+\.(jpg|png|JPG|jpeg)'
    # regex=True must be explicit: pandas >= 2.0 defaults Series.str.replace
    # to a literal match, so the pattern would never fire.
    w = text.str.replace(regex, ' ', regex=True).str.strip()
    return [i for i in w if type(i) is not float]

def rem_html(text):
    """Replace spans that start at a ``/`` and end at the next ``html`` with a space.

    The match is lazy, so it stops at the first ``html`` that follows the
    slash (e.g. ``/news/page.html``).

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned, stripped entries. Values that come back as ``float``
        (NaN for missing text) are dropped.
    """
    regex = r'(?=/).*?(?<=html)'
    # regex=True must be explicit: pandas >= 2.0 defaults Series.str.replace
    # to a literal match, so the pattern would never fire.
    w = text.str.replace(regex, ' ', regex=True).str.strip()
    return [i for i in w if type(i) is not float]

def remove_nonlatin(text):
    """Keep only characters whose Unicode name starts with LATIN, DIGIT or SPACE.

    Parameters
    ----------
    text : str
        The string to filter.

    Returns
    -------
    str
        ``text`` with every other character removed. Punctuation and
        characters without a Unicode name (tabs, newlines, other control
        characters) are removed as well.
    """
    allowed_prefixes = ('LATIN', 'DIGIT', 'SPACE')
    # unicodedata.name() raises ValueError for unnamed characters unless a
    # default is supplied; '' never matches a prefix, so those are dropped.
    kept = (ch for ch in text
            if unicodedata.name(ch, '').startswith(allowed_prefixes))
    # Join the filtered characters (the original joined the unfiltered input).
    return ''.join(kept)

def rem_html_tags(text):
    """Replace every ``<...>`` tag with a space and trim each entry.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned, stripped entries. Values that come back as ``float``
        (NaN for missing text) are dropped.
    """
    regex = r'<.*?>'
    # regex=True must be explicit: pandas >= 2.0 defaults Series.str.replace
    # to a literal match, so the pattern would never fire.
    w = text.str.replace(regex, ' ', regex=True).str.strip()
    return [i for i in w if type(i) is not float]

def rem_http(text):
    """Replace ``http://`` and ``https://`` URLs with a space and trim each entry.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned, stripped entries. Values that come back as ``float``
        (NaN for missing text) are dropped.
    """
    regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # regex=True must be explicit: pandas >= 2.0 defaults Series.str.replace
    # to a literal match, so the pattern would never fire.
    w = text.str.replace(regex, ' ', regex=True).str.strip()
    return [i for i in w if type(i) is not float]

In [7]:
# Replace GB words with US

# 'gb2us.csv' is resolved relative to the working directory; point this at the
# real location of the file if it lives elsewhere.
# read_csv(squeeze=True) was removed in pandas 2.0; squeezing the single data
# column afterwards yields the same Series (first CSV column as index).
gb2us = (
    pd.read_csv('gb2us.csv', header=None, index_col=0, encoding='utf8')
    .squeeze("columns")
    .to_dict()
)
gb2us_processor = KeywordProcessor(case_sensitive=False)
# One (key, [value]) pair per CSV row, in the shape add_keywords_from_dict takes.
keyword_dict = [(str(k), [v]) for k, v in gb2us.items()]
gb2us_dict = dict(keyword_dict)
# NOTE(review): flashtext appears to treat each dict key as the replacement and
# each list item as the term to search for - confirm that the CSV column order
# really maps GB spellings to US spellings and not the other way round.
gb2us_processor.add_keywords_from_dict(gb2us_dict)

In [4]:
## Customized Cleaning

def findBet(sw, ew, text):
    """Collect every span that starts with ``sw`` and ends with ``ew``.

    ``sw`` and ``ew`` are inserted into the pattern unescaped, so they are
    interpreted as regular-expression fragments. ``text`` is a pandas Series;
    entries for which ``findall`` yields a ``float`` (NaN) are skipped.

    Returns a flat list of all matches across the Series, in order.
    """
    pattern = r"(?=" + sw + ").*?(?<=" + ew + ")"
    found = []
    for hits in text.str.findall(pattern):
        if type(hits) is float:
            continue
        found.extend(hits)
    return found

def delBet(sw, ew, text):
    """Replace every span that starts with ``sw`` and ends with ``ew`` with a space.

    Parameters
    ----------
    sw, ew : str
        Start and end markers. They are inserted into the pattern unescaped,
        so they are interpreted as regular-expression fragments.
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned, stripped entries. Values that come back as ``float``
        (NaN for missing text) are dropped.
    """
    regex = r"(?=" + sw + ").*?(?<=" + ew + ")"
    # regex=True must be explicit: pandas >= 2.0 defaults Series.str.replace
    # to a literal match, so the pattern would never fire.
    w = text.str.replace(regex, ' ', regex=True).str.strip()
    return [i for i in w if type(i) is not float]

def kwic(kw, wd, dataframe):
    """Run ``text_utils.KWIC`` for keyword ``kw`` on every item of ``dataframe``.

    ``wd`` is forwarded as ``window_width``. The value returned by ``KWIC`` is
    discarded and this function returns ``None``; presumably ``KWIC`` prints
    its matches - confirm against the installed textacy version.
    """
    # TODO (translated from Turkish): rework this so that it covers sw and ew.
    for document in dataframe:
        text_utils.KWIC(document, kw, window_width=wd)

def listEmail(text):
    """Count the e-mail addresses found in a Series of strings.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list of (str, int)
        ``(address, occurrences)`` pairs, most frequent first. Addresses are
        reported exactly as written in the text.
    """
    regex = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
    # The character classes only list lower-case letters; without IGNORECASE
    # an address such as John@Example.COM is truncated or missed entirely.
    w = text.str.findall(regex, flags=re.IGNORECASE)
    # Entries without text come back as float NaN and are skipped.
    zz = [i for i in w if type(i) is not float]
    dct = dict(Counter([item for sublist in zz for item in sublist]))
    return [(k, dct[k]) for k in sorted(dct, key=dct.get, reverse=True)]

def delEmail(text):
    """Replace e-mail addresses with a space and trim each entry.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned, stripped entries. Values that come back as ``float``
        (NaN for missing text) are dropped.
    """
    regex = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
    # regex=True must be explicit (pandas >= 2.0 defaults to a literal match).
    # IGNORECASE is needed because the classes only list lower-case letters,
    # which would otherwise leave fragments of mixed-case addresses behind.
    w = text.str.replace(regex, ' ', flags=re.IGNORECASE, regex=True).str.strip()
    return [i for i in w if type(i) is not float]

def listTwt(text):
    """Count the Twitter-style ``@mentions`` and ``#hashtags`` in a Series of strings.

    A token is only recognised at the start of the text or directly after
    whitespace, which keeps the ``@domain`` part of e-mail addresses out.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list of (str, int)
        ``(token, occurrences)`` pairs, most frequent first.
    """
    # A look-behind replaces the former leading ``\s``: the whitespace is no
    # longer part of the token (so ' #AI' and '\n#AI' count as one key) and
    # tokens at the very start of the text are found too. ``[@#]`` drops the
    # stray ``|`` that the old class ``[@|#]`` accepted as a sigil.
    regex = r'(?<!\S)[@#][a-zA-Z0-9_.+-]+'
    w = text.str.findall(regex)
    # Entries without text come back as float NaN and are skipped.
    zz = [i for i in w if type(i) is not float]
    dct = dict(Counter([item for sublist in zz for item in sublist]))
    return [(k, dct[k]) for k in sorted(dct, key=dct.get, reverse=True)]

def delTwt(text):
    """Replace ``@mentions`` and ``#hashtags`` with a space, except a fixed keep-list.

    Tokens that begin with one of the names in ``twtskeep`` are left in place.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned, stripped entries. Values that come back as ``float``
        (NaN for missing text) are dropped.
    """
    twtskeep = "(?!@TicBot|#BeBoldForChange|#MeToo|@nightmare_machine|#TalkingTech|#AlphaGo|@privacyproject)"
    # ``[@#]`` instead of ``[@|#]``: inside a character class ``|`` is a
    # literal, so words following a pipe were deleted as well.
    regex = twtskeep + "[@#][a-zA-Z0-9_.+-]+"
    # regex=True must be explicit: pandas >= 2.0 defaults Series.str.replace
    # to a literal match, so the pattern would never fire.
    w = text.str.replace(regex, ' ', regex=True).str.strip()
    return [i for i in w if type(i) is not float]

def listDom(text):
    """Count the ``.com`` / ``.net`` / ``.org`` / ``.info`` domain names in a Series of strings.

    An optional two- or three-letter suffix (``example.com.tr``) is included
    in the match.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list of (str, int)
        ``(domain, occurrences)`` pairs, most frequent first.
    """
    # The TLD list and the optional suffix must be groups; written as
    # character classes ([com|net|org|info]+) they matched any mix of those
    # letters, e.g. 'report.format'. Groups are non-capturing so that findall
    # returns the whole match.
    regex = r'\b\w+\.(?:com|net|org|info)(?:\.[a-z]{2,3})?\b'
    w = text.str.findall(regex)
    # Entries without text come back as float NaN and are skipped.
    txt = [i for i in w if type(i) is not float]
    dct = dict(Counter([item for sublist in txt for item in sublist]))
    return [(k, dct[k]) for k in sorted(dct, key=dct.get, reverse=True)]

def delDom(text):
    """Replace ``.com`` / ``.net`` / ``.org`` / ``.info`` domain names with a space.

    Domains whose name is one of the words in ``webkeep`` (matched
    case-sensitively) are left in place.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through its ``.str`` accessor.

    Returns
    -------
    list
        The cleaned, stripped entries. Values that come back as ``float``
        (NaN for missing text) are dropped.
    """
    webkeep = r"(?!\bAmazon\b|\bJD\b|\bSalesforce\b|\bsohu\b|\bGoogle\b|\bfacebook\b|\bthreadinmotion\b|\bmoodnode\b|\blinkedin\b|\baccenture\b)"
    # The TLD list and the optional suffix must be groups; written as
    # character classes ([com|net|org|info]+) they matched any mix of those
    # letters, e.g. 'report.format'.
    regex = webkeep + r"\b\w+\.(?:com|net|org|info)(?:\.[a-z]{2,3})?\b"
    # regex=True must be explicit: pandas >= 2.0 defaults Series.str.replace
    # to a literal match, so the pattern would never fire.
    w = text.str.replace(regex, ' ', regex=True).str.strip()
    return [i for i in w if type(i) is not float]

def cleanup(dates):
    """Lazily parse date strings, skipping the ones that cannot be parsed.

    Parameters
    ----------
    dates : iterable
        Values handed to ``dateutil.parser.parse`` with ``dayfirst=True``.

    Yields
    ------
    The parsed value for every entry that parses successfully. Entries that
    fail are reported with ``print`` and skipped, so the output can contain
    fewer items than ``dates``.
    """
    for date in dates:
        try:
            parsed = parser.parse(date, dayfirst=True)
        # OverflowError is raised for dates beyond the platform's integer
        # range; treat it like any other unparseable entry instead of
        # aborting the whole generator.
        except (ValueError, TypeError, OverflowError) as e:
            print(" {}".format(e))
        else:
            # Yield outside the try so exceptions thrown into the generator
            # by the consumer are not swallowed by the handler above.
            yield parsed


In [None]:
## Viterbi algorithm for segmenting accidentally merged words

# Location of the pickled word list. This is an absolute, machine-specific
# path; change it here when running the notebook elsewhere.
GWDICT_PATH = "C:/Users/asuer/Documents/BAP/text_analysis/gwdict.txt"

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load this file if it comes from a trusted source.
with open(GWDICT_PATH, "rb") as fp:  # Unpickling
    dic = pickle.load(fp)

def viterbi_segment(tx):
    """Split ``tx`` into the most probable sequence of words.

    Dynamic programme over prefixes of ``tx``: for every end position the
    best-scoring last word is chosen among candidates of at most
    ``max_word_length`` characters, scored as the product of ``word_prob``
    values. Relies on the module-level ``word_prob`` and ``max_word_length``.

    Returns a tuple ``(words, probability)`` where ``words`` is the list of
    segments in order and ``probability`` is the score of that segmentation.
    """
    length = len(tx)
    best_prob = [1.0]   # best_prob[i]: score of the best split of tx[:i]
    best_start = [0]    # best_start[i]: start index of the last word in that split

    for end in range(1, length + 1):
        earliest = max(0, end - max_word_length)
        candidates = [
            (best_prob[start] * word_prob(tx[start:end]), start)
            for start in range(earliest, end)
        ]
        # Tuples compare by score first, then by start index.
        top_prob, top_start = max(candidates)
        best_prob.append(top_prob)
        best_start.append(top_start)

    # Walk the back-pointers from the end of the text to recover the words.
    segments = []
    end = length
    while end > 0:
        start = best_start[end]
        segments.append(tx[start:end])
        end = start
    segments.reverse()
    return segments, best_prob[-1]

In [5]:
def word_prob(word):
    """Return the count of ``word`` in the module-level ``dictionary`` divided by ``total``."""
    occurrences = dictionary[word]
    return occurrences / total

def words(tx):
    """Return every run of the letters a-z in ``tx`` after lower-casing, in order."""
    lowered = tx.lower()
    letter_runs = re.compile('[a-z]+')
    return letter_runs.findall(lowered)

In [None]:
# Build the word statistics that word_prob and viterbi_segment read.
dic = striplist(dic)  # trim surrounding whitespace from every loaded entry
dictionary = Counter(dic)  # entry -> number of occurrences in the list
max_word_length = max(len(entry) for entry in dictionary)  # longest distinct entry
total = float(sum(dictionary.values()))  # sum of all counts, as a float