In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import re
from time import time
import math

In [2]:
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
# https://stackoverflow.com/questions/45969390/
# https://www.kaggle.com/ogrellier/kfold-or-stratifiedkfold


from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import datasets

# ===============
splits = 5

tx = list(range(-splits,splits))
ty = [0] * 5 + [1] * 5

print('tx:',tx)
print('ty:',ty)
print(splits, 'splits')
# ===============

print('''\nStratified :: 
This cross-validation object is a variation of KFold that returns stratified folds.
The folds are made by preserving the --percentage of samples for each class--.''')

print("\n=== KFold ===")
print('''In KFolds, each test set should not overlap, even with shuffle.
With KFolds and shuffle, the data is shuffled once at the start, 
and then divided into the number of desired splits. 
The test data is always one of the splits, the train data is the rest.\n''')
kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
for train_index, test_index in kfold.split(tx, ty):
    print("TRAIN:", train_index-splits, "TEST:", test_index-splits)

print("\n=== Shuffle Split ===")
print('''In ShuffleSplit, the data is shuffled every time, and then split.
This means the test sets may overlap between the splits.\n''')
shufflesplit = StratifiedShuffleSplit(n_splits=splits, random_state=42, test_size=2)
for train_index, test_index in shufflesplit.split(tx, ty):
    print("TRAIN:", train_index-splits, "TEST:", test_index-splits)
print('Note the overlap of the elements in the test sets for ShuffleSplit.')



tx: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4]
ty: [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
5 splits

Stratified :: 
This cross-validation object is a variation of KFold that returns stratified folds.
The folds are made by preserving the --percentage of samples for each class--.

=== KFold ===
In KFolds, each test set should not overlap, even with shuffle.
With KFolds and shuffle, the data is shuffled once at the start, 
and then divided into the number of desired splits. 
The test data is always one of the splits, the train data is the rest.

TRAIN: [-5 -3 -2 -1  0  2  3  4] TEST: [-4  1]
TRAIN: [-5 -4 -3 -2  0  1  2  3] TEST: [-1  4]
TRAIN: [-5 -4 -2 -1  0  1  3  4] TEST: [-3  2]
TRAIN: [-4 -3 -2 -1  1  2  3  4] TEST: [-5  0]
TRAIN: [-5 -4 -3 -1  0  1  2  4] TEST: [-2  3]

=== Shuffle Split ===
In ShuffleSplit, the data is shuffled every time, and then split.
This means the test sets may overlap between the splits.

TRAIN: [ 3 -1 -4 -5  1  0  2 -3] TEST: [-2  4]
TRAIN: [ 2 -5 -2  4 -1  0 -4  1] TEST:

In [3]:

def normalizeDict(d):
    total = sum(d.values(), 0.0)
    for key in d:
        d[key] /= total


target_tokens = set()

## email keywords : from 'enron sent' and common english

In [4]:

# -- token set : emails --

# NL (natural language) token set is build from: 
#  english commons and the 'enron-sent' ds commons.

# * 'Enron sent' is a special cleaned nl-courups of the enron corporate emails dataset
# * Styler, Will (2011). The EnronSent Corpus. Technical Report 01-2011, 
# * University of Colorado at Boulder Institute of Cognitive Science, Boulder, CO.


# english common
df_cmn = pd.read_csv('email_ds/common_english/eng_top_100_common.csv')
print('loaded df common words: {}'.format(df_cmn.shape))
#print(df_cmn.keys())
df_cmn['WORD'].str.lower()
eng_cmb_list = df_cmn['WORD'].tolist()
#print(eng_cmb_list,'\n\n')


# ernon sent email commons

N_TOP_WORDS_ERNON = 500

def processErnonCommon():
    
    print('processsing ernon sent for common words...')
    wordfreqs = {}
    ignorelist= ['Enron', 'enron', ]
    
    for i in range(44+1):
        filename = 'enronsent' + str(i).zfill(2)
        
        with open('email_ds/ernon/enronsent/'+filename) as text:
            print(filename,' ', end='')
            for line in text.readlines():
                for word in line.split():
                    word = word.lower()
                    if word not in ignorelist:
                        if word in wordfreqs:
                            wordfreqs[word] = wordfreqs[word] + 1;
                        else:
                            wordfreqs[word]=1;

    print('# unique words:',len(wordfreqs))
    wf_sorted = sorted(wordfreqs.items(), key=lambda x: x[1], reverse=True)
    #print(wf_sorted[:N_TOP_WORDS_ERNON],'\n\n')
    
    list = [x[0] for x in wf_sorted[:N_TOP_WORDS_ERNON]]
    #print(list)
    return list

ernon_cmn_list = processErnonCommon()


# merge common english and common 'ernon sent'
# merge keeps order of decending frequency, 
# and outputs only unique values

def mergeUniqueInorder(a,b):
    def addinorder(a,i,c,s):
        if a[i] not in s:
            #print('unq:',a[i],s)
            s.add(a[i])
            c.append(a[i])
    s = set()
    c = []
    # a shorter
    if len(a)>len(b):
        b, a = a, b
    for i in range(len(a)):
        addinorder(a,i,c,s)
        addinorder(b,i,c,s)
    for k in range(i+1 ,len(b)):
        addinorder(b,k,c,s)
    print('mergeUniqueInorder sanity check: ',len(c)==len(s))
    return c

# combine english commons and 'ernon sent' commons
email_tokens = mergeUniqueInorder(eng_cmb_list, ernon_cmn_list)
print(email_tokens)
print('total natural language tokens: ',len(email_tokens))

# add to target tokens
target_tokens.update(email_tokens)
print('\nupdated to complete dataset target tokens')

loaded df common words: (100, 3)
processsing ernon sent for common words...
enronsent00  enronsent01  enronsent02  enronsent03  enronsent04  enronsent05  enronsent06  enronsent07  enronsent08  enronsent09  enronsent10  enronsent11  enronsent12  enronsent13  enronsent14  enronsent15  enronsent16  enronsent17  enronsent18  enronsent19  enronsent20  enronsent21  enronsent22  enronsent23  enronsent24  enronsent25  enronsent26  enronsent27  enronsent28  enronsent29  enronsent30  enronsent31  enronsent32  enronsent33  enronsent34  enronsent35  enronsent36  enronsent37  enronsent38  enronsent39  enronsent40  enronsent41  enronsent42  enronsent43  enronsent44  # unique words: 427686
mergeUniqueInorder sanity check:  True
['the', 'be', 'to', 'and', 'of', 'a', 'i', 'in', 'that', 'you', 'have', 'for', 'I', 'is', 'it', 'on', 'not', 'this', 'we', 'with', 'he', 'as', 'will', 'do', '>', 'at', 'are', 'but', 'if', 'his', 'by', 'from', 'or', 'they', 'your', 'say', '-', 'her', 'please', 'she', 'me', 'an'

ValueError: dictionary update sequence element #0 has length 3; 2 is required

## -- codes keywords : from dedicated XML --

In [None]:

# -- read code tokens from XML --

# for C, this can be the language's keywords
#  language-specific keywords can be maintained in a local xml

import xml.etree.ElementTree as ET
import re


tree = ET.parse('src_code_ds/langs_xml.xml')
root = tree.getroot()

code_tokens = set()

# setup what kind of tokens, of which languages, to take from the xml
langs = ['C','JAVA']
lang_tags = ['keywords', 'types', 'headers']

# retrieve from XML
for lang in root.findall('./langs/lang'):
    name = lang.find('name').text
    if name not in langs:
        continue
    print('\n==== adding code tokens for:', name, '====')
    for tag in lang_tags:
        words = lang.find(tag).text
        #print('-- adding from tag :', tag, ':', words)
        if words:
            code_tokens.update(words.split())
    print('loaded {} code tokens.'.format(len(code_tokens)))
    #print(code_tokens)

    
# conform to code dataset tokenization
def bowConform(text): 
    pat = '<(?P<header>.+).h|c>' 
    m = re.search(pat, text)
    if m:
        text = m.group('header')
        print(' {}'.format(text),end='')
    return text


print('\n===> cleaning code tokens set...', end='')
code_tokens = {bowConform(v) for v in code_tokens}
    
print('\n\n===> final code tokens set:\n\n',code_tokens)

# add to target tokens
target_tokens.update(code_tokens)
print('\n===> updated to complete-dataset target-tokens')

In [None]:
print('total target keywords:',len(target_tokens))

## -- tokens : term frequency-inverse document frequency (TF-IDF) --


In [None]:
from collections import Counter

#display(df_codes['source'][340:350])
#src = ast.literal_eval(src) #import ast 

print('text null values check:\n====================')
display(df['text'].isnull().value_counts())

# collect selected tokens from all code corpus 
print('\ncounting target tokens across all dataset text:\n====================')
global_cnt = Counter()
tokens_col = []
entry_cnt = Counter()
i = 0
print('total entries processed: ')
for text in df['text']:
    # loop control
    i=i+1
    if i % 350 == 0:
        pass
        #break
    # extract entry tokens
    tokens = [tk for tk in text.split()]
    tokens_list = [tk for tk in tokens if (tk and tk in target_tokens)]
    #count entry tokens
    entry_cnt.clear()
    entry_cnt.update(tokens_list)
    # update entry tokencounts column
    #  tokens_col.append(entry_cnt.most_common(20))  # will later use 01 vectors
    # update to global frequency counter
    global_cnt.update(entry_cnt)
    # report
    if i % 10000 == 0:
        print(i,end='... ')
print('done!')

In [None]:
# -- decide most common and final code tokens --

# display

#print(global_cnt)

# counter to pd
c = pd.Series(global_cnt, name='counts')
c = c.sort_values(ascending=False)

# box plot to find outliers
fig = plt.figure(figsize=(16, 5), dpi=200), plt.rc("font", size=8)
c.plot(kind='box', vert=False, grid=True)
plt.xticks(list(range(0,c.max()+1,100000)))
plt.xticks(rotation='vertical')
plt.show()

# bar plot
print('Global target tokens frequencies:')
fig = plt.figure(figsize=(16, 5), dpi=200), plt.rc("font", size=8)

ax = c.plot(kind='bar')
plt.xticks(rotation='vertical')
plt.show()

# -- CODE BIN --

# as dataframe - not used
# c = pd.DataFrame(list(d.items()), columns=['token','count'])
# c = c.sort_values(by='count', ascending=False)
# c.set_index('token', inplace=True)
# ax = c['count'].plot(kind='bar')
# plt.xticks(rotation='vertical')
# plt.tight_layout()
# plt.show()

In [None]:

# normalization and cutting
token_freq = (c/c.sum()).sort_values(ascending=False)
#print(token_freq)
TARGET_TKNS_NUM_MAX = 50
TARGET_TKNS_BAR = token_freq.mean(axis='counts'), print('mean is:',TARGET_TKNS_BAR)

token_freq_common = token_freq.where(token_freq > TARGET_TKNS_BAR)
token_freq_common = token_freq_common[:TARGET_TKNS_NUM_MAX]

# bar plot
fig = plt.figure(figsize=(16, 5), dpi=200), plt.rc("font", size=8)
ax = c.plot(kind='bar')
plt.xticks(rotation='vertical')
plt.show()



In [None]:
# -- draw a nice code word cloud --


from wordcloud import WordCloud
from scipy.misc import imread

wc = WordCloud(background_color='white', mask=None, font_path='cs_regular.ttf', \
               width=1600, height=800)
wc.generate_from_frequencies(token_freq_common)


plt.figure(figsize=(6 * 2, 4 * 2))
plt.imshow(wc)
plt.axis('off')
plt.show()

In [None]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

# preprocessor: 
#  a callable that takes an entire document as input (as a single string), 
#  and returns a possibly transformed version of the document, still as an entire string. 
#  This can be used to remove HTML tags, lowercase the entire document, etc.

# tokenizer: 
#  a callable that takes the output from the preprocessor and splits it into tokens, 
#  then returns a list of these.


# analyzer: 
#  a callable that replaces the preprocessor and tokenizer. 
#  The default analyzers all call the preprocessor and tokenizer, 
#  but custom analyzers will skip this. 
#  N-gram extraction and stop word filtering take place at the analyzer level, 
#  so a custom analyzer may have to reproduce these steps.

