In [1]:
import numpy as np
import pandas as pd
import random
import string
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.optimize import minimize
import math
import json
from nltk import tokenize
import collections
import re
import itertools
import nltk
from scipy.stats import mannwhitneyu

import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel,\
        GenericLikelihoodModelResults

from statsmodels.nonparametric.smoothers_lowess import lowess

from scipy.special import zeta
from scipy.stats import binom

import pickle
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

lg = np.log10

In [164]:
web_train = pd.read_csv (r'data/webtext.train.csv')
web_train = web_train.loc[web_train['length']>= 1000]
web_train = web_train.dropna()
web_train = web_train['text'].tolist()

In [166]:
gpt_train = pd.read_csv (r'data/small-117M-k40.train.csv')
gpt_train = gpt_train.loc[gpt_train['length']>= 1000]
gpt_train = gpt_train.dropna()
gpt_train = gpt_train['text'].tolist()

In [2]:
web_train_20000 = pickle.load(open("web_train_20000.p", "rb" ))

In [3]:
gpt_train_20000 = pickle.load(open("gpt_train_20000.p", "rb" ))

In [105]:
set_GPT = open("data/GPT_set.txt", "r").read()
set_GPT = set_GPT.split("</doc>")

In [106]:
gpt_set = [make_file(corpus, multi=False, pos=True) for corpus in set_GPT]

In [107]:
set_human = open("data/GPT_set.txt", "r").read()
set_human = set_human.split("</doc>")

In [108]:
human_set = [make_file(corpus, multi=False, pos=True) for corpus in set_human]

# Pre-processing

In [61]:
# Pre-processing without part of speech tags
def remove_punctuation(text):
    text = text.lower()
    chars_to_remove = "[\n]!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
    tr = str.maketrans(" ", " ", chars_to_remove)
    return text.translate(tr)


def preprocess(corpus, sent = True):
    if sent:
        corpus = tokenize.sent_tokenize(corpus)
        corpus = [remove_punctuation(sent).split() for sent in corpus]
    else:
        corpus = remove_punctuation(corpus).split()
    return corpus

In [62]:
# Pre-processing with part of speech tags
def part_of_speech(corpus):
    corpus = tokenize.sent_tokenize(corpus)
    chars_to_remove = "[\n]"
    tr = str.maketrans(" ", " ", chars_to_remove)
    chars_to_remove2 = "[\n]!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
    new_corp = []
    test = []

    for sent in corpus:
        sent = sent.translate(tr)
        words_sent = tokenize.word_tokenize(sent)
        sent_pos = nltk.pos_tag(words_sent)
        new_sent = []
        for (word, pos) in sent_pos:
            tr2 = str.maketrans("", "", chars_to_remove2)
            word = word.translate(tr2)
            if word:
                new_sent.append((word.lower(), pos))
        new_corp.append(new_sent)
    return new_corp

In [63]:
# Total preprocessing function for a corpus. Input can be one string (corpus),
# for which you put multi = False, or a list of several strings (corpora) that 
# you want to turn into one big corpus, for which you put multi = True.
# For PoS tags, put pos = True.
def make_file(corp, multi = True, sent = True, pos = False):
    if multi:
        corpus = ''
        for subcorp in corp:
            corpus += subcorp
    else:
        corpus = corp
        
    if pos:
        corpus = part_of_speech(corpus)
    
    else:
        corpus = preprocess(corpus, sent = sent)
    
    return corpus

# Subsampling

In [168]:
# Returns 2 lists of corpora, one from which the ranks will be calculated
# and one from which the frequencies will be calculated. Each corpus consists of
# a list of tokenized sentences.
# Input: corpus that is to be subsampled. Should be a list of tokenized sentences.
# k is the amount of tokens that each sampled corpus should contain,
# m is the amount of subcorpera you want for both the ranks and frequencies.
# Max: I would read Valentin's thesis for an explanation on subsampling
def subsampling(corpus, k = 1000000, m = 10, sent = True):
    n = len(corpus)
    
    sen_len = {}

    
    rank_corpera = []
    freq_corpera = []

    if sent:
        for i in range(m):
            used_rank = set()
            used_freq = set()
            rank_count = 0
            freq_count = 0
            rank_samples = []
            freq_samples = []

            while rank_count < k:
                index = np.random.randint(n)
                if index in used_rank:
                    continue

                rank_sample = corpus[index]
                len_sample = len(rank_sample)

                if len_sample == 0:
                    continue

                if rank_count > k:
                    max_len = len_sample - (rank_count - k)
                    rank_sample = rank_sample[:max_len]
                    
                rank_samples += rank_sample
                rank_count += len_sample


                used_rank.add(index)

            while freq_count < k:
                index = np.random.randint(n)
                if index in used_freq:
                    continue
                freq_sample = corpus[index]
                len_sample = len(freq_sample)

                if len_sample == 0:
                    continue
                    
                if freq_count > k:
                    max_len = len_sample - (freq_count - k)
                    freq_sample = freq_sample[:max_len]

                freq_samples += freq_sample
                freq_count += len_sample

                if len_sample not in sen_len and len_sample < 200:
                    sen_len[len_sample] = 1
                elif len_sample < 200:
                    sen_len[len_sample] += 1

                used_freq.add(index)

            rank_corpera.append(rank_samples)
            freq_corpera.append(freq_samples)
#                 rank_corpera.append([item for sublist in rank_samples for item in sublist])
#                 freq_corpera.append([item for sublist in freq_samples for item in sublist])


    else:
        for i in range(m):
            rank_samples = random.sample(corpus, k)
            freq_samples = random.sample(corpus, k)
            rank_corpera.append(rank_samples)
            freq_corpera.append(freq_samples)
    
#     return rank_corpera, freq_corpera, sen_len
    return rank_corpera, freq_corpera

# Rank-Frequency calculations

In [65]:
# Returns a dataframe of word frequencies for list of corpora,
# with each column corresponding to a different corpus.
# Input: list of corpora. Each corpus consists of a list of tokenized sentences.
def calculate_freqs(freq_sents, norm=True, text=None):
    freq_dict = {}
    norm_dict = {}
    for i, corpus in enumerate(freq_sents):
        freq_dict['{} c_frequency {}'.format(text,i)] = collections.Counter(corpus)
        if norm:
            len_corp = len(corpus)
            norm_dict['{} c_frequency {}'.format(text, i)] = {k: v / len_corp for k, v in freq_dict['{} c_frequency {}'.format(text,i)].items()}
    
    if norm:
        freqs_df = pd.DataFrame(norm_dict)
    else:
        freqs_df = pd.DataFrame(freq_dict)
    freqs_df = freqs_df.fillna(0)
    
    
    return freqs_df

In [66]:
# Returns a dataframe with the mean frequency of each word across different corpora.
# Input: frequency dataframe
def mean_freqs(freqs_df):
    return(freqs_df.mean(axis=1))

In [67]:
# Returns a dataframe of word ranks for list of corpora,
# with each column corresponding to a different corpus.
# Input: list of corpora. Each corpus consists of a list of tokenized sentences.
def calculate_ranks(rank_sents, norm=False, text=None):
    ranks_dicts = {}
    for i, corpus in enumerate(rank_sents):
        freqs = collections.Counter(corpus)
        if norm:
            len_corp = len(corpus)
            for key in freqs:
                freqs[key] /= len_corp
        ranks_dicts['{} c_rank {}'.format(text, i)] = {w: r for r, (w, c) in enumerate(freqs.most_common(), 1)}
    
    ranks_df = pd.DataFrame(ranks_dicts)
    for column in ranks_df:
        min_rank = int(np.ceil(ranks_df[column].max() + 1))
        nan_rows = ranks_df[ranks_df[column].isnull()]
        num_nans = len(nan_rows)
        nan_ranks = list(range(min_rank, min_rank+num_nans))
        random.shuffle(nan_ranks)
        ranks_df.loc[ranks_df[column].isnull(), column] = nan_ranks

    return ranks_df

In [68]:
# Returns a dataframe with the mean rank of each word across different corpora.
# Input: rank dataframe
def mean_ranks(ranks_df):
    return ranks_df.mean(axis=1)

In [69]:
# Creates combined dataframe of ranks and frequencies
# Input: 2 lists (freq_sents and rank_sents) of corpora. Each corpus
# consists of a list of tokenized sentences. These lists are to be obtained form
# subsampling.
def ranks_freqs(freq_sents, rank_sents, text=None, norm=False):
    freqs_df = calculate_freqs(freq_sents, text=text, norm=norm)
    freqs_df['Frequency'] = mean_freqs(freqs_df)
    ranks_df = calculate_ranks(rank_sents, text=text, norm=norm)
    ranks_df['Rank'] = mean_ranks(ranks_df)
    
    # Put mean ranks and freqs together and remove all words that
    # do not have both a rank and frequency (which happens when a word)
    # is only present in freq_sents and not in rank_sents or vice versa
    ranks_freqs_df = pd.concat([ranks_df, freqs_df], axis = 1)
    ranks_freqs_df = ranks_freqs_df.dropna()
#     ranks_freqs_df = ranks_freqs_df.loc[ranks_freqs_df['Frequency'] >=1]
    return ranks_freqs_df

# Zipf's law

In [70]:
# MLE of Zipf's law parameters (alpha and beta)
class Mandelbrot(GenericLikelihoodModel):

    def __init__(self, frequencies, ranks, **kwargs):
        if not len(frequencies) == len(ranks):
            raise ValueError("NOT THE SAME NUMBER OF RANKS AND FREQS!")
        
        frequencies = np.asarray(frequencies)
        ranks = np.asarray(ranks)
        
        self.n_obs = np.sum(frequencies)
        
        super().__init__(endog=frequencies, exog=ranks, **kwargs)
        self.fit_result = None
    

    def prob(self, params, ranks=None, log=False):
        if ranks is None:
            ranks = self.exog
        
        alpha, beta = params
        if log:
            return -alpha*lg(beta+ranks) - lg(zeta(alpha, q=beta+1.))
        else:
            return ((beta + ranks)**(-alpha))/zeta(alpha, q=beta+1.)
    
    
    def loglike(self, params):
        rs = self.exog
        fs = self.endog
        alpha, beta = params
        
#        if alpha > 10 or beta > 20:
#            return -np.inf
        
#         if alpha < 1.0 or beta < 0.0:
#             return -np.inf
        
        # no need to calculate P(r) when observed f(r) was zero
        log_probs = -alpha*lg(beta+rs) - lg(zeta(alpha, q=beta+1.))
        log_probs = log_probs.reshape(-1, )
        return np.sum(fs * log_probs) - beta**5
    
    
    def register_fit(self, fit_result, overwrite=False):
        if not self.fit_result is None and not overwrite:
            raise ValueError("A fit result is already registered and overwrite=False!")
            
        self.fit_result = fit_result
        self.optim_params = fit_result.params
        self.pseudo_r_squared = self.pseudo_r_squared(self.optim_params)
        self.SE, self.SE_relative = fit_result.bse, fit_result.bse/self.optim_params
        self.BIC, self.BIC_relative = fit_result.bic,\
                            (-2*self.null_loglike())/fit_result.bic
        
        return self.optim_params
    
    def print_result(self, string=False):
        if self.fit_result is None:
            raise ValueError("Register a fitting result first!")

        def format_x(x):
            return float('{0:.3g}'.format(x))


        s = "="*50
        s += "\n" + "MANDELBROT"
        s += "\n" + "  Optimal Parameters " + str(tuple(map(format_x, self.optim_params)))
        
        s += "\n" + "  Standard Error [relative]: " + str(tuple(map(format_x, self.SE))) +\
              ", [" + str(tuple(map(format_x, self.SE_relative))) + "]"
        
        s += "\n" + "  Pseudo R^2: " + str(format_x(self.pseudo_r_squared))
        
        s += "\n" + "  BIC [relative]: " + str(format_x(self.BIC)) +\
              ", [" + str(format_x(self.BIC_relative)) + "]"
        s += "\n" + "="*50
        
        if string:
            return s
        
        print(s)
    
    
    def null_loglike(self, epsilon=1e-10):
        return self.loglike((1.+epsilon, 0.0))
    
    def pseudo_r_squared(self, params):
        return 1-self.loglike(params)/self.null_loglike()
    
    
    def predict(self, params, ranks=None, freqs=True, n_obs=None, 
                correct_for_finite_domain=True):
        if ranks is None:
            ranks = self.exog
        ranks = np.asarray(ranks)
        
        if n_obs is None:
            n_obs = self.n_obs
            
        alpha, beta = params
        pred_probs = self.prob(params, ranks=ranks, log=False)
        
        if correct_for_finite_domain:
            if not freqs:
                raise NotImplementedError("Correction for "\
                                          "finite domain not implemented with probabilities!")
            return pred_probs*(n_obs/np.sum(pred_probs))
        
        if freqs:
            return n_obs*pred_probs
        
        return pred_probs

In [71]:
# Returns a dataframe containing the mean frequencies and ranks, as well as 
# the estimated frequencies from Zipf's law and the error between the (log) mean
# frequencies and (log) estimated frequencies.
def zipfs_law(df, print_stats = True):
    mandelbrot = Mandelbrot(df['Frequency'], df['Rank'])
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]), # [1.0, 1.0]
                                method="powell", full_output=True, disp=0)
    mandelbrot.register_fit(mandelbrot_fit)
    if print_stats:
        mandelbrot.print_result()
    
    model_params = mandelbrot.optim_params
    alpha, beta =  mandelbrot.optim_params
    preds = mandelbrot.predict(model_params, df['Rank'])

    df['Estimated frequency'] = preds
    return df

In [None]:
def plot_zipf(ranks_freqs_df):
    ranks_freqs_df = ranks_freqs_df.sort_values(by=['Rank'])
    zipf_df = zipfs_law(ranks_freqs_df)
#     ranks_freqs_df = ranks_freqs_df.loc[ranks_freqs_df['Frequency'] >=1]
#     hexbin_plot(ranks_freqs_df['Rank'], ranks_freqs_df['Frequency'], est = ranks_freqs_df['Estimated frequency'])
#     plt.show()
#     hexbin_error(zipf_df['Rank (log)'], zipf_df['Error'])
#     plt.show()
    
    return zipf_df

# Mann-Whitney test

In [73]:
# Divides a big corpus into "n" subcorpera and calculates the frequencies for each
# subcorpus. Returns a dataframe containing the frequencies by word and by rank.
def sample_corpora(corpus, text, n=10, norm=True, subclasses=False):
    num_corp = len(corpus)
    corpus = [item for sublist in corpus for item in sublist]
    rank_corp, freq_corp = subsampling(corpus, k=num_corp*100, m=n)

    by_rank = pd.DataFrame()
    by_word = pd.DataFrame()

    ranks_freqs_df = ranks_freqs(rank_corp, freq_corp, text=text, norm=norm)
#     print(ranks_freqs_df)
    ranks_freqs_df = zipfs_law(ranks_freqs_df, print_stats=False)
    ranks_freqs_df['Error'] = abs(ranks_freqs_df['Frequency'] - ranks_freqs_df['Estimated frequency'])
    ranks_freqs_df['Tot rank'] = ranks_freqs_df['Rank'].rank(method='first')
    
    by_ranks_pos = ranks_freqs_df.copy()
    by_ranks_pos.reset_index(inplace=True)

    if subclasses == False:
        by_ranks_pos['level_1'] = by_ranks_pos['level_1'].replace({'NN': 'Noun', 'NNS':'Noun', 
                                           'NNP':'Noun', 'VB':'Verb', 'VBD':'Verb', 
                                           'VBG':'Verb', 'VBN':'Verb', 'VBP':'Verb', 
                                          'VBZ':'Verb', 'JJ':'Adjective', 
                                           'JJR':'Adjective', 'JJS':'Adjective', 'RB':'Adverb',
                                          'RBR':'Adverb', 'RBS':'Adverb'})
    

    by_ranks_pos['PoS rank'] = by_ranks_pos.groupby('level_1')['Rank'].rank(method='first')
    by_ranks_pos = by_ranks_pos.set_index(['level_1', 'PoS rank'])
    
    if subclasses == False:
        classes = ['Noun', 'Verb', 'Adjective', 'Adverb']
    else:
        classes = ['NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    
    by_ranks_pos = by_ranks_pos.loc[classes]
    by_ranks_pos = by_ranks_pos.filter(regex='c_frequency|Frequency|Error')
    by_ranks_pos = by_ranks_pos.rename(columns={"Frequency": "{} mean freq".format(text),
                                         "Error": "{} error".format(text)})
    

    
    by_rank = ranks_freqs_df.filter(regex='Tot rank|c_frequency|Frequency|Error').set_index(['Tot rank'])
    by_rank = by_rank.rename(columns={"Frequency": "{} mean freq".format(text),
                                         "Error": "{} error".format(text)})
    
    by_word = ranks_freqs_df.filter(regex='c_frequency|Frequency|Error')
    by_word = by_word.rename(columns={"Frequency": "{} mean freq".format(text),
                                         "Error": "{} error".format(text)})

    by_word = by_word.sort_values(by=['{} mean freq'.format(text)], ascending=False)
    by_rank = by_rank.sort_values(by=['Tot rank'])
    by_ranks_pos = by_ranks_pos.sort_index(by=['level_1', 'PoS rank'])
    
    by_word = by_word.fillna(0)
    by_rank = by_rank.dropna()
    by_ranks_pos = by_ranks_pos.dropna()
    
    return by_word, by_rank, by_ranks_pos
#     return None

In [74]:
# Takes 2 corpora and aligns their frequency values by specific words and ranks 
# so that the Mann-Whitney test can be applied to the frequencies of every word
# or rank.
def mann_whitney_df(corpus1, corpus2, n=10, t=0, norm=True, subclasses=False):
    words_corpus1, ranks_corpus1, ranks_pos_corpus1 = sample_corpora(corpus1, text="C1", n=n, norm=norm, subclasses=subclasses)
    words_corpus2, ranks_corpus2, ranks_pos_corpus2 = sample_corpora(corpus2, text="C2", n=n, norm=norm, subclasses=subclasses)
    
    words_df = pd.concat([words_corpus1, words_corpus2], axis=1)
    lol = words_df.loc[:, words_df.columns.str.contains('freq')].fillna(0)
    words_df.loc[:, words_df.columns.str.contains('freq')] = lol
    words_df['error diff'] = abs(words_df['C1 error'] - words_df['C2 error'])
#     words_df = words_df.fillna(0)
    
    
    ranks_df = pd.concat([ranks_corpus1, ranks_corpus2], axis=1)
    ranks_df = ranks_df.dropna()
    ranks_df['error diff'] = abs(ranks_df['C1 error'] - ranks_df['C2 error'])

    ranks_pos_df = pd.concat([ranks_pos_corpus1, ranks_pos_corpus2], axis=1)
    ranks_pos_df = ranks_pos_df.dropna()
    return words_df, ranks_df, ranks_pos_df

In [75]:
# Applies the Mann-Whitney test to a dataframe containing frequencies per word
# or rank.
def mann_whitney_test(df, n=10):
    stats = []
    p = []
    
    df1 = df.loc[:, df.columns.str.contains('frequency')]
    
    for index, row in df1.iterrows():
        mann = mannwhitneyu(row[0:n], row[n:])
        stats.append(mann[0])
        p.append(mann[1])
    df['statistics'] = stats
    df['p-value'] = p
    df["H0"] = df['p-value'] > 0.05
    df = df.filter(regex='mean freq|error|fano|statistics|p-value|H0')
    
    return df

In [76]:
# Takes 2 corpora, and applies the Mann-Whitney test to ranks and specific words.
# Returns dataframes containing the results for both methods.
def mann_whitney_words_ranks(corpus1, corpus2, n=10, repeat = 10, t=0, norm=True, subclasses=False):
    words_df, ranks_df, ranks_pos_df = mann_whitney_df(corpus1, corpus2, n=n, t=t, norm=norm, subclasses=subclasses)
    df1 = mann_whitney_test(words_df, n=n)
    df2 = mann_whitney_test(ranks_df, n=n)
    df3 = mann_whitney_test(ranks_pos_df, n=n)
    return df1, df2, df3

In [77]:
# Takes 2 corpora, and applies the Mann-Whitney test to specific words and ranks.
# Calculates for both methods the percentage of words/ranks that reject H0.
def stats_mw(corpus1, corpus2, n=10, t=0, norm=True, print_stats=True, subclasses=False):
    words_df, ranks_df, ranks_pos_df = mann_whitney_words_ranks(corpus1, corpus2, n=n, t=t, norm=norm, subclasses=subclasses)

    tot_words = len(words_df)
    no_h0_words = len(words_df.loc[words_df['p-value']<=0.05])
    perc_words = no_h0_words/tot_words*100
    
    tot_ranks = len(ranks_df)
    no_h0_ranks = len(ranks_df.loc[ranks_df['p-value']<=0.05])
    perc_ranks = no_h0_ranks/tot_ranks*100
    
    if print_stats:
        print("WORDS:\n")
        print("Total words: ", tot_words)
        print("No H0: ", no_h0_words)
        print("Percentage: ", perc_words)
    
        print("\n\nRANKS:\n")
        print("Total ranks: ", tot_ranks)
        print("No H0: ", no_h0_ranks)
        print("Percentage: ", perc_ranks)
    
    stats = [perc_words, perc_ranks]
    
    return words_df, ranks_df, stats, ranks_pos_df

In [78]:
# Input: a dataframe with calculated Mann-Whitney values (of 2 corpora) for words 
# or ranks.
# Returns: dataframe with percentages of words or ranks that reject H0, grouped by
# ranks.
def dif_ranks(df, error=True):
    df_by_ranks = pd.Series()
    error_by_ranks = pd.Series()
    
    ranks = df[0:10]
    
    try:
        top_ten = len(ranks.loc[ranks['p-value']<=0.05]) * 10
    except ZeroDivisionError:
        top_ten = None
    df_by_ranks['1-10'] = top_ten
    
    if error:
        error_ten = ranks['error diff'].mean()
        error_by_ranks['1-10'] = error_ten
    cur_ranks = 10
    max_rank = len(df)
    
    while 2*cur_ranks <= max_rank:
        ranks = df[cur_ranks:2*cur_ranks]
        try:
            perc = len(ranks.loc[ranks['p-value']<=0.05])/len(ranks) * 100
        except ZeroDivisionError:
            perc = None
        df_by_ranks['{}-{}'.format(cur_ranks+1,2*cur_ranks)] = perc
        if error:
            error1 = ranks['error diff'].mean()
            error_by_ranks['{}-{}'.format(cur_ranks+1,2*cur_ranks)] = error1
        cur_ranks *= 2
        
    ranks = df[cur_ranks:]
    try:
        perc = len(ranks.loc[ranks['p-value']<=0.05])/len(ranks) * 100
    except ZeroDivisionError:
        perc = None
    df_by_ranks['{}-end'.format(cur_ranks+1,max_rank)] = perc
    
    if error:
        error1 = ranks['error diff'].mean()
        error_by_ranks['{}-end'.format(cur_ranks+1,2*cur_ranks)] = error1
        return df_by_ranks, error_by_ranks
    
    return df_by_ranks.sort_index()

In [79]:
# Takes 2 corpora, and applies the Mann-Whitney procedure to "times" subparts
# of both corpora. 
# Returns dataframes containing distributions of the total percentages as well 
# as per-rank percentages of rejected H0 ranks and words.
def stats_dist(corpus1, corpus2, times=10, n=10, t=0, norm=True, subclasses=False):
    if subclasses == False:
        classes = ['Noun', 'Verb', 'Adjective', 'Adverb']
    else:
        classes = ['NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    
    len_corp = int(len(corpus1)/times)
    ranks_stats_df = {}
    words_stats_df = {}
    
    pos_stats_df = pd.DataFrame()
    
    error_ranks_stats_df = {}
    error_words_stats_df = {}
    
    
    for i in range(times):
        i_words = {}
        i_ranks = {}
        
        i_error_words = {}
        i_error_ranks = {}
        
        corpus1_samp = corpus1[i*len_corp:(i+1)*len_corp]
        corpus2_samp = corpus2[i*len_corp:(i+1)*len_corp]
        words_df, ranks_df, stats, ranks_pos_df = stats_mw(corpus1_samp, corpus2_samp, n=n, t=t, norm=norm, print_stats=False, subclasses=subclasses)
        words_df['comb mean'] = words_df[['C1 mean freq', 'C2 mean freq']].mean(axis=1)
        words_df = words_df.sort_values(by=['comb mean'], ascending=False)
        ranks_df = ranks_df.sort_values(by=['Tot rank'])
        
        
        dif_ranks_words, dif_ranks_error_words = dif_ranks(words_df)
        dif_ranks_ranks, dif_ranks_error_ranks = dif_ranks(ranks_df)
        dif_ranks_pos = dif_ranks(ranks_pos_df, error=False)
        
        
        i_words['total']= stats[0]
        for index, value in dif_ranks_words.items():
            i_words[index] = value
        
        i_ranks['total'] = stats[1]
        for index, value in dif_ranks_ranks.items():
            i_ranks[index] = value
         
        
    
        i_pos = pd.DataFrame()
        pos = {}
        for pos_class in classes:
            pos[i] = {}
            df = ranks_pos_df.loc[[pos_class]]
            try:
                pos[i][(pos_class, 'total')] = len(df.loc[df['p-value']<=0.05])/len(df) * 100
            except ZeroDivisionError:
                pos[i][(pos_class,'total')] = None
            dif_ranks_pos = dif_ranks(ranks_pos_df.loc[[pos_class]], error=False).sort_index()
            for index, value in dif_ranks_pos.items():
                pos[i][(pos_class,index)] = value
            i_pos = pd.concat([i_pos, pd.DataFrame(pos)], axis=0)
            
        

        pos_stats_df = pd.concat([i_pos, pos_stats_df], axis=1).sort_index()
        words_stats_df[i] = i_words
        ranks_stats_df[i] = i_ranks
        
#         print(ranks_df.loc[ranks_df['H0'] == False]['error diff'])
        
        i_error_words['total'] = words_df['error diff'].mean()
        for index, value in dif_ranks_error_words.items():
            i_error_words[index] = value
        
        i_error_ranks['total'] = ranks_df['error diff'].mean()
        for index, value in dif_ranks_error_ranks.items():
            i_error_ranks[index] = value
            
        error_words_stats_df[i] = i_error_words
        error_ranks_stats_df[i] = i_error_ranks
        
   
    return pd.DataFrame(words_stats_df), pd.DataFrame(ranks_stats_df), pos_stats_df, pd.DataFrame(error_words_stats_df), pd.DataFrame(error_ranks_stats_df)

In [80]:
def readable_stats(corpus1, corpus2, times=10, n=10, t=0, norm=True, subclasses=False):
    words_stats_df, ranks_stats_df, pos_stats_df, words_error_stats_df, ranks_error_stats_df = stats_dist(corpus1, corpus2, times=times, n=n, t=t, norm=norm, subclasses=subclasses)
    ranks = pd.DataFrame()
    words = pd.DataFrame()
    pos = pd.DataFrame()
    
    words['mean perc'] = words_stats_df.mean(axis=1)
    words['std perc'] = words_stats_df.std(axis=1)
#     words['mean error'] = words_error_stats_df.mean(axis=1)
#     words['std error'] = words_error_stats_df.std(axis=1)

    
    ranks['mean perc'] = ranks_stats_df.mean(axis=1)
    ranks['std perc'] = ranks_stats_df.std(axis=1)
#     ranks['mean error'] = ranks_error_stats_df.mean(axis=1)
#     ranks['std error'] = ranks_error_stats_df.std(axis=1)
    
    pos['mean perc'] = pos_stats_df.mean(axis = 1)
    pos['std perc'] = pos_stats_df.std(axis = 1)
    
#     return words, ranks, pos, words_stats_df, ranks_stats_df, pos_stats_df, words_error_stats_df, ranks_error_stats_df


    return words, ranks, pos

In [81]:
def parameter_tests(size, start=0, times=10, n=10):
    words_gpt_web, ranks_gpt_web, pos_gpt_web = readable_stats(gpt_train_20000[start:start+size*times], web_train_20000[start:start+size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    words_web_web, ranks_web_web, pos_web_web = readable_stats(web_train_20000[start:start+size*times], web_train_20000[start+size*times:start+2*size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    words_gpt_gpt, ranks_gpt_gpt, pos_gpt_gpt = readable_stats(gpt_train_20000[start:start+size*times], gpt_train_20000[start+size*times:start+2*size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    
    words = pd.concat([words_gpt_web, words_web_web, words_gpt_gpt], axis=1)
    ranks = pd.concat([ranks_gpt_web, ranks_web_web, ranks_gpt_gpt], axis=1)
    pos =  pd.concat([pos_gpt_web, pos_web_web, pos_gpt_gpt], axis=1)
    return words, ranks, pos
    

# Automated text classification

In [86]:
# Takes 2 corpora, and applies the Mann-Whitney procedure to "times" subparts
# of both corpora. 
# Returns dataframes containing distributions of the total percentages as well 
# as per-rank percentages of rejected H0 ranks and words.
def stats_dist2(corpus1, corpus2, times=10, n=10, t=0, norm=True, subclasses=False):
    if subclasses == False:
        classes = ['Noun', 'Verb', 'Adjective', 'Adverb']
    else:
        classes = ['NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    
    len_corp = int(len(corpus2)/times)
    ranks_stats_df = {}
    words_stats_df = {}
    
    pos_stats_df = pd.DataFrame()
    
    error_ranks_stats_df = {}
    error_words_stats_df = {}
    
    
    for i in range(times):
        i_words = {}
        i_ranks = {}
        
        i_error_words = {}
        i_error_ranks = {}
        
        corpus1_samp = corpus1
        corpus2_samp = corpus2[i*len_corp:(i+1)*len_corp]
        words_df, ranks_df, stats, ranks_pos_df = stats_mw(corpus1_samp, corpus2_samp, n=n, t=t, norm=norm, print_stats=False, subclasses=subclasses)
        words_df['comb mean'] = words_df[['C1 mean freq', 'C2 mean freq']].mean(axis=1)
        words_df = words_df.sort_values(by=['comb mean'], ascending=False)
        ranks_df = ranks_df.sort_values(by=['Tot rank'])
        
        
        dif_ranks_words, dif_ranks_error_words = dif_ranks(words_df)
        dif_ranks_ranks, dif_ranks_error_ranks = dif_ranks(ranks_df)
        dif_ranks_pos = dif_ranks(ranks_pos_df, error=False)
        
        
        i_words['total']= stats[0]
        for index, value in dif_ranks_words.items():
            i_words[index] = value
        
        i_ranks['total'] = stats[1]
        for index, value in dif_ranks_ranks.items():
            i_ranks[index] = value
         
        
    
        i_pos = pd.DataFrame()
        pos = {}
        for pos_class in classes:
            pos[i] = {}
            df = ranks_pos_df.loc[[pos_class]]
            try:
                pos[i][(pos_class, 'total')] = len(df.loc[df['p-value']<=0.05])/len(df) * 100
            except ZeroDivisionError:
                pos[i][(pos_class,'total')] = None
            dif_ranks_pos = dif_ranks(ranks_pos_df.loc[[pos_class]], error=False).sort_index()
            for index, value in dif_ranks_pos.items():
                pos[i][(pos_class,index)] = value
            i_pos = pd.concat([i_pos, pd.DataFrame(pos)], axis=0)
            
        

        pos_stats_df = pd.concat([i_pos, pos_stats_df], axis=1).sort_index()
        words_stats_df[i] = i_words
        ranks_stats_df[i] = i_ranks
        
#         print(ranks_df.loc[ranks_df['H0'] == False]['error diff'])
        
        i_error_words['total'] = words_df['error diff'].mean()
        for index, value in dif_ranks_error_words.items():
            i_error_words[index] = value
        
        i_error_ranks['total'] = ranks_df['error diff'].mean()
        for index, value in dif_ranks_error_ranks.items():
            i_error_ranks[index] = value
            
        error_words_stats_df[i] = i_error_words
        error_ranks_stats_df[i] = i_error_ranks
        
   
    return pd.DataFrame(words_stats_df), pd.DataFrame(ranks_stats_df), pos_stats_df, pd.DataFrame(error_words_stats_df), pd.DataFrame(error_ranks_stats_df)

In [87]:
def readable_stats2(corpus1, corpus2, times=10, n=10, t=0, norm=True, subclasses=False):
    words_stats_df, ranks_stats_df, pos_stats_df, words_error_stats_df, ranks_error_stats_df = stats_dist2(corpus1, corpus2, times=times, n=n, t=t, norm=norm, subclasses=subclasses)
    ranks = pd.DataFrame()
    words = pd.DataFrame()
    pos = pd.DataFrame()
    
    words['mean perc'] = words_stats_df.mean(axis=1)
    ranks['mean perc'] = ranks_stats_df.mean(axis=1)
    
    pos['mean perc'] = pos_stats_df.mean(axis = 1)
    
    return words, ranks, pos

In [1]:
# Funtion that automatically classifes corpora as being human or machine, using
# the Mann-Whitney test based on the top 60 ranks of individual words and ranks.
#
# Parameters:
# - gpt: True when you're classifying gpt-generated corpus, false when
# you're classifying a human-written corpus.
# 
# - size: amount of texts you want to classify at a time (I normally use 10 
# so that you have about 10 x 1.000 = 10.000 tokens)
# 
# - own_set: When none, it will use the webtext and gpt set, or you can enter
# Riemer's sets which it will then use.
# 
# - rep: amount of repetitions, so how many corpora of size "size" you want to 
# classify
# 
# - times = how many times you want the mann-whitney test to be repeated on a corpus
# 
# - n: mann-whitney parameter, namely the amount of subcorpora you want to divide the
# input into

# returns: the amount of times a text got classified as human and as machine-generated
# for both the use of individual words and ranks, alongside a list of the mean 
# percentages for these outcomes.

    def test_rep(gpt, size, own_set=None, rep = 50, times = 5, n=10):
    j = size
    
    num_corp = size * times
    
    w_human = 0
    w_machine = 0
    r_human = 0
    r_machine = 0
    
    means_w_human = []
    means_w_machine =[]
    means_r_human = []
    means_r_machine = []

    indexen = [0, 14, 24, 40, 53, 65]
    
    for i in range(rep):
        random.shuffle(gpt_train_20000)
        random.shuffle(web_train_20000)
        
        if gpt == True:
            if own_set is not None:
                corpus = own_set[indexen[i]:indexen[i+1]]
            else:
                corpus = gpt_train_20000[:size]
        else:
            if own_set is not None:
                corpus = own_set[indexen[i]:indexen[i+1]]
            else:
                corpus = web_train_20000[:size]
        
        web = web_train_20000[size:size+num_corp]
        gpt = gpt_train_20000[size:size+num_corp]
        words_web, ranks_web, pos_web = readable_stats2(corpus, web, times=times, n=n)
        words_gpt, ranks_gpt, pos_gpt = readable_stats2(corpus, gpt, times=times, n=n)
        
        print()
        
        if words_web['mean perc'][1:5].mean() < words_gpt['mean perc'][1:5].mean():
            w_human += 1
        
        else:
            w_machine += 1
        means_w_machine.append(words_gpt['mean perc'][1:5].mean())
        
        if ranks_web['mean perc'][1:5].mean() < ranks_gpt['mean perc'][1:5].mean():
            r_human += 1
        else:
            r_machine += 1
        
        means_w_human.append(words_web['mean perc'][1:5].mean())
        means_w_machine.append(words_gpt['mean perc'][1:5].mean())
        means_r_human.append(ranks_web['mean perc'][1:5].mean())
        means_r_machine.append(ranks_gpt['mean perc'][1:5].mean())
        
    means_list = [np.mean(means_w_human), np.mean(means_r_human), np.mean(means_w_machine), np.mean(means_r_machine)]
    return w_human, r_human, w_machine, r_machine, means_list

# Voor Riemer:
test_rep is de functie die automatisch teksten classificeert; hierbij krijg je niet de mega tabellen te zien waarin de Mann-Whitney test weergeven staat, maar
slechts de resultaten. Stel je wil maar 1 van je generated corpussen checken, van bijv 10 teksten, vul je dus bijv in ```test_rep(True, 10, own_set=jouw_set[0:10], rep = 1, times = 10, n=10)```

In [162]:
test_rep(False, 10, own_set=True, rep = 5, times = 10, n=10)








(1, 0, 4, 5, [59.825, 55.725, 50.01042656530461, 35.81152993348115])

In [109]:
test = [[item for sublist in corpus for item in sublist] for corpus in human_set]

In [110]:
lens = [len(corpus) for corpus in test]

In [125]:
np.sum(lens[10:20])

5721

In [112]:
test2 = [[item for sublist in corpus for item in sublist] for corpus in gpt_set]

In [113]:
lens2 = [len(corpus) for corpus in test2]

In [150]:
np.sum(lens[54:65])

10033

# Color stuff

In [176]:
test = pd.read_csv (r'data/webtext.test.csv')
test = test.loc[test['length']>= 1000]
test = test.dropna()
test = test['text'].tolist()

In [210]:
test = pd.read_csv (r'data/small-117M-k40.test.csv')
test = test.loc[test['length']>= 1000]
test = test.dropna()
test = test['text'].tolist()

In [243]:
pd.read_csv (r'data/small-117M-k40.test.csv')

Unnamed: 0,id,text,length,ended
0,255001,"Dawn on its way to victory, but where does it ...",1024,False
1,255002,\nDrew Angerer/Getty Images\n\nThere are many ...,1024,False
2,255003,"In my last post, I gave a post on the use of t...",1024,False
3,255004,Brought to you by:\n\nKFC\n\nHogan's Day\n\nSo...,1024,False
4,255005,"\nIt's time for the first time since 2008, and...",480,True
...,...,...,...,...
4995,259996,I've read articles talking about how to manage...,111,True
4996,259997,"The first part of my work-in-progress, The Unf...",1024,False
4997,259998,"A lot of these things are already pretty cool,...",395,True
4998,259999,S. 3148 – 114th Congress (2017-2018) To amend ...,1024,False


In [177]:
test_web = [make_file(text, multi=False, pos=True) for text in test]

In [211]:
test_gpt = [make_file(text, multi=False, pos=True) for text in test]

In [212]:
pickle.dump(test_gpt, open( "test_gpt.p", "wb" ))

In [179]:
df_real_1 = pickle.load(open("df_real_1.pkl", "rb" ))

In [None]:
big_df = pickle.load(open("web_train_20000.p", "rb" ))

In [170]:
def highlight(colour, text):
    if colour == "red":
        return "\033[1;41m" + str(text) + "\033[1;m"
    if colour == "green":
        return "\033[1;42m" + str(text) + "\033[1;m"
    if colour == "yellow":
        return "\033[1;43m" + str(text) + "\033[1;m"
    if colour == "cyan":
        return "\033[1;46m" + str(text) + "\033[1;m"
    return str(text)

In [206]:
[0,0,0].add([1,1,1])

AttributeError: 'list' object has no attribute 'add'

In [246]:
def color_text(corpus):
    text = ""
    count = [0,0,0,0,0,0]
    for word, pos in corpus:
        try:
            rank = df_reals.loc[word]['Tot rank'][0]
            if rank <= 10:
                text += highlight("green", word)
                count[0] += 1
            elif rank <= 100:
                text += highlight("yellow", word)
                count[1] += 1
            elif rank <= 1000:
                text += highlight("red", word)
                count[2] += 1
            elif rank <= 10000:
                text += highlight("cyan", word)
                count[3] += 1
            else:
                text += word
                count[4] += 1

        except KeyError:
            text += highlight("magenta", word)
            count[5] += 1
        text += " "
    count = [0 for num in count if not num]
    count = [num/len(corpus) for num in count]
    return text, count

In [238]:
lol = [[item for sublist in corpus for item in sublist] for corpus in test_gpt]

In [240]:
loll = [len(corpus) for corpus in lol]

In [250]:
tot = [0,0,0,0,0,0]
for corpus in test_web:
    corpus = [item for sublist in corpus for item in sublist]
    if len(corpus) == 0:
        continue
    count = color_text(corpus)[1]
    tot = [x + y for x, y in zip(tot, count)]

In [251]:
tot_gpt = [0,0,0,0,0,0]
for corpus in test_gpt:
    corpus = [item for sublist in corpus for item in sublist]
    if len(corpus) == 0:
        continue
    count = color_text(corpus)[1]
    tot_gpt = [x + y for x, y in zip(tot, count)]

In [252]:
tot

[]

In [253]:
tot_gpt

[]

In [216]:
[num/len(test_web) for num in tot]

[160.08554996405465,
 165.39611790079078,
 157.84974838245867,
 175.60388209920922,
 76.72969086987779,
 29.74550682961898]

In [217]:
[num/len(test_gpt) for num in tot_gpt]

[133.48532055122828,
 137.86339125224686,
 131.64230077890952,
 146.46434991012583,
 63.98382264829239,
 24.829239065308567]

In [219]:
gpt_1 = color_text(test_gpt[0])

In [220]:
print(gpt_1[0])

dawn [1;42mon[1;m [1;43mits[1;m [1;41mway[1;m [1;42mto[1;m victory [1;43mbut[1;m [1;43mwhere[1;m [1;41mdoes[1;m [1;42mit[1;m [1;41mend[1;m [1;42mit[1;m s [1;43mbeen[1;m [1;43mabout[1;m [1;42mthe[1;m [1;43mlast[1;m [1;41mfive[1;m [1;41mdays[1;m [1;42mon[1;m [1;42mthe[1;m [1;43mlast[1;m [1;41mday[1;m [1;42mof[1;m [1;41moctober[1;m [1;43mi[1;m [1;41mthought[1;m [1;43mthere[1;m [1;43mwere[1;m [1;41mgoing[1;m [1;42mto[1;m [1;43mbe[1;m [1;42ma[1;m [1;41mlot[1;m [1;42mof[1;m announcements [1;43mfrom[1;m [1;42mthe[1;m [1;41mgame[1;m makers [1;43mover[1;m [1;43mat[1;m eurogamer [1;42mand[1;m [1;42mthe[1;m [1;41mfact[1;m [1;43mthat[1;m [1;43mthere[1;m [1;43mwill[1;m [1;43monly[1;m [1;43mbe[1;m [1;43mone[1;m announcement [1;43mat[1;m [1;42mthe[1;m close [1;43mwas[1;m [1;41mgreat[1;m [1;43mbut[1;m [1;43mwas[1;m [1;43malso[1;m [1;42mthe[1;m beginning [1;42mof[1;m [1;42ma[1;m [1;41mreally

In [221]:
web_1 = color_text(test_web[0])

In [223]:
print(web_1[0])

[1;42mthe[1;m wolf [1;41mcurrently[1;m [1;43mhas[1;m [1;42mthe[1;m [1;41mformer[1;m seminole [1;43mas[1;m [1;43mhis[1;m rb33 [1;43mbut[1;m expect dalvin cook [1;42mto[1;m [1;43mget[1;m [1;42ma[1;m gigantic bump [1;42min[1;m [1;42mthe[1;m rankings [1;43mafter[1;m [1;41mtaking[1;m advantage [1;42mof[1;m latavius murray s absence [1;41mduring[1;m [1;42mthe[1;m [1;43mfirst[1;m [1;41mweek[1;m [1;42mof[1;m [1;41mtraining[1;m camp cook [1;42mis[1;m catching [1;42mthe[1;m eye [1;42mof[1;m [1;43mhis[1;m teammates [1;42mand[1;m coaches [1;42mand[1;m [1;41mcurrently[1;m possesses workhorse [1;41mpotential[1;m [1;42min[1;m minnesota [1;42ma[1;m succession [1;42mof[1;m poor decisions [1;41moff[1;m [1;42mthe[1;m [1;41mfield[1;m ball [1;41msecurity[1;m [1;41missues[1;m [1;42mon[1;m [1;42mthe[1;m [1;41mfield[1;m [1;42mand[1;m [1;42ma[1;m mediocre combine dropped [1;42mthe[1;m ubertalented dalvin cook [1;43mall[1;

In [186]:
df_reals = df_real_1[['Rank', 'Frequency']]
df_reals['Tot rank'] = df_reals['Rank'].rank(method='first')

In [187]:
df_reals

Unnamed: 0,Unnamed: 1,Rank,Frequency,Tot rank
the,DT,1.0,57448.9,1.0
to,TO,2.0,27899.9,2.0
of,IN,3.0,26095.2,3.0
and,CC,4.0,25389.7,4.0
a,DT,5.0,23446.6,5.0
...,...,...,...,...
spectaclehis,JJ,211743.1,0.1,179737.0
townsquare,NNP,211927.5,0.1,179738.0
rarecasualplay,NNP,212223.7,0.1,179739.0
unrewarding,VBG,213806.5,0.1,179740.0
