In [3]:
import numpy as np
import pandas as pd
import random
import string
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.optimize import minimize
import math
import json
from nltk import tokenize
import collections
import re
import itertools
import nltk
from scipy.stats import mannwhitneyu

import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel,\
        GenericLikelihoodModelResults

from statsmodels.nonparametric.smoothers_lowess import lowess

from scipy.special import zeta
from scipy.stats import binom

import pickle
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

lg = np.log10

from scipy.stats import chisquare

In [9]:
web_train_20000 = pickle.load(open("datasets/web_train_20000.p", "rb" ))

In [10]:
gpt_train_20000 = pickle.load(open("datasets/gpt_train_20000.p", "rb" ))

In [78]:
gpt_set = pickle.load(open("gpt_set.p", "rb" ))

In [79]:
human_set = pickle.load(open("human_set.p", "rb" ))

# Pre-processing

In [12]:
# Pre-processing without part of speech tags
def remove_punctuation(text):
    text = text.lower()
    chars_to_remove = "[\n]!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
    tr = str.maketrans(" ", " ", chars_to_remove)
    return text.translate(tr)


def preprocess(corpus, sent = True):
    if sent:
        corpus = tokenize.sent_tokenize(corpus)
        corpus = [remove_punctuation(sent).split() for sent in corpus]
    else:
        corpus = remove_punctuation(corpus).split()
    return corpus

In [13]:
# Pre-processing with part of speech tags
def part_of_speech(corpus):
    corpus = tokenize.sent_tokenize(corpus)
    chars_to_remove = "[\n]"
    tr = str.maketrans(" ", " ", chars_to_remove)
    chars_to_remove2 = "[\n]!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
    new_corp = []
    test = []

    for sent in corpus:
        sent = sent.translate(tr)
        words_sent = tokenize.word_tokenize(sent)
        sent_pos = nltk.pos_tag(words_sent)
        new_sent = []
        for (word, pos) in sent_pos:
            tr2 = str.maketrans("", "", chars_to_remove2)
            word = word.translate(tr2)
            if word:
                new_sent.append((word.lower(), pos))
        new_corp.append(new_sent)
    return new_corp

In [14]:
# Total preprocessing function for a corpus. Input can be one string (corpus),
# for which you put multi = False, or a list of several strings (corpora) that 
# you want to turn into one big corpus, for which you put multi = True.
# For PoS tags, put pos = True.
def make_file(corp, multi = True, sent = True, pos = False):
    if multi:
        corpus = ''
        for subcorp in corp:
            corpus += subcorp
    else:
        corpus = corp
        
    if pos:
        corpus = part_of_speech(corpus)
    
    else:
        corpus = preprocess(corpus, sent = sent)
    
    return corpus

# Subsampling

In [15]:
# Returns 2 lists of corpora, one from which the ranks will be calculated
# and one from which the frequencies will be calculated. Each corpus consists of
# a list of tokenized sentences.
# Input: corpus that is to be subsampled. Should be a list of tokenized sentences.
# k is the amount of tokens that each sampled corpus should contain,
# m is the amount of subcorpera you want for both the ranks and frequencies.
# Max: I would read Valentin's thesis for an explanation on subsampling
def subsampling(corpus, k = 1000000, m = 10, sent = True):
    n = len(corpus)
    
    sen_len = {}

    
    rank_corpera = []
    freq_corpera = []

    if sent:
        for i in range(m):
            used_rank = set()
            used_freq = set()
            rank_count = 0
            freq_count = 0
            rank_samples = []
            freq_samples = []

            while rank_count < k:
                index = np.random.randint(n)
                if index in used_rank:
                    continue

                rank_sample = corpus[index]
                len_sample = len(rank_sample)

                if len_sample == 0:
                    continue

                if rank_count > k:
                    max_len = len_sample - (rank_count - k)
                    rank_sample = rank_sample[:max_len]
                    
                rank_samples += rank_sample
                rank_count += len_sample


                used_rank.add(index)

            while freq_count < k:
                index = np.random.randint(n)
                if index in used_freq:
                    continue
                freq_sample = corpus[index]
                len_sample = len(freq_sample)

                if len_sample == 0:
                    continue
                    
                if freq_count > k:
                    max_len = len_sample - (freq_count - k)
                    freq_sample = freq_sample[:max_len]

                freq_samples += freq_sample
                freq_count += len_sample

                if len_sample not in sen_len and len_sample < 200:
                    sen_len[len_sample] = 1
                elif len_sample < 200:
                    sen_len[len_sample] += 1

                used_freq.add(index)

            rank_corpera.append(rank_samples)
            freq_corpera.append(freq_samples)
#                 rank_corpera.append([item for sublist in rank_samples for item in sublist])
#                 freq_corpera.append([item for sublist in freq_samples for item in sublist])


    else:
        for i in range(m):
            rank_samples = random.sample(corpus, k)
            freq_samples = random.sample(corpus, k)
            rank_corpera.append(rank_samples)
            freq_corpera.append(freq_samples)
    
#     return rank_corpera, freq_corpera, sen_len
    return rank_corpera, freq_corpera

# Rank-Frequency calculations

In [16]:
# Returns a dataframe of word frequencies for list of corpora,
# with each column corresponding to a different corpus.
# Input: list of corpora. Each corpus consists of a list of tokenized sentences.
def calculate_freqs(freq_sents, norm=True, text=None):
    freq_dict = {}
    norm_dict = {}
    for i, corpus in enumerate(freq_sents):
        freq_dict['{} c_frequency {}'.format(text,i)] = collections.Counter(corpus)
        if norm:
            len_corp = len(corpus)
            norm_dict['{} c_frequency {}'.format(text, i)] = {k: v / len_corp for k, v in freq_dict['{} c_frequency {}'.format(text,i)].items()}
    
    if norm:
        freqs_df = pd.DataFrame(norm_dict)
    else:
        freqs_df = pd.DataFrame(freq_dict)
    freqs_df = freqs_df.fillna(0)
    
    
    return freqs_df

In [17]:
# Returns a dataframe with the mean frequency of each word across different corpora.
# Input: frequency dataframe
def mean_freqs(freqs_df):
    return(freqs_df.mean(axis=1))

In [18]:
# Returns a dataframe of word ranks for list of corpora,
# with each column corresponding to a different corpus.
# Input: list of corpora. Each corpus consists of a list of tokenized sentences.
def calculate_ranks(rank_sents, norm=False, text=None):
    ranks_dicts = {}
    for i, corpus in enumerate(rank_sents):
        freqs = collections.Counter(corpus)
        if norm:
            len_corp = len(corpus)
            for key in freqs:
                freqs[key] /= len_corp
        ranks_dicts['{} c_rank {}'.format(text, i)] = {w: r for r, (w, c) in enumerate(freqs.most_common(), 1)}
    
    ranks_df = pd.DataFrame(ranks_dicts)
    for column in ranks_df:
        min_rank = int(np.ceil(ranks_df[column].max() + 1))
        nan_rows = ranks_df[ranks_df[column].isnull()]
        num_nans = len(nan_rows)
        nan_ranks = list(range(min_rank, min_rank+num_nans))
        random.shuffle(nan_ranks)
        ranks_df.loc[ranks_df[column].isnull(), column] = nan_ranks

    return ranks_df

In [19]:
# Returns a dataframe with the mean rank of each word across different corpora.
# Input: rank dataframe
def mean_ranks(ranks_df):
    return ranks_df.mean(axis=1)

In [20]:
# Creates combined dataframe of ranks and frequencies
# Input: 2 lists (freq_sents and rank_sents) of corpora. Each corpus
# consists of a list of tokenized sentences. These lists are to be obtained form
# subsampling.
def ranks_freqs(freq_sents, rank_sents, text=None, norm=False):
    freqs_df = calculate_freqs(freq_sents, text=text, norm=norm)
    freqs_df['Frequency'] = mean_freqs(freqs_df)
    ranks_df = calculate_ranks(rank_sents, text=text, norm=norm)
    ranks_df['Rank'] = mean_ranks(ranks_df)
    
    # Put mean ranks and freqs together and remove all words that
    # do not have both a rank and frequency (which happens when a word)
    # is only present in freq_sents and not in rank_sents or vice versa
    ranks_freqs_df = pd.concat([ranks_df, freqs_df], axis = 1)
    ranks_freqs_df = ranks_freqs_df.dropna()
#     ranks_freqs_df = ranks_freqs_df.loc[ranks_freqs_df['Frequency'] >=1]
    return ranks_freqs_df

# Zipf's law

In [21]:
# MLE of Zipf's law parameters (alpha and beta)
class Mandelbrot(GenericLikelihoodModel):

    def __init__(self, frequencies, ranks, **kwargs):
        if not len(frequencies) == len(ranks):
            raise ValueError("NOT THE SAME NUMBER OF RANKS AND FREQS!")
        
        frequencies = np.asarray(frequencies)
        ranks = np.asarray(ranks)
        
        self.n_obs = np.sum(frequencies)
        
        super().__init__(endog=frequencies, exog=ranks, **kwargs)
        self.fit_result = None
    

    def prob(self, params, ranks=None, log=False):
        if ranks is None:
            ranks = self.exog
        
        alpha, beta = params
        if log:
            return -alpha*lg(beta+ranks) - lg(zeta(alpha, q=beta+1.))
        else:
            return ((beta + ranks)**(-alpha))/zeta(alpha, q=beta+1.)
    
    
    def loglike(self, params):
        rs = self.exog
        fs = self.endog
        alpha, beta = params
        
#        if alpha > 10 or beta > 20:
#            return -np.inf
        
#         if alpha < 1.0 or beta < 0.0:
#             return -np.inf
        
        # no need to calculate P(r) when observed f(r) was zero
        log_probs = -alpha*lg(beta+rs) - lg(zeta(alpha, q=beta+1.))
        log_probs = log_probs.reshape(-1, )
        return np.sum(fs * log_probs) - beta**5
    
    
    def register_fit(self, fit_result, overwrite=False):
        if not self.fit_result is None and not overwrite:
            raise ValueError("A fit result is already registered and overwrite=False!")
            
        self.fit_result = fit_result
        self.optim_params = fit_result.params
        self.pseudo_r_squared = self.pseudo_r_squared(self.optim_params)
        self.SE, self.SE_relative = fit_result.bse, fit_result.bse/self.optim_params
        self.BIC, self.BIC_relative = fit_result.bic,\
                            (-2*self.null_loglike())/fit_result.bic
        
        return self.optim_params
    
    def print_result(self, string=False):
        if self.fit_result is None:
            raise ValueError("Register a fitting result first!")

        def format_x(x):
            return float('{0:.3g}'.format(x))


        s = "="*50
        s += "\n" + "MANDELBROT"
        s += "\n" + "  Optimal Parameters " + str(tuple(map(format_x, self.optim_params)))
        
        s += "\n" + "  Standard Error [relative]: " + str(tuple(map(format_x, self.SE))) +\
              ", [" + str(tuple(map(format_x, self.SE_relative))) + "]"
        
        s += "\n" + "  Pseudo R^2: " + str(format_x(self.pseudo_r_squared))
        
        s += "\n" + "  BIC [relative]: " + str(format_x(self.BIC)) +\
              ", [" + str(format_x(self.BIC_relative)) + "]"
        s += "\n" + "="*50
        
        if string:
            return s
        
        print(s)
    
    
    def null_loglike(self, epsilon=1e-10):
        return self.loglike((1.+epsilon, 0.0))
    
    def pseudo_r_squared(self, params):
        return 1-self.loglike(params)/self.null_loglike()
    
    
    def predict(self, params, ranks=None, freqs=True, n_obs=None, 
                correct_for_finite_domain=True):
        if ranks is None:
            ranks = self.exog
        ranks = np.asarray(ranks)
        
        if n_obs is None:
            n_obs = self.n_obs
            
        alpha, beta = params
        pred_probs = self.prob(params, ranks=ranks, log=False)
        
        if correct_for_finite_domain:
            if not freqs:
                raise NotImplementedError("Correction for "\
                                          "finite domain not implemented with probabilities!")
            return pred_probs*(n_obs/np.sum(pred_probs))
        
        if freqs:
            return n_obs*pred_probs
        
        return pred_probs

In [22]:
# Returns a dataframe containing the mean frequencies and ranks, as well as 
# the estimated frequencies from Zipf's law and the error between the (log) mean
# frequencies and (log) estimated frequencies.
def zipfs_law(df, print_stats = True):
    mandelbrot = Mandelbrot(df['Frequency'], df['Rank'])
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]), # [1.0, 1.0]
                                method="powell", full_output=True, disp=0)
    mandelbrot.register_fit(mandelbrot_fit)
    if print_stats:
        mandelbrot.print_result()
    
    model_params = mandelbrot.optim_params
    alpha, beta =  mandelbrot.optim_params
    preds = mandelbrot.predict(model_params, df['Rank'])

    df['Estimated frequency'] = preds
    return df

In [23]:
def plot_zipf(ranks_freqs_df):
    ranks_freqs_df = ranks_freqs_df.sort_values(by=['Rank'])
    zipf_df = zipfs_law(ranks_freqs_df)
#     ranks_freqs_df = ranks_freqs_df.loc[ranks_freqs_df['Frequency'] >=1]
#     hexbin_plot(ranks_freqs_df['Rank'], ranks_freqs_df['Frequency'], est = ranks_freqs_df['Estimated frequency'])
#     plt.show()
#     hexbin_error(zipf_df['Rank (log)'], zipf_df['Error'])
#     plt.show()
    
    return zipf_df

# Mann-Whitney test

In [24]:
# Divides a big corpus into "n" subcorpera and calculates the frequencies for each
# subcorpus. Returns a dataframe containing the frequencies by word and by rank.
def sample_corpora(corpus, text, n=10, norm=True, subclasses=False):
    num_corp = len(corpus)
    corpus = [item for sublist in corpus for item in sublist]
    rank_corp, freq_corp = subsampling(corpus, k=num_corp*100, m=n)

    by_rank = pd.DataFrame()
    by_word = pd.DataFrame()

    ranks_freqs_df = ranks_freqs(rank_corp, freq_corp, text=text, norm=norm)
    ranks_freqs_df = zipfs_law(ranks_freqs_df, print_stats=False)
    ranks_freqs_df['Error'] = abs(ranks_freqs_df['Frequency'] - ranks_freqs_df['Estimated frequency'])
    ranks_freqs_df['Tot rank'] = ranks_freqs_df['Rank'].rank(method='first')
    
    by_ranks_pos = ranks_freqs_df.copy()
    by_ranks_pos.reset_index(inplace=True)

    if subclasses == False:
        by_ranks_pos['level_1'] = by_ranks_pos['level_1'].replace({'NN': 'Noun', 'NNS':'Noun', 
                                           'NNP':'Noun', 'VB':'Verb', 'VBD':'Verb', 
                                           'VBG':'Verb', 'VBN':'Verb', 'VBP':'Verb', 
                                          'VBZ':'Verb', 'JJ':'Adjective', 
                                           'JJR':'Adjective', 'JJS':'Adjective', 'RB':'Adverb',
                                          'RBR':'Adverb', 'RBS':'Adverb'})
    

    by_ranks_pos['PoS rank'] = by_ranks_pos.groupby('level_1')['Rank'].rank(method='first')
    by_ranks_pos = by_ranks_pos.set_index(['level_1', 'PoS rank'])
    
    if subclasses == False:
        classes = ['Noun', 'Verb', 'Adjective', 'Adverb']
    else:
        classes = ['NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    
    by_ranks_pos = by_ranks_pos.loc[classes]
    by_ranks_pos = by_ranks_pos.filter(regex='c_frequency|Frequency|Error')
    by_ranks_pos = by_ranks_pos.rename(columns={"Frequency": "{} mean freq".format(text),
                                         "Error": "{} error".format(text)})
    

    
    by_rank = ranks_freqs_df.filter(regex='Tot rank|c_frequency|Frequency|Error').set_index(['Tot rank'])
    by_rank = by_rank.rename(columns={"Frequency": "{} mean freq".format(text),
                                         "Error": "{} error".format(text)})
    
    by_word = ranks_freqs_df.filter(regex='c_frequency|Frequency|Error')
    by_word = by_word.rename(columns={"Frequency": "{} mean freq".format(text),
                                         "Error": "{} error".format(text)})

    by_word = by_word.sort_values(by=['{} mean freq'.format(text)], ascending=False)
    by_rank = by_rank.sort_values(by=['Tot rank'])
    by_ranks_pos = by_ranks_pos.sort_index(by=['level_1', 'PoS rank'])
    
    by_word = by_word.fillna(0)
    by_rank = by_rank.dropna()
    by_ranks_pos = by_ranks_pos.dropna()
    
    return by_word, by_rank, by_ranks_pos
#     return None

In [180]:
# Takes 2 corpora and aligns their frequency values by specific words and ranks 
# so that the Mann-Whitney test can be applied to the frequencies of every word
# or rank.
def mann_whitney_df(corpus1, corpus2, n=10, t=0, norm=True, subclasses=False):
    words_corpus1, ranks_corpus1, ranks_pos_corpus1 = sample_corpora(corpus1, text="C1", n=n, norm=norm, subclasses=subclasses)
    words_corpus2, ranks_corpus2, ranks_pos_corpus2 = sample_corpora(corpus2, text="C2", n=n, norm=norm, subclasses=subclasses)
    
    words_df = pd.concat([words_corpus1, words_corpus2], axis=1)
    temp = words_df.loc[:, words_df.columns.str.contains('freq')].fillna(0)
    words_df.loc[:, words_df.columns.str.contains('freq')] = temp
    words_df['error diff'] = abs(words_df['C1 error'] - words_df['C2 error'])
#     words_df = words_df.fillna(0)
    
    
    ranks_df = pd.concat([ranks_corpus1, ranks_corpus2], axis=1)
    ranks_df = ranks_df.dropna()
    ranks_df['error diff'] = abs(ranks_df['C1 error'] - ranks_df['C2 error'])

    ranks_pos_df = pd.concat([ranks_pos_corpus1, ranks_pos_corpus2], axis=1)
    ranks_pos_df = ranks_pos_df.dropna()
    return words_df, ranks_df, ranks_pos_df

In [43]:
# Applies the Mann-Whitney test to a dataframe containing frequencies per word
# or rank.
def mann_whitney_test(df, n=10):
    stats = []
    p = []
    
    df1 = df.loc[:, df.columns.str.contains('frequency')]
    
    for index, row in df1.iterrows():
        mann = mannwhitneyu(row[0:n], row[n:])
#         mann = chisquare(row[0:n], row[n:])
        stats.append(mann[0])
        p.append(mann[1])
    df['statistics'] = stats
    df['p-value'] = p
    df["H0"] = df['p-value'] > 0.05
    df = df.filter(regex='mean freq|error|fano|statistics|p-value|H0')
    
    return df

In [27]:
# Takes 2 corpora, and applies the Mann-Whitney test to ranks and specific words.
# Returns dataframes containing the results for both methods.
def mann_whitney_words_ranks(corpus1, corpus2, n=10, repeat = 10, t=0, norm=True, subclasses=False):
    words_df, ranks_df, ranks_pos_df = mann_whitney_df(corpus1, corpus2, n=n, t=t, norm=norm, subclasses=subclasses)
    df1 = mann_whitney_test(words_df, n=n)
    df2 = mann_whitney_test(ranks_df, n=n)
    df3 = mann_whitney_test(ranks_pos_df, n=n)
    return df1, df2, df3

In [28]:
# Takes 2 corpora, and applies the Mann-Whitney test to specific words and ranks.
# Calculates for both methods the percentage of words/ranks that reject H0.
def stats_mw(corpus1, corpus2, n=10, t=0, norm=True, print_stats=True, subclasses=False):
    words_df, ranks_df, ranks_pos_df = mann_whitney_words_ranks(corpus1, corpus2, n=n, t=t, norm=norm, subclasses=subclasses)

    tot_words = len(words_df)
    no_h0_words = len(words_df.loc[words_df['p-value']<=0.05])
    perc_words = no_h0_words/tot_words*100
    
    tot_ranks = len(ranks_df)
    no_h0_ranks = len(ranks_df.loc[ranks_df['p-value']<=0.05])
    perc_ranks = no_h0_ranks/tot_ranks*100
    
    if print_stats:
        print("WORDS:\n")
        print("Total words: ", tot_words)
        print("No H0: ", no_h0_words)
        print("Percentage: ", perc_words)
    
        print("\n\nRANKS:\n")
        print("Total ranks: ", tot_ranks)
        print("No H0: ", no_h0_ranks)
        print("Percentage: ", perc_ranks)
    
    stats = [perc_words, perc_ranks]
    
    return words_df, ranks_df, stats, ranks_pos_df

In [29]:
# Input: a dataframe with calculated Mann-Whitney values (of 2 corpora) for words 
# or ranks.
# Returns: dataframe with percentages of words or ranks that reject H0, grouped by
# ranks.
def dif_ranks(df, error=True):
    df_by_ranks = pd.Series()
    error_by_ranks = pd.Series()
    
    ranks = df[0:10]
    
    try:
        top_ten = len(ranks.loc[ranks['p-value']<=0.05]) * 10
    except ZeroDivisionError:
        top_ten = None
    df_by_ranks['1-10'] = top_ten
    
    if error:
        error_ten = ranks['error diff'].mean()
        error_by_ranks['1-10'] = error_ten
    cur_ranks = 10
    max_rank = len(df)
    
    while 2*cur_ranks <= max_rank:
        ranks = df[cur_ranks:2*cur_ranks]
        try:
            perc = len(ranks.loc[ranks['p-value']<=0.05])/len(ranks) * 100
        except ZeroDivisionError:
            perc = None
        df_by_ranks['{}-{}'.format(cur_ranks+1,2*cur_ranks)] = perc
        if error:
            error1 = ranks['error diff'].mean()
            error_by_ranks['{}-{}'.format(cur_ranks+1,2*cur_ranks)] = error1
        cur_ranks *= 2
        
    ranks = df[cur_ranks:]
    try:
        perc = len(ranks.loc[ranks['p-value']<=0.05])/len(ranks) * 100
    except ZeroDivisionError:
        perc = None
    df_by_ranks['{}-end'.format(cur_ranks+1,max_rank)] = perc
    
    if error:
        error1 = ranks['error diff'].mean()
        error_by_ranks['{}-end'.format(cur_ranks+1,2*cur_ranks)] = error1
        return df_by_ranks, error_by_ranks
    
    return df_by_ranks.sort_index()

In [30]:
# Takes 2 corpora, and applies the Mann-Whitney procedure to "times" subparts
# of both corpora. 
# Returns dataframes containing distributions of the total percentages as well 
# as per-rank percentages of rejected H0 ranks and words.
def stats_dist(corpus1, corpus2, times=10, n=10, t=0, norm=True, subclasses=False):
    if subclasses == False:
        classes = ['Noun', 'Verb', 'Adjective', 'Adverb']
    else:
        classes = ['NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    
    len_corp = int(len(corpus1)/times)
    ranks_stats_df = {}
    words_stats_df = {}
    
    pos_stats_df = pd.DataFrame()
    
    error_ranks_stats_df = {}
    error_words_stats_df = {}
    
    
    for i in range(times):
        i_words = {}
        i_ranks = {}
        
        i_error_words = {}
        i_error_ranks = {}
        
        corpus1_samp = corpus1[i*len_corp:(i+1)*len_corp]
        corpus2_samp = corpus2[i*len_corp:(i+1)*len_corp]
        words_df, ranks_df, stats, ranks_pos_df = stats_mw(corpus1_samp, corpus2_samp, n=n, t=t, norm=norm, print_stats=False, subclasses=subclasses)
        words_df['comb mean'] = words_df[['C1 mean freq', 'C2 mean freq']].mean(axis=1)
        words_df = words_df.sort_values(by=['comb mean'], ascending=False)
        ranks_df = ranks_df.sort_values(by=['Tot rank'])
        
        
        dif_ranks_words, dif_ranks_error_words = dif_ranks(words_df)
        dif_ranks_ranks, dif_ranks_error_ranks = dif_ranks(ranks_df)
        dif_ranks_pos = dif_ranks(ranks_pos_df, error=False)
        
        
        i_words['total']= stats[0]
        for index, value in dif_ranks_words.items():
            i_words[index] = value
        
        i_ranks['total'] = stats[1]
        for index, value in dif_ranks_ranks.items():
            i_ranks[index] = value
         
        
    
        i_pos = pd.DataFrame()
        pos = {}
        for pos_class in classes:
            pos[i] = {}
            df = ranks_pos_df.loc[[pos_class]]
            try:
                pos[i][(pos_class, 'total')] = len(df.loc[df['p-value']<=0.05])/len(df) * 100
            except ZeroDivisionError:
                pos[i][(pos_class,'total')] = None
            dif_ranks_pos = dif_ranks(ranks_pos_df.loc[[pos_class]], error=False).sort_index()
            for index, value in dif_ranks_pos.items():
                pos[i][(pos_class,index)] = value
            i_pos = pd.concat([i_pos, pd.DataFrame(pos)], axis=0)
            
        

        pos_stats_df = pd.concat([i_pos, pos_stats_df], axis=1).sort_index()
        words_stats_df[i] = i_words
        ranks_stats_df[i] = i_ranks
        
#         print(ranks_df.loc[ranks_df['H0'] == False]['error diff'])
        
        i_error_words['total'] = words_df['error diff'].mean()
        for index, value in dif_ranks_error_words.items():
            i_error_words[index] = value
        
        i_error_ranks['total'] = ranks_df['error diff'].mean()
        for index, value in dif_ranks_error_ranks.items():
            i_error_ranks[index] = value
            
        error_words_stats_df[i] = i_error_words
        error_ranks_stats_df[i] = i_error_ranks
        
   
    return pd.DataFrame(words_stats_df), pd.DataFrame(ranks_stats_df), pos_stats_df, pd.DataFrame(error_words_stats_df), pd.DataFrame(error_ranks_stats_df)

In [67]:
def readable_stats(corpus1, corpus2, times=10, n=10, t=0, norm=True, subclasses=False):
    words_stats_df, ranks_stats_df, pos_stats_df, words_error_stats_df, ranks_error_stats_df = stats_dist(corpus1, corpus2, times=times, n=n, t=t, norm=norm, subclasses=subclasses)
    ranks = pd.DataFrame()
    words = pd.DataFrame()
    pos = pd.DataFrame()
    
    words['mean perc'] = words_stats_df.mean(axis=1)
#     words['std perc'] = words_stats_df.std(axis=1)
#     words['mean error'] = words_error_stats_df.mean(axis=1)
#     words['std error'] = words_error_stats_df.std(axis=1)

    
    ranks['mean perc'] = ranks_stats_df.mean(axis=1)
#     ranks['std perc'] = ranks_stats_df.std(axis=1)
#     ranks['mean error'] = ranks_error_stats_df.mean(axis=1)
#     ranks['std error'] = ranks_error_stats_df.std(axis=1)
    
    pos['mean perc'] = pos_stats_df.mean(axis = 1)
#     pos['std perc'] = pos_stats_df.std(axis = 1)
    
#     return words, ranks, pos, words_stats_df, ranks_stats_df, pos_stats_df, words_error_stats_df, ranks_error_stats_df


    return words, ranks, pos

In [32]:
def parameter_tests(size, start=0, times=10, n=10):
    words_gpt_web, ranks_gpt_web, pos_gpt_web = readable_stats(gpt_train_20000[start:start+size*times], web_train_20000[start:start+size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    words_web_web, ranks_web_web, pos_web_web = readable_stats(web_train_20000[start:start+size*times], web_train_20000[start+size*times:start+2*size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    words_gpt_gpt, ranks_gpt_gpt, pos_gpt_gpt = readable_stats(gpt_train_20000[start:start+size*times], gpt_train_20000[start+size*times:start+2*size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    
    words = pd.concat([words_gpt_web, words_web_web, words_gpt_gpt], axis=1)
    ranks = pd.concat([ranks_gpt_web, ranks_web_web, ranks_gpt_gpt], axis=1)
    pos =  pd.concat([pos_gpt_web, pos_web_web, pos_gpt_gpt], axis=1)
    return words, ranks, pos
    

In [63]:
def parameter_tests2(size, start=0, times=10, n=10):
    words_gpt_web, ranks_gpt_web, pos_gpt_web = readable_stats(gpt_set[start:start+size*times], human_set[start:start+size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    words_web_web, ranks_web_web, pos_web_web = readable_stats(human_set[start:start+size*times], human_set[start+size*times:start+2*size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    words_gpt_gpt, ranks_gpt_gpt, pos_gpt_gpt = readable_stats(gpt_set[start:start+size*times], gpt_set[start+size*times:start+2*size*times], times=times, n=n, t=0, norm=True, subclasses=False)
    
    words = pd.concat([words_gpt_web, words_web_web, words_gpt_gpt], axis=1)
    ranks = pd.concat([ranks_gpt_web, ranks_web_web, ranks_gpt_gpt], axis=1)
    pos =  pd.concat([pos_gpt_web, pos_web_web, pos_gpt_gpt], axis=1)
    return words, ranks, pos
    

In [1]:
final_1 = parameter_tests(1, start=0, times=10, n=10)

In [319]:
final_10 = parameter_tests(10, start=0, times=10, n=10)

In [None]:
final_100 = parameter_tests(100, start=0, times=10, n=10)

In [None]:
final_1000 = parameter_tests(1000, start=0, times=10, n=10)

In [None]:
final_10000 = parameter_tests(10000, start=0, times=10, n=10)

In [320]:
final_10[0]

Unnamed: 0,mean perc,mean perc.1,mean perc.2
1-10,66.0,42.0,48.0
11-20,52.0,46.0,45.0
1281-2560,1.904297,7.515625,
1281-end,0.0,,0.0
161-320,62.75,60.875,57.1875
21-40,48.0,40.5,48.5
2561-end,0.0,0.0,
321-640,68.71875,69.28125,65.21875
41-80,53.25,45.25,51.0
641-1280,73.734375,80.109375,55.265625


In [322]:
final_10[1]

Unnamed: 0,mean perc,mean perc.1,mean perc.2
1-10,49.0,34.0,43.0
11-20,49.090909,36.363636,32.727273
1281-end,2.941176,3.175683,3.219814
161-320,24.534161,23.043478,26.21118
21-40,46.666667,29.52381,27.142857
321-640,11.869159,13.551402,11.339564
41-80,40.487805,34.146341,32.195122
641-1280,4.75819,5.99064,4.9922
641-end,5.212066,,5.220293
81-160,30.740741,25.308642,25.432099


In [323]:
final_10[2]

Unnamed: 0,Unnamed: 1,mean perc,mean perc.1,mean perc.2
Adjective,1-10,37.0,23.0,28.0
Adjective,11-20,30.0,25.0,28.0
Adjective,161-end,0.0,5.657982,1.086957
Adjective,21-40,22.5,17.0,22.0
Adjective,41-80,10.25,9.0,11.75
Adjective,81-160,3.75,8.0,7.1875
Adjective,81-end,5.607917,,7.033289
Adjective,total,11.892367,9.716776,12.915096
Adverb,1-10,49.0,40.0,41.0
Adverb,11-20,27.0,26.0,24.0


In [58]:
testing = readable_stats(human_set[0:10], gpt_set[0:10], times=10, n=10, t=0, norm=True, subclasses=False)

In [137]:
testing[0].reset_index()[testing[0].reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")]

Unnamed: 0,index,mean perc,std perc
1,1-10,53.0,13.374935
2,11-20,52.0,9.189366
3,21-40,53.0,13.374935
4,41-80,65.25,11.928607


In [60]:
testing[1]

Unnamed: 0,mean perc,std perc
total,9.982679,2.107774
1-10,44.0,15.055453
11-20,32.727273,17.248787
21-40,26.190476,8.76616
41-80,10.731707,3.085149
81-160,7.283951,3.104201
161-end,3.037089,1.549318
161-320,0.621118,
321-end,0.0,


In [61]:
testing[2]

Unnamed: 0,Unnamed: 1,mean perc,std perc
Adjective,1-10,16.0,13.498971
Adjective,11-20,5.0,7.559289
Adjective,11-end,0.0,0.0
Adjective,21-40,5.0,
Adjective,21-end,10.714286,19.669895
Adjective,41-end,0.0,
Adjective,total,9.977358,5.984382
Adverb,1-10,9.0,8.75595
Adverb,11-end,6.666667,11.547005
Adverb,total,9.121795,8.680431


# Automated text classification

In [188]:
# Takes 2 corpora, and applies the Mann-Whitney procedure to "times" subparts
# of both corpora. 
# Returns dataframes containing distributions of the total percentages as well 
# as per-rank percentages of rejected H0 ranks and words.
def stats_dist2(corpus1, corpus2, times=10, n=10, t=0, norm=True, subclasses=False):
    if subclasses == False:
        classes = ['Noun', 'Verb', 'Adjective', 'Adverb']
    else:
        classes = ['NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    
    len_corp = int(len(corpus2)/times)
    ranks_stats_df = {}
    words_stats_df = {}
    
    pos_stats_df = pd.DataFrame()
    
    error_ranks_stats_df = {}
    error_words_stats_df = {}
    
    
    for i in range(times):
        i_words = {}
        i_ranks = {}
        
        i_error_words = {}
        i_error_ranks = {}
        
        corpus1_samp = corpus1
        corpus2_samp = corpus2[i*len_corp:(i+1)*len_corp]
        words_df, ranks_df, stats, ranks_pos_df = stats_mw(corpus1_samp, corpus2_samp, n=n, t=t, norm=norm, print_stats=False, subclasses=subclasses)
        words_df['comb mean'] = words_df[['C1 mean freq', 'C2 mean freq']].mean(axis=1)
        words_df = words_df.sort_values(by=['comb mean'], ascending=False)
        ranks_df = ranks_df.sort_values(by=['Tot rank'])
        
        
        dif_ranks_words, dif_ranks_error_words = dif_ranks(words_df)
        dif_ranks_ranks, dif_ranks_error_ranks = dif_ranks(ranks_df)
        dif_ranks_pos = dif_ranks(ranks_pos_df, error=False)
        
        
        i_words['total']= stats[0]
        for index, value in dif_ranks_words.items():
            i_words[index] = value
        
        i_ranks['total'] = stats[1]
        for index, value in dif_ranks_ranks.items():
            i_ranks[index] = value
         
        
    
        i_pos = pd.DataFrame()
        pos = {}
        for pos_class in classes:
            pos[i] = {}
            df = ranks_pos_df.loc[[pos_class]]
            try:
                pos[i][(pos_class, 'total')] = len(df.loc[df['p-value']<=0.05])/len(df) * 100
            except ZeroDivisionError:
                pos[i][(pos_class,'total')] = None
            dif_ranks_pos = dif_ranks(ranks_pos_df.loc[[pos_class]], error=False).sort_index()
            for index, value in dif_ranks_pos.items():
                pos[i][(pos_class,index)] = value
            i_pos = pd.concat([i_pos, pd.DataFrame(pos)], axis=0)
            
        

        pos_stats_df = pd.concat([i_pos, pos_stats_df], axis=1).sort_index()
        words_stats_df[i] = i_words
        ranks_stats_df[i] = i_ranks
        
#         print(ranks_df.loc[ranks_df['H0'] == False]['error diff'])
        
        i_error_words['total'] = words_df['error diff'].mean()
        for index, value in dif_ranks_error_words.items():
            i_error_words[index] = value
        
        i_error_ranks['total'] = ranks_df['error diff'].mean()
        for index, value in dif_ranks_error_ranks.items():
            i_error_ranks[index] = value
            
        error_words_stats_df[i] = i_error_words
        error_ranks_stats_df[i] = i_error_ranks
        
   
    return pd.DataFrame(words_stats_df), pd.DataFrame(ranks_stats_df), pos_stats_df, pd.DataFrame(error_words_stats_df), pd.DataFrame(error_ranks_stats_df)

In [189]:
def readable_stats2(corpus1, corpus2, times=10, n=10, t=0, norm=True, subclasses=False):
    words_stats_df, ranks_stats_df, pos_stats_df, words_error_stats_df, ranks_error_stats_df = stats_dist2(corpus1, corpus2, times=times, n=n, t=t, norm=norm, subclasses=subclasses)
    ranks = pd.DataFrame()
    words = pd.DataFrame()
    pos = pd.DataFrame()
    
    words['mean perc'] = words_stats_df.mean(axis=1)
    ranks['mean perc'] = ranks_stats_df.mean(axis=1)
    
    pos['mean perc'] = pos_stats_df.mean(axis = 1)
    
    return words, ranks, pos

In [192]:
# Funtion that automatically classifes corpora as being human or machine, using
# the Mann-Whitney test based on the top 60 ranks of individual words and ranks.
#
# Parameters:
# - gpt: True when you're classifying gpt-generated corpus, false when
# you're classifying a human-written corpus.
# 
# - size: amount of texts you want to classify at a time (I normally use 10 
# so that you have about 10 x 1.000 = 10.000 tokens)
# 
# - own_set: When none, it will use the webtext and gpt set, or you can enter
# Riemer's sets which it will then use.
# 
# - rep: amount of repetitions, so how many corpora of size "size" you want to 
# classify
# 
# - times = how many times you want the mann-whitney test to be repeated on a corpus
# 
# - n: mann-whitney parameter, namely the amount of subcorpora you want to divide the
# input into

# returns: the amount of times a text got classified as human and as machine-generated
# for both the use of individual words and ranks, alongside a list of the mean 
# percentages for these outcomes.

def test_rep(gpt, size, rep = 50, times = 5, n=10):
    j = size
    
    num_corp = size * times
    
    w_human = 0
    w_machine = 0
    r_human = 0
    r_machine = 0
    
    means_w_human = []
    means_w_machine =[]
    means_r_human = []
    means_r_machine = []

    
    for i in range(rep):
        random.shuffle(gpt_train_20000)
        random.shuffle(web_train_20000)
        
        if gpt == True:
            corpus = gpt_train_20000[:size]
        else:
            corpus = web_train_20000[:size]
        
        web = web_train_20000[size:size+num_corp]
        gpt = gpt_train_20000[size:size+num_corp]
        words_web, ranks_web, pos_web = readable_stats2(corpus, web, times=times, n=n)
        words_gpt, ranks_gpt, pos_gpt = readable_stats2(corpus, gpt, times=times, n=n)
        
#         print(words_web, words_gpt)
#         print(ranks_web, ranks_gpt)
        
        words_web_mean = words_web.reset_index()[words_web.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()[0]
        ranks_web_mean = ranks_web.reset_index()[ranks_web.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()[0]
#         pos_both_mean = pos_both.reset_index()[pos_both.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()
        
        words_gpt_mean = words_gpt.reset_index()[words_gpt.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()[0]
        ranks_gpt_mean = ranks_gpt.reset_index()[ranks_gpt.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()[0]
#         pos_human_mean = pos_human.reset_index()[pos_human.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()
        

        if words_web_mean < words_gpt_mean:
            w_human += 1
        
        else:
            w_machine += 1
        
        if ranks_web_mean < ranks_gpt_mean:
            r_human += 1
        else:
            r_machine += 1
        
        means_w_human.append(words_web_mean)
        means_w_machine.append(words_gpt_mean)
        means_r_human.append(ranks_web_mean)
        means_r_machine.append(ranks_gpt_mean)
        
    means_list = [np.mean(means_w_human), np.mean(means_r_human), np.mean(means_w_machine), np.mean(means_r_machine)]
    return w_human, r_human, w_machine, r_machine, means_list

In [195]:
test_rep(True, 10, rep = 10, times = 10, n=10)

           mean perc
total      34.532011
1-10       53.000000
11-20      45.000000
21-40      55.000000
41-80      58.500000
81-160     63.125000
161-320    61.312500
321-640    71.468750
641-1280   70.953125
1281-2560   1.158854
2561-end    0.000000
1281-end    0.000000           mean perc
total     34.229362
1-10      42.000000
11-20     43.000000
21-40     43.500000
41-80     43.500000
81-160    48.000000
161-320   55.312500
321-640   64.906250
641-1280  46.500000
1281-end   0.000000
         mean perc
total    15.843740
1-10     56.000000
11-20    41.818182
21-40    44.761905
41-80    50.487805
81-160   38.888889
161-320  24.223602
321-640  12.398754
641-end   6.451869          mean perc
total    13.266271
1-10     33.000000
11-20    36.363636
21-40    30.000000
41-80    32.682927
81-160   29.876543
161-320  24.037267
321-640  12.087227
641-end   5.012370
           mean perc
total      32.220346
1-10       45.000000
11-20      40.000000
21-40      47.000000
41-80      46.500000
8

           mean perc
total      30.898915
1-10       51.000000
11-20      48.000000
21-40      40.000000
41-80      44.500000
81-160     53.625000
161-320    62.312500
321-640    67.625000
641-1280   81.296875
1281-2560   5.726562
2561-end    0.000000            mean perc
total      32.264489
1-10       61.000000
11-20      55.000000
21-40      52.000000
41-80      55.500000
81-160     53.750000
161-320    59.937500
321-640    66.406250
641-1280   74.750000
1281-2560   0.875000
2561-end    0.000000
          mean perc
total     10.464700
1-10      42.000000
11-20     35.454545
21-40     34.761905
41-80     30.975610
81-160    28.148148
161-320   23.043478
321-640   12.398754
641-1280   6.677067
1281-end   3.394208           mean perc
total     13.469477
1-10      65.000000
11-20     46.363636
21-40     44.285714
41-80     42.682927
81-160    31.481481
161-320   23.354037
321-640   12.959502
641-1280   5.018201
1281-end   3.804129
641-end    5.780519


(8, 9, 2, 1, [46.23125, 35.873337028824835, 52.49375, 41.89892566782811])

In [235]:
def test_rep2(kind, size, rep = 50, times = 5, n=10, sub=True):
    j = size
    
    num_corp = size * times
    
    w_human = 0
    w_machine = 0
    r_human = 0
    r_machine = 0
    
    means_w_human = []
    means_w_machine =[]
    means_r_human = []
    means_r_machine = []
    
    wrong_r = []
    right_r = []
    wrong_w = []
    right_w = []
    
    indices = random.sample(range(0, int(20000/size)), rep)
    
    if kind == "gpt":
        random.shuffle(test_gpt)
        corpora = [test_gpt[i*size:(i+1)*size] for i in indices]
    else:
        random.shuffle(test_web)
        corpora = [test_web[i*size:i*size+size] for i in indices]
    
    
    for i in range(rep):
        corpus = corpora[i]
        random.shuffle(web_train_20000)
        random.shuffle(gpt_train_20000)
        web = web_train_20000[0:num_corp]
        gpt = gpt_train_20000[0:num_corp]
        words_web, ranks_web, pos_web = readable_stats2(corpus, web, times=times, n=n)
        words_gpt, ranks_gpt, pos_gpt = readable_stats2(corpus, gpt, times=times, n=n)

        
        if words_web['mean perc'][1:4].mean() < words_gpt['mean perc'][1:4].mean():
            w_human += 1
            wrong_w.append(corpus)
        else:
            w_machine += 1
            right_w.append(corpus)
        
        if ranks_web['mean perc'][1:4].mean() < ranks_gpt['mean perc'][1:4].mean():
            r_human += 1
            wrong_r.append(corpus)
        else:
            r_machine += 1
            right_r.append(corpus)
            
        means_w_human.append(words_web['mean perc'][1:4].mean())
        means_w_machine.append(words_gpt['mean perc'][1:4].mean())
        means_r_human.append(ranks_web['mean perc'][1:4].mean())
        means_r_machine.append(ranks_gpt['mean perc'][1:4].mean())
        
    means_list = [np.mean(means_w_human), np.mean(means_w_machine), np.mean(means_r_human), np.mean(means_r_machine)]
    return [w_human, r_human, w_machine, r_machine], means_list, wrong_r, right_r, wrong_w, right_w

In [204]:
test_rep2("gpt", 10, 10, 10, 10, True)

(2, 2, 8, 8, [55.35, 48.4875, 44.09827367754197, 34.12089008552423])

In [2]:
uno = test_rep2("gpt", 1, 10, 10, 10, True)

In [None]:
# gpt_test_100 = test_rep2("gpt", 100, 10, 10, 10, True)

In [None]:
test_rep2("gpt", 1, 10, 10, 10, True)

# Automatic text classification - Wiki

In [177]:
# Funtion that automatically classifes corpora as being human or machine, using
# the Mann-Whitney test based on the top 60 ranks of individual words and ranks.
#
# Parameters:
# - gpt: True when you're classifying gpt-generated corpus, false when
# you're classifying a human-written corpus.
# 
# - size: amount of texts you want to classify at a time (I normally use 10 
# so that you have about 10 x 1.000 = 10.000 tokens)
# 
# - own_set: When none, it will use the webtext and gpt set, or you can enter
# Riemer's sets which it will then use.
# 
# - rep: amount of repetitions, so how many corpora of size "size" you want to 
# classify
# 
# - times = how many times you want the mann-whitney test to be repeated on a corpus
# 
# - n: mann-whitney parameter, namely the amount of subcorpora you want to divide the
# input into

# returns: the amount of times a text got classified as human and as machine-generated
# for both the use of individual words and ranks, alongside a list of the mean 
# percentages for these outcomes.

# 1 = gpt-wiki
# 2 = wiki-wiki
# 3 = gpt-gpt

def test_rep2(size, total=False, rep = 50, n=10):
    j = size
    inds = []
    
    if total:
        crop = len(human_set) - int(len(human_set)%size)
        human = human_set[:crop]
        gpt = gpt_set[:crop]
        rep = crop/size
    
    else:
        human = human_set
        gpt = gpt_set
    
#     num_corp = size * times
    
    w_correct = 0
    w_false = 0
    r_correct = 0
    r_false = 0
    
    means_w = [[], [], []]
    means_r =  [[], [], []]

    
    for i in range(rep):
        ind = 0
        while ind == 0 or (ind > (i-1)*size and ind < (i+1)*size):
            ind = random.randint(0, len(human_set) - size)
        inds.append(ind)
        
        gpt_comp = gpt[i*size:(i+1)*size]
        human_comp = human[i*size:(i+1)*size]
    
        
        human_rand = human[ind:ind+size]
        gpt_rand = gpt[ind:ind+size]
        
#         print(gpt_comp, human_comp)
    
                

        words_both, ranks_both, pos_both = readable_stats(gpt_comp, human_comp, times=1, n=n)
        words_human, ranks_human, pos_human = readable_stats(human_comp, human_rand, times=1, n=n)
        words_gpt, ranks_gpt, pos_gpt = readable_stats(gpt_comp, gpt_rand, times=1, n=n)
        
        words_both_mean = words_both.reset_index()[words_both.reset_index()['index'].str.contains("1-10|11-20|21-40")].mean()[0]
        ranks_both_mean = ranks_both.reset_index()[ranks_both.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()[0]
#         pos_both_mean = pos_both.reset_index()[pos_both.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()
        
        words_human_mean = words_human.reset_index()[words_human.reset_index()['index'].str.contains("1-10|11-20|21-40")].mean()[0]
        ranks_human_mean = ranks_human.reset_index()[ranks_human.reset_index()['index'].str.contains("1-10|11-20|21-40")].mean()[0]
#         pos_human_mean = pos_human.reset_index()[pos_human.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()
        
        words_gpt_mean = words_gpt.reset_index()[words_gpt.reset_index()['index'].str.contains("1-10|11-20|21-40")].mean()[0]
        ranks_gpt_mean = ranks_gpt.reset_index()[ranks_gpt.reset_index()['index'].str.contains("1-10|11-20|21-40")].mean()[0]
#         pos_gpt_mean = pos_gpt.reset_index()[pos_gpt.reset_index()['index'].str.contains("1-10|11-20|21-40|41-80")].mean()
        
        if words_both_mean > words_gpt_mean and words_both_mean > words_human_mean:
            w_correct += 1
        
        else:
            w_false += 1
        
        if ranks_both_mean > ranks_gpt_mean and ranks_both_mean > ranks_human_mean:
            r_correct += 1
        
        else:
            r_false += 1
        
        means_w[0].append(words_both_mean)
        means_w[1].append(words_human_mean)
        means_w[2].append(words_gpt_mean)
        
        means_r[0].append(ranks_both_mean)
        means_r[1].append(ranks_human_mean)
        means_r[2].append(ranks_gpt_mean)
        
    means_w = [np.mean(w) for w in means_w]
    means_r = [np.mean(r) for r in means_r]
    return w_correct, w_false, r_correct, r_false, means_w, means_r
    

In [178]:
def test_demo(size):
    w_correct, w_false, r_correct, r_false, means_w, means_r = test_rep2(size, total=False, rep = 1, n=10)
    if w_correct > w_false:
        print("words: different source")
    else:
        print("words: same source")
        
    if r_correct > r_false:
        print("ranks: different source")
    else:
        print("ranks: same source")

In [179]:
test_demo(10)

words: different source
ranks: same source


# Color stuff

In [223]:
test_web = pickle.load(open("datasets/test_web.p", "rb" ))

In [224]:
test_gpt = pickle.load(open("datasets/test_gpt.p", "rb" ))

In [243]:
pd.read_csv (r'data/small-117M-k40.test.csv')

Unnamed: 0,id,text,length,ended
0,255001,"Dawn on its way to victory, but where does it ...",1024,False
1,255002,\nDrew Angerer/Getty Images\n\nThere are many ...,1024,False
2,255003,"In my last post, I gave a post on the use of t...",1024,False
3,255004,Brought to you by:\n\nKFC\n\nHogan's Day\n\nSo...,1024,False
4,255005,"\nIt's time for the first time since 2008, and...",480,True
...,...,...,...,...
4995,259996,I've read articles talking about how to manage...,111,True
4996,259997,"The first part of my work-in-progress, The Unf...",1024,False
4997,259998,"A lot of these things are already pretty cool,...",395,True
4998,259999,S. 3148 – 114th Congress (2017-2018) To amend ...,1024,False


In [258]:
df_real_1 = pickle.load(open("df_real_1.pkl", "rb" ))
df_reals = df_real_1[['Rank', 'Frequency']]
df_reals['Tot rank'] = df_reals['Rank'].rank(method='first')

In [225]:
def highlight(colour, text):
    if colour == "red":
        return "\033[1;41m" + str(text) + "\033[1;m"
    if colour == "green":
        return "\033[1;42m" + str(text) + "\033[1;m"
    if colour == "yellow":
        return "\033[1;43m" + str(text) + "\033[1;m"
    if colour == "cyan":
        return "\033[1;46m" + str(text) + "\033[1;m"
    return str(text)

In [306]:
def color_text(corpus):
    text = ""
    count = [0,0,0,0,0,0]
    corpus = [item for sublist in corpus for item in sublist]
#     corpus = [item for sublist in corpus for item in sublist]
    for word, pos in corpus:
        try:
            rank = df_reals.loc[word]['Tot rank'][0]
            if rank <= 10:
                text += highlight("green", word)
                count[0] += 1
            elif rank <= 100:
                text += highlight("yellow", word)
                count[1] += 1
            elif rank <= 1000:
                text += highlight("red", word)
                count[2] += 1
            elif rank <= 10000:
#                 text += highlight("cyan", word)
                text += word
                count[3] += 1
            else:
                text += word
                count[4] += 1

        except KeyError:
            text += highlight("magenta", word)
            count[5] += 1
        text += " "
    count = [0 for num in count if not num]
    count = [num/len(corpus) for num in count]
    return text, count

In [250]:
tot = [0,0,0,0,0,0]
for corpus in test_web:
    corpus = [item for sublist in corpus for item in sublist]
    if len(corpus) == 0:
        continue
    count = color_text(corpus)[1]
    tot = [x + y for x, y in zip(tot, count)]

In [251]:
tot_gpt = [0,0,0,0,0,0]
for corpus in test_gpt:
    corpus = [item for sublist in corpus for item in sublist]
    if len(corpus) == 0:
        continue
    count = color_text(corpus)[1]
    tot_gpt = [x + y for x, y in zip(tot, count)]

In [216]:
[num/len(test_web) for num in tot]

[160.08554996405465,
 165.39611790079078,
 157.84974838245867,
 175.60388209920922,
 76.72969086987779,
 29.74550682961898]

In [217]:
[num/len(test_gpt) for num in tot_gpt]

[133.48532055122828,
 137.86339125224686,
 131.64230077890952,
 146.46434991012583,
 63.98382264829239,
 24.829239065308567]

In [303]:
gpt_wrong_2 = color_text(wrong_corp)

In [325]:
web_1 = color_text(test_web[0])

In [326]:
print(web_1[0])

editor s [1;41mnote[1;m [1;43mthis[1;m [1;41mpost[1;m [1;42mis[1;m [1;41mpart[1;m [1;42mof[1;m [1;42mthe[1;m overheard [1;42mon[1;m cnncom [1;41mseries[1;m [1;42ma[1;m regular feature [1;43mthat[1;m examines interesting [1;41mcomments[1;m [1;42mand[1;m thoughtprovoking conversations posted [1;43mby[1;m [1;42mthe[1;m [1;41mcommunity[1;m [1;41mformer[1;m senator [1;42mand[1;m [1;41mdemocratic[1;m [1;41mpresidential[1;m candidate [1;41mjohn[1;m edwards [1;42mis[1;m beginning [1;43mhis[1;m criminal trial [1;42mon[1;m felony [1;42mand[1;m misdemeanor counts dating [1;41mback[1;m [1;42mto[1;m [1;43mhis[1;m failed [1;41mcampaign[1;m [1;43mone[1;m [1;42mof[1;m [1;42mthe[1;m [1;41mkey[1;m allegations [1;41magainst[1;m edwards [1;42mis[1;m [1;43mthat[1;m [1;43mhe[1;m received [1;41m1[1;m [1;41mmillion[1;m [1;42min[1;m illegal [1;41mcampaign[1;m contributions [1;42mto[1;m conceal [1;43mhis[1;m pregnant mistress 

In [327]:
web_2 = color_text(test_web[1])

In [328]:
print(web_2[0])

[1;42min[1;m honour [1;42mof[1;m [1;42mthe[1;m [1;41mfact[1;m [1;43mthat[1;m [1;43mthis[1;m [1;42mis[1;m [1;43mmy[1;m 28th [1;41mpost[1;m [1;42mon[1;m [1;42mthe[1;m [1;41mweek[1;m [1;42mof[1;m [1;43mmy[1;m 28th birthday [1;43mi[1;m [1;41mfeel[1;m [1;43mits[1;m appropriate [1;42mto[1;m celebrate [1;43mthat[1;m [1;41mnumber[1;m [1;43mwith[1;m [1;42ma[1;m [1;41mshort[1;m guide [1;42mto[1;m [1;42mthe[1;m hilariousawesomecrazy [1;41mthings[1;m [1;43mi[1;m ve noticed [1;43mfrom[1;m [1;43mmy[1;m [1;43mfirst[1;m [1;43myear[1;m living [1;42min[1;m austria [1;43mthese[1;m [1;43mare[1;m [1;42mthe[1;m [1;41mlittle[1;m [1;41mthings[1;m [1;43myou[1;m [1;41mneed[1;m [1;42mto[1;m embrace [1;42mto[1;m truly uncover [1;43myour[1;m inner austrian 1be [1;42mon[1;m [1;43mtime[1;m [1;43mthey[1;m [1;41mreally[1;m [1;41mreally[1;m [1;43mlike[1;m [1;41mbeing[1;m [1;42mon[1;m [1;43mtime[1;m [1;42mthe[1;m germ

In [334]:
web_3 = color_text(test_web[2])

In [335]:
print(web_3[0])

logger plusa [1;41msimple[1;m javascript library [1;42mto[1;m [1;41madd[1;m optional functionality [1;41mdirectly[1;m [1;42mto[1;m consolelog avoiding [1;42mthe[1;m [1;41mneed[1;m [1;42mto[1;m refactor existing [1;41mcode[1;m featuresautomatic datetime tagging [1;42mof[1;m log messages [1;43mwith[1;m custom datetime format [1;41mability[1;m [1;42mto[1;m [1;41madd[1;m tags [1;42mto[1;m messages [1;41mcoming[1;m [1;43mfrom[1;m [1;41mparticular[1;m functions [1;41mincluding[1;m [1;41mchild[1;m calls extensible [1;43mwith[1;m custom transformations [1;41mrun[1;m [1;42mon[1;m messages [1;42mand[1;m objects [1;41mbefore[1;m [1;43mthey[1;m [1;43mare[1;m logged [1;41mmakes[1;m [1;41mchanges[1;m [1;41mdirectly[1;m [1;42mto[1;m consolelog [1;43mso[1;m [1;43mno[1;m refactoring [1;41mrequired[1;m [1;42mfor[1;m existing [1;41mcode[1;m [1;43mso[1;m [1;43mno[1;m refactoring [1;41mrequired[1;m [1;42mfor[1;m existing [1;41

In [329]:
gpt_1 = color_text(test_gpt[1])

In [330]:
print(gpt_1[0])

[1;43myou[1;m ve [1;41mfound[1;m [1;43mwhat[1;m [1;43myou[1;m re [1;41mlooking[1;m [1;42mfor[1;m [1;43myou[1;m ve [1;41mgot[1;m [1;43mwhat[1;m [1;42mit[1;m [1;41mtakes[1;m [1;42mto[1;m succeed [1;43mor[1;m [1;43myou[1;m ve [1;41mgot[1;m [1;43mwhat[1;m [1;43myou[1;m [1;41mneed[1;m [1;41mhere[1;m [1;43mare[1;m [1;41m15[1;m [1;42mof[1;m [1;43mour[1;m [1;41mtop[1;m picks [1;43mfrom[1;m 2018 [1;41m1[1;m jeff bezos [1;42mthe[1;m amazon founderwhy [1;43mhave[1;m [1;43mso[1;m [1;43mmany[1;m entrepreneurs [1;41mleft[1;m [1;42mthe[1;m [1;41mbusiness[1;m [1;42mof[1;m creating [1;42mand[1;m selling books [1;43mthat[1;m s [1;43mnot[1;m [1;42ma[1;m [1;41mquestion[1;m [1;41manyone[1;m [1;42min[1;m [1;43mhis[1;m [1;41mposition[1;m [1;43mwould[1;m [1;41mwant[1;m [1;42mto[1;m answer [1;43mbut[1;m [1;43mhe[1;m [1;43mwas[1;m [1;42ma[1;m [1;41mgreat[1;m businessman [1;43mhe[1;m [1;41mcame[1;m [1;43mfr

In [332]:
gpt_2 = color_text(test_gpt[2])

In [333]:
print(gpt_2[0])

[1;41mkeep[1;m reading [1;42mthe[1;m mode [1;43mhas[1;m changed [1;41mhelp[1;m [1;43mme[1;m [1;43mif[1;m [1;41manyone[1;m knows [1;43mwhat[1;m [1;42mthe[1;m fix [1;42mis[1;m [1;43mi[1;m m [1;41mgoing[1;m [1;42mto[1;m fix [1;42mit[1;m [1;41msoon[1;m [1;42min[1;m [1;41mfact[1;m [1;43mwhen[1;m [1;43mi[1;m [1;43mget[1;m [1;41mback[1;m [1;43mi[1;m hope [1;42mto[1;m [1;43mhave[1;m [1;42mit[1;m [1;43mas[1;m [1;41mmuch[1;m [1;43mas[1;m [1;41mpossible[1;m implemented [1;41mwithin[1;m [1;43mthis[1;m [1;41mpost[1;m [1;42min[1;m [1;41mfuture[1;m posts [1;43mwe[1;m ll [1;43mhave[1;m [1;42ma[1;m [1;41mfull[1;m review [1;43mbut[1;m [1;42mfor[1;m [1;43mnow[1;m [1;41mhere[1;m s [1;43mwhat[1;m [1;43mi[1;m [1;43mhave[1;m [1;43mi[1;m m [1;43mnot[1;m [1;42ma[1;m [1;41mbig[1;m fan [1;42mof[1;m fixing [1;43many[1;m [1;41mspecific[1;m bugs [1;43monly[1;m [1;41mthings[1;m [1;43mthat[1;m [1;43mhave[1;

In [337]:
gpt_3 = color_text(test_gpt[3])

In [338]:
print(gpt_3[0])

rationalein [1;42mthe[1;m [1;43mfirst[1;m [1;41mpart[1;m [1;42mof[1;m [1;42mthe[1;m [1;41mseries[1;m [1;43mi[1;m ll [1;41mtry[1;m [1;42mto[1;m explain [1;43mwhat[1;m [1;41mright[1;m [1;41mmeans[1;m [1;42min[1;m [1;41mgeneral[1;m [1;43myou[1;m [1;43mcan[1;m [1;41mread[1;m [1;43mthat[1;m [1;41msection[1;m [1;42mto[1;m [1;41mmake[1;m [1;42mit[1;m [1;41mclear[1;m [1;43mbut[1;m [1;43mif[1;m [1;43myou[1;m re [1;41mstill[1;m [1;42min[1;m [1;42mthe[1;m [1;41mgame[1;m [1;42mthe[1;m [1;41mright[1;m [1;41mpart[1;m [1;42mis[1;m [1;43mnot[1;m necessarily [1;43mabout[1;m [1;41mright[1;m [1;42min[1;m [1;41mgeneral[1;m [1;42mand[1;m [1;42mthe[1;m [1;41mright[1;m [1;41mpart[1;m [1;42mis[1;m [1;43mabout[1;m [1;41mright[1;m [1;42mfor[1;m [1;42mthe[1;m [1;41mgame[1;m [1;41mhere[1;m s [1;42mfor[1;m [1;42mthe[1;m [1;43mfirst[1;m [1;41mpart[1;m [1;42mof[1;m [1;42mthe[1;m [1;41mseries[1;m [1;42mt