# Preprocessing the statistics (after running the MAT tagger; see Section 3.4 of the paper)

In [None]:
import pandas as pd
import glob, os

In [None]:
pwd

'/Users/Documents/dta/CLS/CLS_final'

In [None]:
# Load the statistics CSV for the 1500-token chunks into `df`
# (per the file path, this is the "Statistics" output under MAT_chunks_1500).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact directory layout exists.
df = pd.read_csv("/Users/Documents/dta/CLS/CLS_final/chunks_1500/MAT_chunks_1500/Statistics/Statistics_chunks_1500.csv")

#


In [None]:
import nltk
import numpy as np
import statistics
# Load NLTK's pre-trained English Punkt sentence tokenizer from the local
# NLTK data directory.
# NOTE(review): none of the cells visible in this notebook read `sent_tok`;
# confirm whether it is still needed.
sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
def average_sentence_length(doc):
    """Return the mean number of whitespace-separated tokens per sentence.

    The text is split into sentences with ``nltk.sent_tokenize``; each
    sentence is then split on whitespace and the per-sentence token counts
    are averaged.

    Parameters
    ----------
    doc : str
        Raw text of one chunk.

    Returns
    -------
    number
        The average sentence length rounded to two decimal places, or
        ``nan`` when no sentence is found (e.g. for an empty string), so
        that one empty chunk does not abort a whole ``.apply`` call.
    """
    # No explicit Punkt pickle load here: the previous per-call load was
    # never used, and nltk.sent_tokenize obtains its own tokenizer model.
    sentences = nltk.sent_tokenize(doc)

    # statistics.mean raises StatisticsError on an empty sequence.
    if not sentences:
        return float("nan")

    tokens_per_sentence = [len(sentence.split()) for sentence in sentences]
    return round(statistics.mean(tokens_per_sentence), 2)

In [None]:
import os, glob

In [None]:
def get_temp_df(path, encoding="utf-8"):
    """Build a temporary dataframe describing the chunk text files.

    Parameters
    ----------
    path : str
        A glob pattern (e.g. ``".../chunks_1500/*.txt"``) matching the chunk
        text files. Note that this is a pattern, not a bare directory path.
    encoding : str, optional
        Text encoding used to read the files (default ``"utf-8"``).

    Returns
    -------
    pandas.DataFrame
        One row per matched file, in sorted path order, with the columns

        * ``Filename`` - the file's base name with ``".txt"`` removed,
        * ``years`` - the first four characters of ``Filename`` (a string),
        * ``texts`` - the full text content of the file,
        * ``Average_sentence_length`` - ``average_sentence_length(texts)``.

        The frame is meant to be merged with the statistics dataframe on
        ``Filename``.

    Raises
    ------
    ValueError
        If the pattern matches no files.
    """
    # glob returns matches in arbitrary order; sort for reproducible output.
    matched_files = sorted(glob.glob(path))
    if not matched_files:
        raise ValueError("No files matched the pattern {!r}".format(path))

    records = []
    for filename in matched_files:
        with open(filename, encoding=encoding) as f:
            text = f.read()

        fn = os.path.basename(filename).replace(".txt", "")
        records.append({"Filename": fn, "years": fn[:4], "texts": text})

    temp = pd.DataFrame(records, columns=["Filename", "years", "texts"])
    temp["Average_sentence_length"] = temp["texts"].apply(average_sentence_length)

    return temp

In [None]:
def merge_stats_with_temp(stats, temp, merge_key):
    """Merge the statistics dataframe with the temporary dataframe.

    Parameters
    ----------
    stats : pandas.DataFrame
        The statistics table.
    temp : pandas.DataFrame
        The temporary per-chunk table (see ``get_temp_df``).
    merge_key : str or list of str
        Column name(s) present in both frames to join on.

    Returns
    -------
    pandas.DataFrame
        The merged frame. ``pd.merge`` is called with its default join type
        (inner), so rows whose key is missing from either frame are dropped.
        Square brackets are stripped from the column names
        (``"[BEMA]"`` -> ``"BEMA"``) because they are difficult to read by R.
    """
    merged_df = pd.merge(stats, temp, on=merge_key)
    # The pattern is a regular expression, so regex=True must be explicit:
    # older pandas warns about the changing default, and pandas >= 2.0 treats
    # the pattern as a literal string, leaving the brackets in place.
    merged_df.columns = merged_df.columns.str.replace(r"[\[\]]", "", regex=True)
    return merged_df

In [None]:
# Glob pattern matching every .txt file in the chunks_1500 directory; it is
# passed to get_temp_df below.
# NOTE(review): absolute, machine-specific path.
path = "/Users/Documents/dta/CLS/CLS_final/chunks_1500/*.txt"

In [None]:
# Read every file matched by `path` and compute its average sentence length;
# the result has one row per chunk file.
chunks_1500_temp = get_temp_df(path)

In [None]:
chunks_1500_temp.head()

Unnamed: 0,Filename,years,texts,Average_sentence_length
0,2015_manifesto-20,2015,"economy, the biblical prohibition against ""usu...",27.8
1,2015_manifesto-34,2015,merely a collective propagandist and collectiv...,23.44
2,2015_manifesto-35,2015,"their own truth and their victory.""the interna...",21.75
3,2015_manifesto-21,2015,"in the conventional sense, because they cannot...",20.28
4,2015_manifesto-37,2015,to set the stage for the arrival of the crisis...,26.8


In [None]:
df.head()

Unnamed: 0,Filename,Tokens,AWL,TTR,AMP,ANDC,[BEMA],[BYPA],CAUS,CONC,...,VBD,VPRT,[WHCL],[WHOBJ],[WHQU],[WHSUB],[WZPAST],[WZPRES],XX0,Unnamed: 69
0,1995_manifesto-11,1500.0,5.14,573.0,0.27,0.4,1.07,0.33,0.07,0.0,...,1.27,5.6,0.07,0.07,0.07,0.0,0.27,0.0,0.67,
1,2015_manifesto-12,1501.0,5.06,549.0,0.47,0.8,1.67,0.07,0.0,0.13,...,1.0,5.46,0.07,0.0,0.0,0.07,0.07,0.07,0.53,
2,2010_manifesto-12,1501.0,5.11,574.0,0.2,0.6,1.93,0.2,0.2,0.0,...,0.8,6.26,0.07,0.0,0.0,0.2,0.0,0.2,0.47,
3,2010_manifesto-3,1506.0,4.89,557.0,0.13,0.6,1.93,0.07,0.27,0.2,...,1.0,6.44,0.0,0.13,0.0,0.13,0.0,0.2,1.13,
4,1995_manifesto-18,1501.0,5.08,541.0,0.2,0.6,1.8,0.07,0.6,0.0,...,0.6,5.86,0.13,0.0,0.0,0.13,0.13,0.13,0.53,


In [None]:
# Join the statistics table with the per-chunk text table on "Filename"
# and preview the first rows of the result.
merged_chunks_1500 = merge_stats_with_temp(df, chunks_1500_temp, "Filename")
merged_chunks_1500.head()

  merged_df.columns = merged_df.columns.str.replace(r"[\[\]]", "")


Unnamed: 0,Filename,Tokens,AWL,TTR,AMP,ANDC,BEMA,BYPA,CAUS,CONC,...,WHOBJ,WHQU,WHSUB,WZPAST,WZPRES,XX0,Unnamed: 69,years,texts,Average_sentence_length
0,1995_manifesto-11,1500.0,5.14,573.0,0.27,0.4,1.07,0.33,0.07,0.0,...,0.07,0.07,0.0,0.27,0.0,0.67,,1995,system. it is not possible to make a lasting c...,21.43
1,2015_manifesto-12,1501.0,5.06,549.0,0.47,0.8,1.67,0.07,0.0,0.13,...,0.0,0.0,0.07,0.07,0.07,0.53,,2015,of dangerous competition would first be establ...,28.85
2,2010_manifesto-12,1501.0,5.11,574.0,0.2,0.6,1.93,0.2,0.2,0.0,...,0.0,0.0,0.2,0.0,0.2,0.47,,2010,than when attempting to avoid a punishment or ...,22.39
3,2010_manifesto-3,1506.0,4.89,557.0,0.13,0.6,1.93,0.07,0.27,0.2,...,0.13,0.0,0.13,0.0,0.2,1.13,,2010,principles. 29. here is an illustration of the...,21.74
4,1995_manifesto-18,1501.0,5.08,541.0,0.2,0.6,1.8,0.07,0.6,0.0,...,0.0,0.0,0.13,0.13,0.13,0.53,,1995,and start screaming that if we fall behind in ...,20.83


In [None]:
# Relabel the 2015 chunks as 2016 in the filename. The pattern is anchored to
# the start of the string so that only the year prefix is rewritten, never a
# "2015" occurring elsewhere in a filename.
merged_chunks_1500["Filename"] = merged_chunks_1500["Filename"].str.replace(r"^2015", "2016", regex=True)

# Derive the `year` column (second position) from the first four characters
# of the relabelled filename. Re-running this cell refreshes the column
# instead of failing because "year" already exists.
# NOTE(review): the `years` column that came from the temporary frame is not
# touched here and still reads "2015" for the relabelled rows - confirm
# whether that is intended.
if "year" in merged_chunks_1500.columns:
    merged_chunks_1500["year"] = merged_chunks_1500["Filename"].str[:4]
else:
    merged_chunks_1500.insert(1, "year", merged_chunks_1500["Filename"].str[:4])

In [None]:
merged_chunks_1500.head()

Unnamed: 0,Filename,year,Tokens,AWL,TTR,AMP,ANDC,BEMA,BYPA,CAUS,...,WHOBJ,WHQU,WHSUB,WZPAST,WZPRES,XX0,Unnamed: 69,years,texts,Average_sentence_length
0,1995_manifesto-11,1995,1500.0,5.14,573.0,0.27,0.4,1.07,0.33,0.07,...,0.07,0.07,0.0,0.27,0.0,0.67,,1995,system. it is not possible to make a lasting c...,21.43
1,2016_manifesto-12,2016,1501.0,5.06,549.0,0.47,0.8,1.67,0.07,0.0,...,0.0,0.0,0.07,0.07,0.07,0.53,,2015,of dangerous competition would first be establ...,28.85
2,2010_manifesto-12,2010,1501.0,5.11,574.0,0.2,0.6,1.93,0.2,0.2,...,0.0,0.0,0.2,0.0,0.2,0.47,,2010,than when attempting to avoid a punishment or ...,22.39
3,2010_manifesto-3,2010,1506.0,4.89,557.0,0.13,0.6,1.93,0.07,0.27,...,0.13,0.0,0.13,0.0,0.2,1.13,,2010,principles. 29. here is an illustration of the...,21.74
4,1995_manifesto-18,1995,1501.0,5.08,541.0,0.2,0.6,1.8,0.07,0.6,...,0.0,0.0,0.13,0.13,0.13,0.53,,1995,and start screaming that if we fall behind in ...,20.83


In [None]:
# Write the merged table to CSV without the index column.
# The path is relative, so it is resolved against the current working
# directory, and the `chunks_1500` directory must already exist there.
merged_chunks_1500.to_csv("chunks_1500/chunks_1500_stats.csv", sep=",", encoding="utf8", index=False)

In [None]:
pwd

'/Users/Documents/dta/CLS/CLS_final'