# IMPORT

In [3]:
import pandas as pd
import numpy as np
from textblob import WordList, TextBlob
import re
import codecs
from datetime import datetime
import time
import string
from collections import Counter
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the master FOIA request export; row 0 of the file supplies the column names.
data = pd.read_csv(
    "~/Desktop/FOIA/master-1-11.csv",
    header=0,
    low_memory=False,
)

# CLEAN AND EXTRACT

In [None]:
# Drop the unnamed first column (presumably an index written by an earlier
# export -- verify against the CSV).
del data['Unnamed: 0']

# Keep only rows that have a closing date and a usable description.
# pandas.read_csv parses empty fields as NaN by default, so comparing with ''
# alone lets missing descriptions through; they would then break the
# per-string helpers applied below.  NaN rows are therefore dropped explicitly.
data = data[
    (data['closed_date'] != 'TBD')
    & data['description'].notna()
    & (data['description'] != '')
    & (data['description'] != 'The description of this request is under Agency review.')
]

In [None]:
def remove_non_ascii(text):
    """Return ``text`` with every character whose code point is >= 128 removed."""
    ascii_only = [char for char in text if ord(char) < 128]
    return ''.join(ascii_only)

# Strip non-ASCII characters from every description.
# The ``Series.str`` accessor only exposes vectorised string methods and has no
# ``apply``; ``Series.apply`` is what runs a Python function on each value.
data["request"] = data['description'].apply(remove_non_ascii)

In [None]:
def del_whites(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is removed.  A single ``\\s+`` pass is
    enough: replacing each whitespace character individually first produces
    the same result.
    """
    return re.sub(r'\s+', ' ', text).strip()

# Normalise whitespace in every request, overwriting the column in place.
data['request'] = data['request'].apply(del_whites)

In [None]:
# Three groups: (text before the greeting)(the greeting itself)(the rest).
# Written as a raw string so that ``\.`` reaches the regex engine unchanged
# instead of relying on Python leaving an unknown string escape alone.
_GREETING_RE = re.compile(
    r'(|.+?)(Mrs?\..+?[,:]|Ms\..+?[,:]|Miss.+?[,:]|Mister.+?[,:]|[Mm]adams?[,:]|[Ss]irs?[,:]|To [Ww]hom [Ii]t [Mm]ay [Cc]oncern[,:]|Dear.+?[,:])(.+)'
)


def cut_greeting(text):
    """Return the part of ``text`` that follows the first recognised greeting.

    A greeting is a title ("Mr.", "Mrs.", "Ms.", "Miss", "Mister"), "Dear ...",
    "Sir(s)", "Madam(s)" or "To whom it may concern", terminated by a comma or
    colon.  Everything up to and including that terminator is dropped.  When no
    greeting is found, ``text`` is returned unchanged.
    """
    match = _GREETING_RE.search(text)
    if match is None:
        return text
    return match.group(3)
        
# Remove the greeting from each request and trim the surrounding whitespace.
data['request_no_greeting'] = data['request'].apply(cut_greeting).str.strip()

In [None]:
def cut_salutation(text):
    """Return the part of ``text`` that precedes the last recognised sign-off.

    Because the leading group is greedy, the *last* sign-off phrase in the
    text ("Sincerely,", "Thanks,", "Regards," ...) is the one that is cut at.
    At least one character must precede the sign-off; if nothing matches,
    ``text`` is returned unchanged.
    """
    found = re.search('(.+)(Sincerely,|[Bb]est,|From,|[Tt]hanks,|Best [Ww]ishes,|[Rr]egards,|Thank [Yy]ou,|My best to you,|Warmly,|Take care,|Thanks so much,|Thank you,|Thanks for your consideration,|Looking forward,|Be well,)(.*)', text)
    if found is None:
        return text
    return found.group(1)
    
# Remove the sign-off from each request and trim the surrounding whitespace.
data['request_body'] = data['request_no_greeting'].apply(cut_salutation).str.strip()

In [None]:
# Lower-cased copy of the body, used by the case-insensitive keyword features.
data["body_lowercase"] = data["request_body"].str.lower()

In [None]:
# Built once instead of on every call.
_PUNCTUATION = frozenset(string.punctuation)


def charify(text):
    """Return the number of characters in ``text`` ignoring punctuation and whitespace.

    Punctuation means the ASCII characters in ``string.punctuation``;
    whitespace is whatever the regex class ``\\s`` matches.
    """
    without_punctuation = ''.join(char for char in text if char not in _PUNCTUATION)
    # Raw string: ``\s`` is a regex class, not a Python string escape.
    return len(re.sub(r'\s+', '', without_punctuation))

# Character count per request body (punctuation and whitespace excluded).
data["char_count"] = data["body_lowercase"].apply(charify)

In [None]:
def wordify(text):
    """Return the number of words TextBlob finds in ``text``."""
    return len(TextBlob(text).words)

# Word count per request body.
data["word_count"] = data["body_lowercase"].apply(wordify)

In [None]:
def sentify(text):
    """Return the number of sentences TextBlob finds in ``text``."""
    return len(TextBlob(text).sentences)

# Sentence count per request body.
data["sen_count"] = data["body_lowercase"].apply(sentify)

In [None]:
# Average sentence length, in words per sentence.
data["avg_sen_len"] = data["word_count"] / data["sen_count"]

In [None]:
def to_date(x):
    """Parse a timestamp string into a ``datetime``.

    The expected layout is weekday, month, day, hour, minute, second, time
    zone name and year, all separated by single spaces.

    NOTE(review): the time fields are space-separated here, not
    colon-separated -- confirm this matches the raw CSV values.
    NOTE(review): ``%Z`` parsing accepts only a limited set of zone names;
    see the ``strptime`` documentation before running on another machine.
    """
    timestamp_format = "%a %b %d %H %M %S %Z %Y"
    return datetime.strptime(x, timestamp_format)

# Parse the raw closing and submission timestamps into datetime columns.
for source_column, parsed_column in (
    ("closed_date", "closed_datetime"),
    ("date_submitted", "submitted_datetime"),
):
    data[parsed_column] = data[source_column].apply(to_date)

In [None]:
def to_seconds(x):
    """Convert datetime ``x`` to seconds since the epoch.

    ``time.mktime`` interprets the time tuple as local time.
    """
    local_time_tuple = x.timetuple()
    return time.mktime(local_time_tuple)

# Processing time in days: seconds between submission and closure / 86400.
closed_seconds = data["closed_datetime"].apply(to_seconds)
submitted_seconds = data["submitted_datetime"].apply(to_seconds)
data['duration'] = (closed_seconds - submitted_seconds) / 86400

In [None]:
def foia(text):
    """Return 1 if ``text`` contains 'freedom of information' or 'foia', else 0.

    The check is a case-sensitive substring test, so callers pass
    lower-cased text.
    """
    for phrase in ('freedom of information', 'foia'):
        if phrase in text:
            return 1
    return 0

# Flag requests that mention FOIA by name.
data['ref_foia'] = data['body_lowercase'].apply(foia)

In [None]:
# Characters treated as word delimiters around "fee"/"fees".
_FEE_DELIMITER = r'[.,\/#!$%\^&\*;:{}=\-_`~()\s]'

# The word must be delimited on both sides; the start and end of the text
# count as delimiters too, so "fees waived" and "waive the fee" are detected.
_FEES_RE = re.compile(
    r'(?:^|' + _FEE_DELIMITER + r')fees?(?:$|' + _FEE_DELIMITER + r')'
)


def fees(text):
    """Return 1 if ``text`` mentions "fee" or "fees" as a separate word, else 0.

    Matching is case-sensitive, so callers pass lower-cased text.  Words that
    merely contain the letters (e.g. "coffee", "feeling") do not count.
    """
    return 1 if _FEES_RE.search(text) else 0

# Flag requests that mention fees.
data['ref_fees'] = data['body_lowercase'].apply(fees)

In [None]:
def agency(tracking_number):
    """Return the part of ``tracking_number`` that precedes its first hyphen.

    Returns ``None`` when the tracking number contains no hyphen.
    """
    found = re.search('(.*?)(-.*)', tracking_number)
    if found is None:
        return None
    return found.group(1)

# Agency code: the prefix of the tracking number, before its first hyphen.
data['_agency'] = data['tracking number'].apply(agency)

In [None]:
def phone_number(text):
    """Return 1 if ``text`` contains something shaped like a phone number, else 0.

    Recognised shapes: ten digits grouped 3-3-4, a parenthesised three-digit
    area code followed by 3-4, or seven digits grouped 3-4.  Groups may be
    separated by a hyphen, dot, whitespace or nothing at all.
    """
    # Raw string so the regex escapes (\d, \s, \., \() are not processed as
    # Python string escapes.
    found = re.search(
        r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})',
        text,
    )
    return 1 if found else 0

# Flag requests that contain a phone number.
data['phone_number'] = data['request_no_greeting'].apply(phone_number)

In [None]:
def hyperlink(text):
    """Return 1 if ``text`` contains a URL, else 0.

    Two shapes are recognised: an ``http:``/``https:`` scheme followed by
    ``//`` (or a backslash), and a bare address starting with ``www.``.
    The ``www`` pattern is lower-case only, so callers pass lower-cased text.
    """
    # Raw strings keep the regex escapes intact without doubling backslashes.
    scheme_url = re.search(
        r'((https?):((//)|(\\))+[\w\d:#@%/;$()~_?\+-=\\.&]*)', text
    )
    # The dot after "www" is escaped so only a literal "www." counts; left
    # unescaped it matched any character (e.g. the start of "wwwhat").
    bare_url = re.search(
        r'(www\.(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+)', text
    )
    return 1 if (scheme_url or bare_url) else 0

# Flag requests that contain a hyperlink.
data['hyperlink'] = data['body_lowercase'].apply(hyperlink)

In [None]:
def email_address(text):
    """Return 1 if ``text`` contains something shaped like an e-mail address, else 0.

    The shape is ``local@label.label``: a local part of word characters, dots
    and hyphens, an ``@``, then one or more dot-terminated domain labels and
    a final label.
    """
    # Raw string so \w, \- and \. are regex escapes, not string escapes.
    found = re.search(r'([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)', text)
    return 1 if found else 0

# Flag requests that contain an e-mail address.
data['email_address'] = data['request_no_greeting'].apply(email_address)

In [None]:
# Numeric dates, month names, weekday names and their abbreviations.
# Raw strings matter here: in a normal string literal "\b" is a backspace
# character, which silently disables every abbreviation alternative.
_DATE_WORD_RE = re.compile(
    # 1/2/2015, 01-02-15, 1.2.2015 ...
    r'\d{1,2}[\.\-\/]\d{1,2}[\.\-\/]\d{2,4}'
    # Full month names (first letter in either case).
    r'|[Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|[Jj]uly'
    r'|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember'
    # Full weekday names.
    r'|[Mm]onday|[Tt]uesday|[Ww]ednesday|[Tt]hursday|[Ff]riday'
    r'|[Ss]aturday|[Ss]unday'
    # Abbreviations count only as whole words, so "allowed" or "salmon" do
    # not register as "wed" / "mon".
    r'|\b(?:[Mm]on|[Tt]ues|[Ww]ed|[Tt]hurs|[Ff]ri|[Ss]at|[Ss]un'
    r'|[Jj]an|[Ff]eb|[Mm]ar|[Aa]pr|[Aa]ug|[Ss]ept|[Oo]ct|[Nn]ov|[Dd]ec)\b'
)

# A four-digit year in 1900-2099 preceded by a space and followed by a space
# or by a hyphenated end year (" 1999 ", " 1999-2004", " 1999-04").
_YEAR_RE = re.compile(r'( )(19|20)(\d\d)( |-19\d\d|-20\d\d|-\d\d)')


def date(text):
    """Return 1 if ``text`` appears to reference a date, else 0.

    A reference is a numeric date, a month or weekday name (full or
    abbreviated), or a stand-alone year / year range.

    NOTE(review): full month names are matched as substrings, so "may" and
    "march" used as ordinary words also count.
    """
    if _DATE_WORD_RE.search(text) or _YEAR_RE.search(text):
        return 1
    return 0

# Flag requests that reference a date.
data['ref_date'] = data['request_body'].apply(date)

In [None]:
# Lookup table of US states; row 0 of the file supplies the column names.
us_states = pd.read_csv("~/Desktop/FOIA/states.csv", header=0)

# Name variants searched for by ref_place: full names, abbreviations and
# postal codes, plus lower-cased copies of the names and postal codes.
states = us_states['State']
abbrev = us_states['Abbreviation']
postal = us_states['Postal']
states_lower = states.str.lower()
postal_lower = postal.str.lower()

def ref_place(text):
    """Return 1 if ``text`` contains any US state name, abbreviation or postal code.

    Each lookup series is scanned in turn (names, lower-cased names,
    abbreviations, postal codes, lower-cased postal codes) and the first
    substring hit wins; 0 is returned when none of them occurs in ``text``.

    NOTE(review): this is a plain substring test, so short entries such as
    two-letter postal codes also match inside ordinary words.
    """
    lookups = (states, states_lower, abbrev, postal, postal_lower)
    for candidates in lookups:
        if any(substring in text for substring in candidates):
            return 1
    return 0
 
# Flag requests that reference a US state.
data['ref_place'] = data['request_body'].apply(ref_place)

In [None]:
# Automated Readability Index:
#   4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
# https://en.wikipedia.org/wiki/Automated_readability_index
# Note actual intelligence paper

data['readability'] = (
    4.71 * (data['char_count'] / data['word_count'])
    + 0.5 * (data['word_count'] / data['sen_count'])
    - 21.43
)

def max_out(x):
    """Cap ``x`` at 20: values above 20 become 20, everything else is returned as is."""
    return 20 if x > 20 else x
    
# Cap the readability score at 20.  The score lives on ``data`` (it is
# computed there and ``data`` is what gets exported); no ``data_filtered``
# frame is defined in this script.
data['readability'] = data['readability'].apply(max_out)

In [None]:
# Lower-case terms (mostly file formats and extensions) that mark a request
# as asking for data.
_DATA_TERMS = (
    'affymetrix', 'aiff', 'apache', 'arcgrid', 'bdf', 'binary', 'bmp',
    'byte', 'byu', 'bzip', 'cded', 'cdf', 'column', 'csv', 'data',
    'delimited', 'directory', 'dx', 'dxf', 'edf', 'emf', 'eml', 'eps',
    'excel', 'exr', 'fastq', 'flac', 'flv', 'fq', 'fsa', 'gbk', 'genbank',
    'geotiff', 'gml', 'gpx', 'graphlet', 'grb', 'grd', 'grib', 'gtopo30',
    'gw', 'gxl', 'gz', 'gzip', 'harwellboeing', 'hdf', 'hdf5', 'j2k',
    'jcm', 'jdx', 'jp2', 'json', 'jvx', 'kml', 'kmz', 'latex', 'lgr',
    'lwo', 'mbox', 'mdb', 'mgf', 'mol2', 'mpfa', 'mps', 'mtp', 'mtx',
    'mx', 'nb', 'ndk', 'netcdf', 'noff', 'nxs', 'obj', 'pbm', 'pcx',
    'pdb', 'pgm', 'pnm', 'ppm', 'pxr', 'row', 'rss', 'sct', 'sdf', 'sff',
    'shp', 'sp3', 'sql', 'structured query language', 'stx', 'svg', 'swf',
    'sxc', 'table', 'tabular', 'tce', 'tga', 'tgf', 'tle', 'tsv', 'uue',
    'vcf', 'vcs', 'vrml', 'vtk', 'webp', 'wmf', 'x3d', 'xbm', 'xls',
    'xml', 'xport', 'xpt', 'xyz', 'zpr',
)


def mention_data(text):
    """Return 1 if ``text`` contains any of the data-related terms, else 0.

    The test is a case-sensitive substring check against lower-case terms,
    so callers pass lower-cased text.

    NOTE(review): because it is a substring check, short terms also match
    inside ordinary words (e.g. 'tle' inside "little").
    """
    for term in _DATA_TERMS:
        if term in text:
            return 1
    return 0

# Flag requests that mention data or a data file format.
data["ref_data"] = data["body_lowercase"].apply(mention_data)

In [None]:
def specificity_count(text):
    """Count the runs of consecutive ``NNP``-tagged tokens in ``text``.

    The text is converted with ``str``, tokenised and part-of-speech tagged
    with NLTK.  A token adds to the count only when it is tagged ``NNP`` and
    the token before it is not, so each uninterrupted run counts once.
    """
    tags = [tag for _token, tag in pos_tag(word_tokenize(str(text)))]
    # Pair every tag with its predecessor (None for the first token).
    previous_tags = [None] + tags[:-1]
    return sum(
        1
        for previous, current in zip(previous_tags, tags)
        if current == 'NNP' and previous != 'NNP'
    )

# Number of NNP-tagged token runs per request body.
data["specificity"] = data["request_body"].apply(specificity_count)

In [None]:
# Single shared lemmatizer instance, reused for every request.
wnl = WordNetLemmatizer()


def lemmatizer(text):
    """Return ``text`` with its nouns and verbs lemmatized, joined by single spaces.

    Each word found by TextBlob is tagged on its own.  Tags starting with
    "N" are lemmatized with pos 'n', tags starting with "V" with pos 'v',
    and every other word is kept as it is.
    """
    lemmas = []
    for token in TextBlob(text).words:
        blob = TextBlob(token)
        tag = blob.tags[0][1]
        if tag.startswith("N"):
            lemma = wnl.lemmatize(blob, pos='n')
        elif tag.startswith("V"):
            lemma = wnl.lemmatize(blob, pos='v')
        else:
            lemma = blob
        lemmas.append(str(lemma))
    return ' '.join(lemmas)

# Lemmatized version of each lower-cased request body.
data["lemmatized_body"] = data["body_lowercase"].apply(lemmatizer)

In [None]:
def biclassify_strict(disposition):
    """Return 1 only when ``disposition`` is exactly 'Full grant', otherwise 0."""
    return 1 if disposition == 'Full grant' else 0

# Strict binary outcome: 1 for a full grant, 0 for everything else.
data["bi_strict"] = data["final_disposition"].apply(biclassify_strict)

In [None]:
def biclassify_generous(disposition):
    """Return 1 for 'Full grant' or 'Partial grant/partial denial', otherwise 0."""
    if disposition == 'Full grant':
        return 1
    if disposition == 'Partial grant/partial denial':
        return 1
    return 0

# Generous binary outcome: 1 for a full or partial grant, 0 otherwise.
data["bi_gen"] = data["final_disposition"].apply(biclassify_generous)

In [None]:
# NOTE: the function name shadows the builtin ``id``; it is kept because the
# rest of the script calls it by this name.
def id(text):
    """Return 1 if ``text`` contains something shaped like a record identifier, else 0.

    Recognised shapes:
      * "ID", "No.", "Number" or "#", up to five characters, then "-" and
        four digits;
      * two or more capital letters directly followed by six or more digits;
      * one of the markers above or "Form"/"form", up to five characters,
        then four digits and "-";
      * "API" followed later by six or more digits;
      * one of the markers, up to five characters, then ddd-ddd.
    """
    # Raw string so \s and \d are regex escapes, not Python string escapes.
    found = re.search(
        r'((ID|No[.\s]|Number|#).{,5}-\d{4})|([A-Z]{2,}\d{6,})|((ID|No[.\s]|Number|#|[Ff]orm).{,5}\d{4}-)|(API.*\d{6,})|((ID|No[.\s]|Number|#).{,5}\d{3}-\d{3})',
        text,
    )
    return 1 if found else 0

# Flag requests that cite a record identifier.
data['id'] = data['request_body'].apply(id)

In [None]:
# Write the enriched table, including all derived feature columns, to disk.
data.to_csv(path_or_buf='~/Desktop/FOIA/rich_foia_master.csv')