numpy
pandas
scipy
scikit-learn
matplotlib
spacy

# This notebook is meant to conduct all the preprocessing. 

### Specifically, we take the raw OCR outputs from the PDFs and convert them into count and TF-IDF vector representations suitable for a range of analyses.

In [1]:
PROCESSED_DOCS = False 

## Load In Metadata

In [2]:
import pandas as pd

## Master CSV: https://github.com/datadrivenenvirolab/net_zero/blob/master/data/Sidd/net_zero_NLP_metadata_master.csv
csv_path = '/Users/siddharthsachdeva/Downloads/net_zero_NLP_metadata_master.csv'
# csv_path = 'sample_data/sample_meta.csv'
meta = pd.read_csv(csv_path)
meta

Unnamed: 0,iso,entity_type,name,country,region,area,lat,lng,population,population_year,...,ghg_reduction_target,target_year,percent_reduction,initiatives_committed,net_zero_target_status,econ_wide_net_zero,pop_density,emis_per_capita,coordinator_name,support_type
0,GBR,Region,Aberdeenshire,United Kingdom,Europe,6313.00,57.166667,-2.666667,243510.0,,...,,2020,20.0,GlobalCovenantofMayors2019,24,1.0,38.572786,,"Convention of Scottish Local Authorities, GB",Supporter
1,GBR,Region,Aberdeenshire,United Kingdom,Europe,6313.00,57.166667,-2.666667,243510.0,,...,,2020,20.0,GlobalCovenantofMayors2019,24,1.0,38.572786,,Energy Saving Trust,Coordinator
2,ITA,City,Acquappesa,Italy,Europe,14.45,39.500000,15.950000,1882.0,2018.0,...,32% below 2014 levels by 2020,2020,32.0,EUCovenantofMayors2019;GlobalCovenantofMayors2...,4,1.0,130.242215,4.556626,Energia Calabria Network,Coordinator
3,ITA,City,Acquappesa,Italy,Europe,14.45,39.500000,15.950000,1882.0,2018.0,...,100% below 2014 levels by Long term,Long term,100.0,EUCovenantofMayors2019;GlobalCovenantofMayors2...,14,1.0,130.242215,4.556626,Energia Calabria Network,Coordinator
4,AUS,City,Adelaide,Australia,East Asia and the Pacific,3257.70,-34.928889,138.601111,1376601.0,2017.0,...,,2020,35.0,Carbonn2019;GlobalCovenantofMayors2019;CDPCiti...,,1.0,7.341376,0.834017,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,BEL,City,Ville de Nivelles,Belgium,Europe,,50.597890,4.323399,28535.0,,...,90% below 2006 levels by Long term,Long term,90.0,EUCovenantofMayors2020,14,,,8.474211,Région wallonne,Coordinator
570,ITA,City,Vogogna,Italy,Europe,15.62,46.008980,8.293017,1770.0,2010.0,...,93% below 2010 levels by 2020,2020,93.0,GlobalCovenantofMayors2019;EUCovenantofMayors2...,14,,113.316261,4.181376,Region of Piemonte,Coordinator
571,AUS,City,Wollongong,Australia,East Asia and the Pacific,572.20,-34.433060,150.883100,208875.0,2017.0,...,,2030,,Carbonn2019;GlobalCovenantofMayors2019;CDPCiti...,24,,365.038448,,,
572,AUS,City,Wollongong,Australia,East Asia and the Pacific,572.20,-34.433060,150.883100,208875.0,2017.0,...,,,,Carbonn2019;GlobalCovenantofMayors2019;CDPCiti...,24,,365.038448,,,


## Load in Textual data

In [3]:
import os
import json

# Download and unzip this data: https://drive.google.com/file/d/1hnoyLCcnNJub22YCHN5xxKHRy6Z8VqUw/view?usp=sharing
extraction_path = '/Users/siddharthsachdeva/personal/carbon_zero_nlp/data/all_netzero_data_textract20200906/'
# extraction_path = 'sample_data/text_jsons'
city_names = set(meta.name)
pdf_fnames = [name for name in os.listdir(extraction_path) 
              if name.endswith('.json')]
len(pdf_fnames)

172

In [4]:
len(pdf_fnames)

172

In [5]:
# from shutil import copyfile

# for f in pdf_fnames:
#     copyfile(os.path.join(extraction_path, f),os.path.join(sample_extraction_path,f)) 

In [6]:
sorted(list(pdf_fnames))

['Abasan Al-Kabira_textract20200906.json',
 'Adelaide_carbon-neutral-action-plan (1)_textract20200906.json',
 'Adelaide_report-carbon-neutral-adelaide-status-report-july-19_textract20200906.json',
 'Alameda, CA_textract20200906.json',
 'Albany, NY_textract20200906.json',
 'Amman_textract20200906.json',
 'Amsterdam_textract20200906.json',
 'Ann Arbor, MI_textract20200906.json',
 'Arlington, VA_textract20200906.json',
 'ArlingtonFinal-CEP-CLEAN-003_textract20200906.json',
 'Asheville, NC_textract20200906.json',
 'Aspen, CO_textract20200906.json',
 'Atlanta, GA_textract20200906.json',
 'Auckland_textract20200906.json',
 'Austin, TX_textract20200906.json',
 'BCP Council_textract20200906.json',
 'Barcelona_Climate_Plan_textract20200906.json',
 'Barcelona_textract20200906.json',
 'Bath & North East Somerset Climate Emergency Action Plan_textract20200906.json',
 'Bath & North East Somerset_textract20200906.json',
 'Belo Horizonte_textract20200906.json',
 'Berkeley_CA_2020-07-21 Special Item 0

### Format OCR output

In [7]:
data = []

# Build one record per OCR JSON file found in `extraction_path`.
for fname in pdf_fnames:
    # The city label is whatever precedes the first ',' and then the first '_' in the file name.
    city_name = fname.split(',')[0].split('_')[0]
    path = os.path.join(extraction_path, fname)
    with open(path, 'r') as f:
        text_json = json.load(f)

    # Reset all three fields for every file. Previously `translated_text` was only
    # assigned when the '-1' entry existed, so a file without it either raised
    # NameError (first iteration) or silently reused the previous file's translation.
    raw_text = None
    ocr_method = None
    translated_text = None

    summary = text_json['-1']
    if summary is not None:
        raw_text = summary['raw_text']
        ocr_method = summary['method']
        translated_text = summary.get('translated_text', None)

    data.append({
        'full_json': json.dumps(text_json),
        'raw_text': raw_text,
        'translated_text': translated_text,
        'ocr_method': ocr_method,
        'city': city_name,
        'path': path
    })

In [8]:
# Download and unzip this data: https://drive.google.com/file/d/1O-bGar07viUpuaxbp8PHmQ0oFQhiRpdZ/view?usp=sharing

translated_extraction_path = '/Users/siddharthsachdeva/personal/carbon_zero_nlp/data/eucovdata_textracttesseract_translated/'
translated_pdf_fnames = [name for name in os.listdir(translated_extraction_path) if name.endswith('.json')]
                         #and name.split(' -')[0] in full_ds.name.tolist()]

In [9]:
sorted(list(translated_pdf_fnames))

['Aalborg - Climate Strategy_tesseract20200906.json',
 'Aarhus - Climate Plan 2016-2020 _tesseract20200906.json',
 'Aberdeenshire - Net Zero Vision And Infrastructure Plan _tesseract20200906.json',
 'Acquappesa - SEAP (Italian) _tesseract20200906.json',
 'Albairate - PAES (Italian) _tesseract20200906.json',
 'Albertslund - Klimastrategi 2017-2025 (Danish) _tesseract20200906.json',
 'Alessano - Alessano Sostenibile (Italian) _tesseract20200906.json',
 'Andrano - Andrano 2020 (Italian) _tesseract20200906.json',
 'Arzana - Action Plan Local Arzana Elini (Italian) _tesseract20200906.json',
 'Asse - Klimaatactieplan Asse (Dutch) _tesseract20200906.json',
 'Averara - SEAP Averara (Italian) _tesseract20200906.json',
 'Bagnoli di Sopra - PAES (Italian) _tesseract20200906.json',
 'Bagnolo San Vito - PAES (Italian) _tesseract20200906.json',
 'Balones - SEAP Of Balones (Spanish) _tesseract20200906.json',
 'Balsareny - PAES De Balsareny (Catalan) _tesseract20200906.json',
 'Balti - Sustainable Ene

In [10]:
# Append one record per translated OCR JSON file to the existing `data` list.
for fname in translated_pdf_fnames:
    # The city label is whatever precedes the first ',' and then the first '_' in the file name.
    city_name = fname.split(',')[0].split('_')[0]
    path = os.path.join(translated_extraction_path, fname)
    with open(path, 'r') as f:
        text_json = json.load(f)

    # Reset all three fields for every file. Previously `translated_text` was only
    # assigned when the '-1' entry existed, so a file without it silently reused
    # the value left over from an earlier file.
    raw_text = None
    ocr_method = None
    translated_text = None

    summary = text_json['-1']
    if summary is not None:
        raw_text = summary['raw_text']
        ocr_method = summary['method']
        translated_text = summary.get('translated_text', None)

    data.append({
        'full_json': json.dumps(text_json),
        'raw_text': raw_text,
        'translated_text': translated_text,
        'ocr_method': ocr_method,
        'city': city_name,
        'path': path
    })

In [11]:
df = pd.DataFrame(data)
len(df['city'].unique())

349

In [12]:
print(sorted(os.listdir('sample_data/text_jsons/')))
print(df['city'].sort_values().tolist())

['Albany, NY_textract20200906.json', 'BCP Council_textract20200906.json', 'Emeryville, CA_textract20200906.json', 'Orlando, FL_textract20200906.json', 'Riga_textract20200906.json']
['Aalborg - Climate Strategy', 'Aarhus - Climate Plan 2016-2020 ', 'Abasan Al-Kabira', 'Aberdeenshire - Net Zero Vision And Infrastructure Plan ', 'Acquappesa - SEAP (Italian) ', 'Adelaide', 'Adelaide', 'Alameda', 'Albairate - PAES (Italian) ', 'Albany', 'Albertslund - Klimastrategi 2017-2025 (Danish) ', 'Alessano - Alessano Sostenibile (Italian) ', 'Amman', 'Amsterdam', 'Andrano - Andrano 2020 (Italian) ', 'Ann Arbor', 'Arlington', 'ArlingtonFinal-CEP-CLEAN-003', 'Arzana - Action Plan Local Arzana Elini (Italian) ', 'Asheville', 'Aspen', 'Asse - Klimaatactieplan Asse (Dutch) ', 'Atlanta', 'Auckland', 'Austin', 'Averara - SEAP Averara (Italian) ', 'BCP Council', 'Bagnoli di Sopra - PAES (Italian) ', 'Bagnolo San Vito - PAES (Italian) ', 'Balones - SEAP Of Balones (Spanish) ', 'Balsareny - PAES De Balsareny (

## Apply text preprocessing using spacy

In [13]:
from spacy_langdetect import LanguageDetector

help(LanguageDetector)

Help on class LanguageDetector in module spacy_langdetect.spacy_langdetect:

class LanguageDetector(builtins.object)
 |  LanguageDetector(language_detection_function=None)
 |  
 |  Fully customizable language detection pipeline for spaCy.
 |  
 |  Arguments:
 |      language_detection_function: An optional custom language_detection_function. (Default None).
 |                                   If None uses, langdetect package to detect language
 |  
 |  # writing a custom language_detection_function:
 |      The function must take in a spacy Doc or Span object only as input and can return the detected language.
 |      This is stored in Doc._.language, Span._.language and Token._.language attributes.
 |  
 |  Methods defined here:
 |  
 |  __call__(self, doc)
 |      Call self as a function.
 |  
 |  __init__(self, language_detection_function=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  -----------------------------------------------------------

In [14]:
from tqdm import tqdm
import spacy

tqdm.pandas()
nlp = spacy.load('en_core_web_md')

if not PROCESSED_DOCS:
    print('Processing from scrtch...')
    # NOTE(review): rows whose raw_text is None are passed to nlp unchanged — confirm none exist.
    df['raw_doc'] = df['raw_text'].progress_apply(nlp)
    detector = LanguageDetector()

    def get_doc(row):
        """Return the spaCy doc to analyse for one row.

        Rows with a translation are re-parsed from `translated_text`; all other
        rows reuse the already-parsed `raw_doc`.
        """
        # Language detection has many false english detections, so I'll just use the translated documents.
        # pd.isnull covers both None and NaN; the old `is None` test sent NaN
        # straight into nlp(), and its final `else` branch could never run.
        if pd.isnull(row['translated_text']):
            return row['raw_doc']
        return nlp(row['translated_text'])

    df['doc'] = df.progress_apply(get_doc, axis=1)
    df.to_pickle('processed_apr7.pkl')
else:
    print('Loading pickle file')
    # NOTE: unpickling executes arbitrary code; only load a pickle produced by this notebook.
    df = pd.read_pickle('processed_apr7.pkl')

  from pandas import Panel
  1%|          | 3/384 [00:00<00:15, 25.35it/s]

Processing from scrtch...


 26%|██▌       | 100/384 [05:53<16:43,  3.53s/it]


KeyboardInterrupt: 

In [None]:
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

In [None]:
detector = LanguageDetector()

df['raw_language'] = df['raw_doc'].progress_apply(lambda doc: detector(doc)._.language['language'])

In [None]:
df['raw_language'].value_counts()

In [None]:
# Rows detected as non-English that also have no translation.
not_english = df['raw_language'] != 'en'
no_translation = df['translated_text'].isnull()
untranslated = df[not_english & no_translation]

# Drop every translation-less row belonging to a city that has such a document.
city_is_untranslated = df['city'].isin(untranslated['city'])
df = df[~(no_translation & city_is_untranslated)]

In [None]:
# Keep documents that are detected as English or that come with a translation.
# The original referenced an undefined `df2`; both masks must come from `df`.
# .copy() makes inp_texts an independent frame so that adding columns to it later
# does not trigger SettingWithCopyWarning.
inp_texts = df[(df['raw_language'] == 'en') | df['translated_text'].notnull()].copy()
len(inp_texts['city'].unique())

## Clean city names

In [None]:
from copy import deepcopy

def get_city_name(row):
    """Derive a city name for one row.

    Rows without a translation keep their existing `city` value. For the others
    the name is cut from the file-name part of `path`: everything before the
    first '-' and the first ','; if that piece still ends in '.json', it is cut
    again at the first '_'.
    """
    if row['translated_text'] is None:
        return row['city']

    file_name = row['path'].rsplit('/', 1)[-1]
    candidate = file_name.split('-')[0].split(',')[0]
    if candidate.endswith('.json'):
        candidate = candidate.split('_')[0]
    return candidate
    
def remove_filler(s):
    """Delete known boilerplate fragments from a plan name and strip whitespace.

    The fragments are removed one after another in the order listed, so earlier
    removals can affect whether later (longer) fragments still match.
    """
    fillers = (
        'Action', 'Plan', 'Climate', 'Emergency', '2016', 'Final',
        'JBE-2020-18-Climate--Initiatives-Task-Force',
        'JBE-2020-18--Initiatives-Task-Force',
        'strategy', 'Neutral', 'Strategy',
        '-COP25 -',
        'and Energy Programme 2014-2023',
        'Energie Positive',
        'Energie Positive',  # listed twice in the original; kept so the passes are identical
        'CAP',
        '-CEP-CLEAN-003',
        'climate-resilience-plan',
        'Fovaros XIX. Kerulet, Kispest Onkormanyzata',
        'City of',
        'Change',
        'final',
    )
    cleaned = s
    for filler in fillers:
        cleaned = cleaned.replace(filler, '')
    return cleaned.strip()

def remove_trailing(s):
    """Return the part of `s` before its first comma (all of `s` if there is none)."""
    head, _sep, _rest = s.partition(',')
    return head

def rename(os):
    mapping = {
        'AEro': 'Ærø',
        'Beniardà': 'Beniardà',
        'Comunita pioniera di Arborea': 'Comunità pioniera di Arborea',
        'LA': 'Los Angeles', 
        'Mexico': 'Mexico City', 
        'Sonama': 'Sonoma',
        'Belvì': 'Belvì',
        'Budapest Fováros XIX. Kerület': 'Budapest',
        'Comunità pioniera di Arborea': 'Comunità pioniera di Arborea',
        'Dénia': 'Dénia',
        'Granze (Pd)': 'Granze (Pd) - Italy',
        'Kéa (Aegean Islands)': 'Kéa (Aegean Islands)',
        'Krško': 'Krško',
        'Lörrach': 'Lörrach',
        'Luleå': 'Luleå',
        'Monrupino': 'Monrupino-Repentabor',
        'Montejícar': 'Montejícar',
        'Mornington Peninsula': 'Mornington Peninsula Shire',
        'Münster': 'Münster',
        'Nilüfer': 'Nilüfer',
        'Northhampton': 'Northampton',
        'Patù': 'Patù',
        'Peñarroya': 'Peñarroya-Pueblonuevo',
        'Region Zuid': 'Region Zuid-West-Vlaanderen',
        'Reykjavík': 'Reykjavík',
        'Ringkøbing': 'Ringkøbing-Skjern',
        'Saint': 'Saint-Nicolas',
        'San Jose': 'San José',
        'San Polo d_Enza': "San Polo d'Enza",
        'Sant_ Urbano': "Sant'urbano",
        'Sant_Anna Arresi': "Sant'Anna Arresi",
        'Sant_Elena': "Sant'Elena",
        'St Catharines': 'St. Catharines',
        'St Louis': 'St. Louis',
        'Tàrbena': 'Tàrbena',
        'Umea': 'Umeå',
        'Upssala': 'Uppsala',
        'Växjö': 'Växjö',
        'Vilkaviškis': 'Vilkaviškis',
        'Wezembeek': 'Wezembeek-Oppem',
        'eThekweni': 'Durban',
        'Wallonie Picarde': 'Wallonie picarde Energie Positive',
        'Tainan City': 'Tainan',
        'The Espoo Story in English': 'Espoo',
        'Kansas': 'Kansas City',
        'Jászberény Városi Önkormányzat':  'Jaszbereny Varosi Onkormanyzat',
        'Skovde':'Skövde'
    }

    s = deepcopy(os)
    for og in mapping:
        s = s.replace(og, mapping[og])
    return s

inp_texts['city_name'] = inp_texts.apply(get_city_name, axis=1)
inp_texts['city_name'] = inp_texts['city_name'].apply(
    remove_trailing).apply(remove_filler).apply(rename)
inp_texts['city_name'].sort_values().unique()

In [None]:
clean_names.sort_values()

In [None]:
def c(l):
    """Join a list of words with spaces after trimming a repeated tail.

    If the last word equals the third-from-last, the final two words are
    dropped; otherwise, if the last two words are equal, the final word is
    dropped; otherwise the list is joined unchanged.
    """
    n = len(l)
    if n > 2 and l[-3] == l[-1]:
        return ' '.join(l[:-2])
    if n > 1 and l[-2] == l[-1]:
        return ' '.join(l[:-1])
    return ' '.join(l)

meta.name = meta.name.str.split().apply(c)

In [None]:
meta.name = meta.name.apply(
    remove_trailing).apply(remove_filler)

## Merge with Clean City Names

In [None]:
# Read the curated city list; the `with` block closes the file handle, which the
# previous json.load(open(...)) form left open.
with open('/Users/siddharthsachdeva/Downloads/final_city_list.json') as city_list_file:
    clean_names = pd.Series(json.load(city_list_file))
#clean_names['cleaned_name'] = clean_names['name'].str.split(',').apply(lambda ps: ps[0])
#clean_names['cleaned_name'].sort_values().tolist()
len(clean_names)

In [None]:
len(meta['name'].unique())

In [None]:
fvc = meta['name'].value_counts()
multiple = fvc[fvc>1].index

In [None]:
meta = meta.groupby('name').aggregate(lambda s: s.iloc[0] if len(s.unique()) == 1  else s.tolist()).reset_index()
meta

In [None]:
meta['cleaned_name'] = meta['name']

In [None]:
meta.columns

In [None]:
inp_texts.columns

In [None]:
inp_texts['cleaned_name'] = inp_texts['city_name']

In [None]:
data = pd.merge(meta, right=inp_texts, on='cleaned_name', how='outer')
len(data['cleaned_name'].unique())

In [None]:
data.columns

## Combine plans for the same city 

In [None]:
data = data[data['doc'].notnull()]
len(data['cleaned_name'].unique())

In [None]:
vc = data['name'].value_counts()
vc[vc>2]

In [None]:
import numpy as np

def _report_text(report):
    """Return a report's translated text when it has one, otherwise its raw text."""
    translated = report.get('translated_text')
    if isinstance(translated, str):
        return translated
    return report['raw_text']


def combine(city_reports):
    """Combine rows that have the same city into a single row.

    Args:
        city_reports: DataFrame holding every row of one city; must contain a
            'raw_text' column.

    Returns:
        A one-row DataFrame. If only one distinct raw_text exists, that row is
        returned as-is. Otherwise each column is reduced over its distinct
        non-null values: NaN when there are none, the value itself when there is
        exactly one, and a list when there are several. 'raw_text' is instead
        joined with blank lines, and 'doc' is re-parsed from the joined texts.
    """
    res = city_reports[~city_reports['raw_text'].duplicated()]
    if len(res) == 1:
        return res
    new = dict()
    for col in city_reports.columns:
        # `values` (not `data`) so the notebook-level `data` frame is not shadowed.
        values = city_reports[col]
        values = values[values.notnull()]
        values = values[~values.apply(str).duplicated()]
        if len(values) == 0:
            new[col] = np.nan
        elif len(values) == 1:
            new[col] = values.iloc[0]
        elif col == 'raw_text':
            # Join the distinct non-null texts only; joining the unfiltered column
            # raised TypeError on None and repeated duplicated reports.
            new[col] = '\n\n\n'.join(values.tolist())
        elif col == 'doc':
            # Re-parse from the translation where one exists, so the combined doc
            # is not built from untranslated text.
            texts = [_report_text(report) for _, report in res.iterrows()]
            new[col] = nlp('\n\n\n'.join(t for t in texts if isinstance(t, str)))
        else:
            new[col] = values.tolist()
    return pd.DataFrame([pd.Series(new)])

data = data.groupby('name').progress_apply(combine).reset_index(drop=True)

In [None]:
data

## Merge city characteristics

In [None]:
region_data = data['region'].value_counts(dropna=False).iloc[::-1]
region_data.plot.barh()

In [None]:
# ISO country code -> region, used only to back-fill rows whose `region` is missing.
# The duplicated 'PRT' and 'ISR' keys (same values) were removed; the mapping is unchanged.
country2missing_regions = {
    'ITA': 'Europe',
    'USA': 'North America',
    'BEL': 'Europe',
    'GBR': 'Europe',
    'PRT': 'Europe',
    'FIN': 'Europe',
    'SVN': 'Europe',
    'BLR': 'Europe',
    'LVA': 'Europe',
    'LBN': 'Middle East and North Africa',
    'HUN': 'Europe',
    'UKR': 'Europe',
    'NOR': 'Europe',
    'ISL': 'Europe',
    'DNK': 'Europe',
    'IRL': 'Europe',
    'ISR': 'Middle East and North Africa',
    'PSE': 'Middle East and North Africa',
    'TUR': 'Middle East and North Africa',
    'UGA': 'Sub-Saharan Africa',
    'ZAF': 'Sub-Saharan Africa',
    'TWN': 'East Asia and the Pacific',
    'SGP': 'East Asia and the Pacific',
    'JPN': 'East Asia and the Pacific',
    'CHN': 'East Asia and the Pacific',
    'MEX': 'North America',
    'ESP': 'Europe',
    'CAN': 'North America',
    'NLD': 'Europe',
    'DEU': 'Europe',
    'CHL': 'South America',
    'BRA': 'South America',
    'ARG': 'South America',
    'AUS': 'East Asia and the Pacific',
    'NZL': 'East Asia and the Pacific'
}

# Compute the null mask once and use .loc on both sides of the assignment.
missing_region = data['region'].isnull()
data.loc[missing_region, 'region'] = data.loc[missing_region, 'iso'].map(country2missing_regions)
data['region'].isnull().sum()

In [None]:
region_counts = data.region.value_counts()[::-1]
region_counts.plot.barh()

In [None]:
inp_data = data

## Set tgt variables for logistic regression

In [None]:
inp_data.econ_wide_net_zero = inp_data.econ_wide_net_zero.notnull()
inp_data.econ_wide_net_zero.value_counts()

In [None]:
s = '1,4,1,2,4,1,4,1'
fix_tgts = lambda s: ','.join(set([x.strip() for x in s.split(',')]))
fix_tgts(s)

In [None]:
import numpy as np

def convert_tgt(tgt):
    """Normalise a target-status value to a sorted, de-duplicated, comma-separated string.

    Args:
        tgt: a comma-separated string, a list of such strings (missing entries
            allowed), or any other value.

    Returns:
        For a string: its comma-separated parts, stripped, de-duplicated and
        sorted, joined with ','. For a list: the same, applied to the
        concatenation of its non-missing entries. Any other value is returned
        unchanged.
    """
    if isinstance(tgt, list):
        # NaN is the only value that differs from itself. The old `x is not np.nan`
        # identity test only caught the np.nan singleton, so other NaN objects
        # reached sorted()/join() and raised TypeError.
        present = [x for x in tgt if x is not None and x == x]
        return convert_tgt(','.join(sorted(present)))
    elif isinstance(tgt, str):
        codes = {part.strip() for part in tgt.split(',')}
        return ','.join(sorted(codes))
    else:
        return tgt

inp_data.net_zero_target_status = inp_data.net_zero_target_status.apply(convert_tgt)
inp_data.net_zero_target_status.unique()

In [None]:
inp_data.net_zero_target_status.value_counts(dropna=False)

In [None]:
inp_data['legislative_commitment'] = inp_data.net_zero_target_status.str.contains('2')
inp_data['legislative_commitment'].value_counts()

In [None]:
inp_data[inp_data['percent_reduction'].apply(type) == list].columns

In [None]:
def get_reduction_tgt(pct_reduction):
    """Return a single percent-reduction figure for one value.

    Args:
        pct_reduction: a number, a list of numbers, a JSON string encoding
            either of those, or a missing value (None / NaN).

    Returns:
        The largest non-missing entry for a list, the number itself for a
        scalar, and 0 when nothing usable is present.
    """
    if isinstance(pct_reduction, str):
        pct_reduction = json.loads(pct_reduction)
    if isinstance(pct_reduction, list):
        # max() over a list containing NaN depends on element order
        # (max([nan, 90]) is nan), so missing entries are dropped first.
        # An empty or all-missing list falls back to 0 instead of raising.
        valid = [p for p in pct_reduction if p is not None and not np.isnan(p)]
        return max(valid) if valid else 0
    elif pct_reduction is None or np.isnan(pct_reduction):
        return 0
    else:
        return pct_reduction
    
inp_data['AGGRESSIVE_TARGET'] = inp_data['percent_reduction'].apply(get_reduction_tgt) > 80
inp_data['AGGRESSIVE_TARGET'].value_counts()

In [None]:
def combinations(city):
    """List every contiguous run of words in `city`, lower-cased.

    The name is split on single spaces; runs are ordered by start position and
    then by increasing length.
    """
    words = city.lower().split(' ')
    n_words = len(words)
    return [
        ' '.join(words[start:stop])
        for start in range(n_words)
        for stop in range(start + 1, n_words + 1)
    ]

combinations('new york city')

## Preprocess Raw Text into Tokens

In [None]:
all_cities = set(inp_data[inp_data['city_name'].notnull()]['city_name'].apply(combinations).explode()).union({'bay'})
all_cities

In [None]:
all_countries = set(inp_data[inp_data['country'].notnull()]['country'].apply(combinations).explode())
all_countries

In [None]:
'york' in all_cities

In [None]:
# Extra corpus-specific words to ignore during tokenization.
# A missing comma after 'nov' made Python concatenate the adjacent literals into
# the single entry 'novmontana', so neither 'nov' nor 'montana' was filtered.
stop_words = {
    'page',
    'ave',
    'nov',
    'montana',
    'sardinia',
    'ooo',
}

In [None]:
ignore_words = all_cities.union(all_countries).union(stop_words)

In [None]:
def include_token(tok):
    """Decide whether a token is kept for the bag-of-words features.

    A token is kept only if its lower-cased text is not in `ignore_words`, it
    has a vector, is alphabetic and not punctuation, is longer than two
    characters, and is not a pronoun (by POS tag or by the '-pron-' lemma).
    """
    if tok.text.lower() in ignore_words:
        return False
    if not tok.has_vector or tok.is_punct or not tok.is_alpha:
        return False
    if len(tok.text) <= 2:
        return False
    return tok.pos_ != 'PRON' and tok.lemma_.lower() != '-pron-'

for sent in inp_data['doc'].iloc[2].sents:
    toks = [tok for tok in sent if include_token(tok)]
    if len(toks):
        print('Got')
        print(toks)

In [None]:
from tqdm import tqdm
tqdm.pandas()

def get_tokens(doc):
    """Return the tokens of `doc` that pass `include_token`.

    Args:
        doc: an iterable of tokens, or a missing value (None / NaN).

    Returns:
        The kept tokens as a list; an empty list for a missing `doc`.
    """
    # `doc is not np.nan` only recognised the np.nan singleton; any other NaN
    # object, or None, fell through and raised TypeError when iterated.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return []
    return [tok for tok in doc if include_token(tok)]

inp_data['tokens'] = inp_data['doc'].progress_apply(get_tokens)

In [None]:
def get_lemmatized_tokens(tokens):
    """Return the lower-cased `lemma_` of each token, in order."""
    lemmas = []
    for token in tokens:
        lemmas.append(token.lemma_.lower())
    return lemmas

inp_data['lemmatized_tokens'] = inp_data['tokens'].apply(get_lemmatized_tokens)

## Filter out shorter documents

In [None]:
(inp_data['lemmatized_tokens'].apply(len) < 100).value_counts()

In [None]:
inp_data = inp_data[(inp_data['lemmatized_tokens'].apply(len) > 100)]
inp_data = inp_data[~inp_data.raw_text.duplicated()]

In [None]:
inp_data['econ_wide_net_zero'].apply(bool).value_counts()

In [None]:
inp_data['region'].value_counts()

In [None]:
inp_data.econ_wide_net_zero.notnull().value_counts()

## Featurize input text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

help(CountVectorizer)

In [None]:
def dummy(doc):
    """Identity function: return `doc` unchanged."""
    return doc

# `dummy` is passed as both tokenizer and preprocessor, and both vectorizers are
# fitted on inp_data['lemmatized_tokens'].
# NOTE(review): presumably this lets CountVectorizer consume the pre-tokenized
# lemma lists as-is — confirm.
cv = CountVectorizer(stop_words='english', tokenizer=dummy, preprocessor=dummy, min_df=0.05)
# Same setup, but with unigrams and bigrams (ngram_range=(1,2)) and min_df=0.1 instead of 0.05.
cv_2grams = CountVectorizer(stop_words='english', tokenizer=dummy, preprocessor=dummy, ngram_range=(1,2), min_df=0.1)
cv.fit(inp_data['lemmatized_tokens'])
cv_2grams.fit(inp_data['lemmatized_tokens'])

In [None]:
inp_data['region'].value_counts().plot.barh()

In [None]:
len(cv.vocabulary_)

In [None]:
len(cv_2grams.vocabulary_)

In [None]:
x = cv.transform(inp_data['lemmatized_tokens'])
x_2g = cv_2grams.transform(inp_data['lemmatized_tokens'])

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

help(TfidfTransformer)

In [None]:
# One TfidfTransformer instance is used for both count matrices.
tfidf = TfidfTransformer(use_idf=True, sublinear_tf=True)
normalized_x = tfidf.fit_transform(x)
# fit_transform is called again on the same object, so after this line `tfidf`
# holds the fit for x_2g only; normalized_x was already computed from the first fit.
normalized_x2g = tfidf.fit_transform(x_2g)

## Serialize Intermediate data structures for analysis

In [None]:
inp_data['lemmatized_text'] = inp_data['lemmatized_tokens'].apply(lambda l: ' '.join(l))
inp_data['doc_lengths'] = inp_data['lemmatized_tokens'].apply(len)

In [None]:
inp_data[['city_name', 'region', 'coordinator_name', 'doc_lengths', 
          'econ_wide_net_zero', 'AGGRESSIVE_TARGET']].to_csv('serialize/inp_data.csv')

In [None]:
import pickle 
with open('serialize/x.pickle', 'wb') as out:
    pickle.dump(x, out)

In [None]:
with open('serialize/x_2g.pickle', 'wb') as out:
    pickle.dump(x_2g, out)

In [None]:
with open('serialize/normalized_x.pickle', 'wb') as out:
    pickle.dump(normalized_x, out)

In [None]:
with open('serialize/normalized_x2g.pickle', 'wb') as out:
    pickle.dump(normalized_x2g, out)

In [None]:
with open('serialize/vocabulary.json', 'w') as out:
    json.dump(cv.vocabulary_, out)
    
with open('serialize/vocabulary_2g.json', 'w') as out:
    json.dump(cv_2grams.vocabulary_, out)