Basic script that takes the text segments, runs them through our text processor, and persists the processed results to the database.

In [1]:
#Basic imports
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import *
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
#Database imports and credentials
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [3]:
#Establish database connection
dbname = 'beforeiagree_db'
username = 'peterostendorp'

#Raw psycopg2 connection (handed to pandas for read queries)
con = psycopg2.connect(database = dbname, user = username)

#Create engine (SQLAlchemy handle, needed by DataFrame.to_sql for writes).
#The dialect has to be spelled 'postgresql': SQLAlchemy 1.4+ dropped the legacy
#'postgres://' alias and raises NoSuchModuleError for it, while 'postgresql://'
#is accepted by every SQLAlchemy version.
engine = create_engine('postgresql://%s@localhost/%s'%(username,dbname))
print(engine.url)

postgres://peterostendorp@localhost/beforeiagree_db


## Processing segments

In [4]:
#Pull every segment whose policy is flagged as belonging to the OPP-115 set.
#The sub-select on `sites` yields the qualifying policy ids.
sql = """
SELECT * FROM segments
WHERE "Policy UID" IN 
(SELECT "Policy UID" FROM sites
WHERE "In 115 Set?" = TRUE)
"""

segments = pd.read_sql_query(sql=sql, con=con)

In [5]:
#Text-cleaning function applied to every segment before it is persisted.
#Segments need only light processing: markup/punctuation removal, stopword
#filtering and stemming.
def text_process_policy(doc):
    """
    Clean one policy text segment and return it as a single string.

    Steps, in order:
    1. remove HTML tags (only the tags themselves; surrounding text is kept)
    2. replace the '|||' markers inserted into corpus documents with a space
    3. remove ASCII punctuation (string.punctuation)
    4. lowercase the words and drop English stopwords
    5. stem the remaining words with the English Snowball stemmer

    Parameters
    ----------
    doc : str
        Raw segment text. None and NaN (e.g. a NULL coming back from the
        database) are treated as an empty document.

    Returns
    -------
    str
        The processed words joined by single spaces; '' if nothing remains.
    """
    # NaN is the only value that is not equal to itself, so `doc != doc`
    # catches float NaN without needing a type check.
    if doc is None or doc != doc:
        return ''

    # Strip the tags themselves. Filtering whole whitespace-separated tokens
    # would throw away text glued to a tag ('<br>Hello') and would miss tags
    # containing spaces ('<a href="x">'). Requiring a letter, '/' or '!' right
    # after '<' keeps comparisons such as 'x < 5 and y > 3' intact.
    text = re.sub(r'</?[A-Za-z!][^<>]*>', ' ', doc)

    # '|||' is a separator, so turn it into whitespace *before* punctuation is
    # stripped; otherwise the words on either side would be fused together.
    text = text.replace('|||', ' ')

    punctuation = set(string.punctuation)
    text = ''.join(char for char in text if char not in punctuation)

    # Load the stopword list once per call (not once per word) and use a set
    # for constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    sn = SnowballStemmer(language='english')

    lowered = (word.lower() for word in text.split())
    return ' '.join(sn.stem(word) for word in lowered if word not in stop_words)

In [6]:
#Run the cleaning function over every raw segment, overwriting the text column
segments['segments'] = segments['segments'].map(text_process_policy)

In [7]:
#Write the processed frame to its own table (replacing any previous copy) so it is easy to grab later
segments.to_sql(name='segments_processed', con=engine, if_exists='replace')