## About

This script:
1. uses stanford named entity recognizer to find the proper names in my women article corpus and exports a new csv column with the article text without proper nouns.
2. pre-processes text 
3. Calculates sentiments
4. creates a DTM

## NER

In [8]:
import ner
import os
import re
import csv
from urllib import urlopen

In [2]:
import sys

csv.field_size_limit(sys.maxsize)
 

131072

In [3]:
tagger = ner.SocketNER(host='localhost', port=8080)

In [4]:
# test
entities = tagger.get_entities("The dictatorship of President Nicolae Ceausescu caused extreme hardships for all but a few hundred thousand of Rumania's 23 million citizens. But in the case of mothers and babies, his rule apparently had the most tragic consequences. Mr. Ceausescu, who was ousted in a popular uprising a month ago, decreed in 1967, two years after he came to power, that Rumania's population, then about 22 million, should increase to 30 million. The reason he gave was simply that he wanted a bigger Rumania - an assertion widely interpreted now as an early indication of his megalomania. And to achieve his goal he banned abortions, made contraception illegal and ordered that Rumanian women of child-bearing age have five children each. No Precise Accounting Harsh fines were ordered for women caught having abortions, and doctors or medical technicians who assisted in abortions were sentenced to up to four years in prison and prohibited from practicing for 10 years. In the latter years of the regime, women working in factories were subjected to pregnancy checks as often as once a week. There is as yet no precise accounting of how many Rumanians were adversely affected by such strictures. Officials of the new provisional Government and outside experts have only begun to gather data about what happened over the years. But the fragmentary figures and educated guesses that they have been able to provide depict a society of families torn by death and fear as a result of the decrees paradoxically meant to make them propagate. ''The policy was a total failure,'' said Dr. Timothy Rutter, a consultant for Murray Stopes International, a British charity that assi")
entities

{u'LOCATION': [u'Rumania', u'Rumania', u'Rumania'],
 u'ORGANIZATION': [u'Murray Stopes International'],
 u'PERSON': [u'Nicolae Ceausescu', u'Ceausescu', u'Timothy Rutter']}

In [5]:
for key in entities:
    entities[key] = set(entities[key])
entities

{u'LOCATION': {u'Rumania'},
 u'ORGANIZATION': {u'Murray Stopes International'},
 u'PERSON': {u'Ceausescu', u'Nicolae Ceausescu', u'Timothy Rutter'}}

In [12]:
#read csv and read into a list of dictionaries
women = []
with open('Data/Corpora/women-foreign.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        women.append(row)
women[1]

{'BYLINE': 'By DAVID BINDER, Special to The New York Times',
 'COUNTRY': 'ROMANIA\xc2\xa0(96%);',
 'COUNTRY_CODE': 'ROU',
 'COUNTRY_FINAL': 'Romania',
 'COUNTRY_MAJOR': 'ROMANIA',
 'COUNTRY_NR': 'ROMANIA\xc2\xa0(96%)',
 'COUNTRY_TOP_PERCENT': 'ROMANIA\xc2\xa0(96%)',
 'DATE': 'January 24, 1990',
 'LENGTH': '936 words',
 'PUBLICATION': 'NYT',
 'REGION': 'EECA',
 'SUBJECT': "ABORTION; BIRTH CONTROL AND FAMILY PLANNING; POPULATION; CHILDREN AND YOUTH; WOMEN; POLITICS AND GOVERNMENT \xc2\xa0PREGNANCY & CHILDBIRTH\xc2\xa0(90%);\xc2\xa0INFANTS & TODDLERS\xc2\xa0(90%);\xc2\xa0HEADS OF STATE & GOVERNMENT\xc2\xa0(90%);\xc2\xa0ABORTION\xc2\xa0(90%);\xc2\xa0WOMEN\xc2\xa0(90%); CHILDREN\xc2\xa0(89%);\xc2\xa0ORPHANS\xc2\xa0(89%);\xc2\xa0JAIL SENTENCING\xc2\xa0(88%);\xc2\xa0PARENTING\xc2\xa0(78%);\xc2\xa0WOMEN'S HEALTH\xc2\xa0(78%);\xc2\xa0POPULATION SIZE\xc2\xa0(78%);\xc2\xa0CONTRACEPTION\xc2\xa0(74%);\xc2\xa0LAW ENFORCEMENT\xc2\xa0(74%);",
 'TEXT': "The dictatorship of President Nicolae Ceausescu c

In [9]:
# remove named entities
for article in women:
    entities = tagger.get_entities(article['TEXT'])
    for key in entities:
        entities[key] = set(entities[key])
    article['entities'] = entities
    entities = [item for sublist in entities.values() for item in sublist]
    text = article['TEXT']
    for noun in entities:
        text = text.replace(noun.encode('utf-8'),'')
    article['TEXT-NO-NOUN'] = text

In [10]:
for i in women[1:100]:
    print i['REGION']

EECA
Asia
EECA
MENA
EECA
Africa
Africa
MENA
Asia
West
MENA
West
Asia
West
LA
Asia
Asia
Asia
Asia
West
MENA
Asia
Asia
West
LA
Asia
MENA
West
West
EECA
Africa
Africa
Asia
Africa
LA
EECA
MENA
Asia
EECA
West
Asia
Asia
West
Asia
West
Asia
Asia
Asia
West
West
Asia
Asia
West
Asia
MENA
EECA
West
West
LA
Africa
West
West
MENA
Asia
Asia
Asia
EECA
Africa
West
MENA
West
EECA
EECA
West
West
West
West
Asia
Asia
EECA
LA
Asia
West
West
West
Africa
Africa
West
MENA
West
MENA
West
Asia
West
Asia
West
Asia
West
West


## Entity analysis

In [11]:
for i in women[:10]:
    print [s for s in i['entities']]

[u'ORGANIZATION', u'LOCATION', u'PERSON']
[u'ORGANIZATION', u'LOCATION', u'PERSON']
[u'ORGANIZATION', u'LOCATION', u'PERSON']
[u'PERSON', u'LOCATION']
[u'ORGANIZATION', u'LOCATION', u'PERSON']
[u'ORGANIZATION', u'LOCATION', u'PERSON']
[u'ORGANIZATION', u'LOCATION', u'PERSON']
[u'ORGANIZATION', u'LOCATION', u'PERSON']
[u'ORGANIZATION', u'LOCATION', u'PERSON']
[u'ORGANIZATION', u'LOCATION']


In [12]:
asia = {'locations':[],'organization':[],'person':[]}
mena = {'locations':[],'organization':[],'person':[]}
africa = {'locations':[],'organization':[],'person':[]}
la = {'locations':[],'organization':[],'person':[]}
eeca = {'locations':[],'organization':[],'person':[]}
west = {'locations':[],'organization':[],'person':[]}

In [13]:
for i in women:
    if i['REGION'] == 'Asia':
        if u'LOCATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'LOCATION']]
            asia['locations'].append(x)
        if u'ORGANIZATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'ORGANIZATION']]
            asia['organization'].append(x)
        if u'PERSON' in i['entities'].keys():
            x = [s for s in i['entities'][u'PERSON']]
            asia['person'].append(x)
    if i['REGION'] == 'EECA':
        if u'LOCATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'LOCATION']]
            eeca['locations'].append(x)
        if u'ORGANIZATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'ORGANIZATION']]
            eeca['organization'].append(x)
        if u'PERSON' in i['entities'].keys():
            x = [s for s in i['entities'][u'PERSON']]
            eeca['person'].append(x)
    if i['REGION'] == 'MENA':
        if u'LOCATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'LOCATION']]
            mena['locations'].append(x)
        if u'ORGANIZATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'ORGANIZATION']]
            mena['organization'].append(x)
        if u'PERSON' in i['entities'].keys():
            x = [s for s in i['entities'][u'PERSON']]
            mena['person'].append(x)
    if i['REGION'] == 'West':
        if u'LOCATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'LOCATION']]
            west['locations'].append(x)
        if u'ORGANIZATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'ORGANIZATION']]
            west['organization'].append(x)
        if u'PERSON' in i['entities'].keys():
            x = [s for s in i['entities'][u'PERSON']]
            west['person'].append(x)
    if i['REGION'] == 'Africa':
        if u'LOCATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'LOCATION']]
            africa['locations'].append(x)
        if u'ORGANIZATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'ORGANIZATION']]
            africa['organization'].append(x)
        if u'PERSON' in i['entities'].keys():
            x = [s for s in i['entities'][u'PERSON']]
            africa['person'].append(x)
    if i['REGION'] == 'LA':
        if u'LOCATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'LOCATION']]
            la['locations'].append(x)
        if u'ORGANIZATION' in i['entities'].keys():
            x = [s for s in i['entities'][u'ORGANIZATION']]
            la['organization'].append(x)
        if u'PERSON' in i['entities'].keys():
            x = [s for s in i['entities'][u'PERSON']]
            la['person'].append(x)

In [14]:
for i in [asia,africa,mena,eeca,la,west]:
    for key in i.keys():
        i[key] = [s for sublist in i[key] for s in sublist]

In [15]:
from collections import Counter

In [16]:
for i in [asia,africa,mena,eeca,la,west]:
    for key in i.keys():
        i[key] = Counter(i[key])

In [17]:
africa['person']

Counter({u'Nelson Mandela': 27, u'Maureen Reagan': 19, u'Thabo Mbeki': 18, u'Reagan': 18, u'Mandela': 16, u'Bush': 14, u'Mbeki': 14, u'Daniel arap Moi': 10, u'Banda': 9, u'Alan L. Keyes': 8, u'Jacob Zuma': 8, u'Burkina Faso': 8, u'Ellen Johnson Sirleaf': 8, u'Zuma': 8, u'Kasinga': 7, u'Robert Mugabe': 7, u'Clinton': 7, u'Maathai': 7, u'bush': 7, u'Fauziya Kasinga': 7, u'Wangari Maathai': 7, u'Betty Friedan': 7, u'Obama': 7, u'Moi': 7, u'Arab': 7, u'Winnie Mandela': 6, u'Nicholas D. Kristof': 6, u'Karen Musalo': 6, u'Mohamed': 5, u'Keyes': 5, u'Jeane Kirkpatrick': 5, u'Johnson Sirleaf': 5, u'Muslim': 5, u'Musalo': 5, u'Charles Taylor': 5, u'Hutu': 5, u'Mugabe': 4, u'Yoweri Museveni': 4, u'Michelle Obama': 4, u'Denis Mukwege': 4, u'F. W. de Klerk': 4, u'Jimmy Carter': 4, u'Leymah Gbowee': 4, u'Adelaide Abankwah': 4, u'Boko Haram': 4, u'Nancy Clark Reynolds': 4, u'George': 4, u'Luo': 4, u'Linda Chavez': 4, u'Javier Perez de Cuellar': 4, u'Lindy Boggs': 4, u'Samuel Doe': 4, u'Kennedy': 4, 

In [18]:
import pprint, pickle

In [19]:
def ld_writeDicts(filePath,dict):  
    f=open(filePath,'w')  
    newData = pickle.dumps(dict, 1)  
    f.write(newData)  
    f.close()

In [20]:
ld_writeDicts('asia',asia)  
ld_writeDicts('africa',africa) 
ld_writeDicts('west',west) 
ld_writeDicts('la',la) 
ld_writeDicts('eeca',eeca) 
ld_writeDicts('mena',mena)

## Calculate Sentiments

In [15]:
from nltk import word_tokenize
from nltk import bigrams
from nltk import trigrams
from nltk import ngrams
from nltk.stem import PorterStemmer
import nltk

In [16]:
pt = PorterStemmer()

stop_words = urlopen('http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop').read().split('\n')
pos_words = urlopen('http://www.unc.edu/~ncaren/haphazard/positive.txt').read().split('\n')
neg_words = urlopen('http://www.unc.edu/~ncaren/haphazard/negative.txt').read().split('\n')

pos_stem_pt = map(pt.stem, pos_words)
neg_stem_pt = map(pt.stem, neg_words)

In [20]:
# preprocessing and calculate sentiments 

for article in women:
    
    i = article['TEXT-NO-NOUN']
    
    # processing
    i = re.sub('\W', ' ', i) # remove punctuation
    i = i.lower() # remove capitalization
    i = word_tokenize(i) # tokenize the words
    i = [x for x in i if x not in stop_words] # remove stop words
    i = map(pt.stem, i)
    
    article['text-stemmed'] = i
    
    # counting
    i_pt = map(pt.stem, i)
    article['number of porter stemmed positive words'] = len([x for x in i_pt if x in pos_stem_pt])
    article['number of porter stemmed negative words'] = len([x for x in i_pt if x in neg_stem_pt])
    article['number of non-stop words'] = len(i) # Number of non-stop words spoken

## Write File

In [21]:
keys = women[1].keys()
keys

['PUBLICATION',
 'TITLE',
 'COUNTRY',
 'number of porter stemmed positive words',
 'DATE',
 'BYLINE',
 'entities',
 'COUNTRY_CODE',
 'TEXT',
 'COUNTRY_FINAL',
 'number of non-stop words',
 'YEAR',
 'COUNTRY_MAJOR',
 'TEXT-NO-NOUN',
 'REGION',
 'COUNTRY_NR',
 'text-stemmed',
 'number of porter stemmed negative words',
 'LENGTH',
 'COUNTRY_TOP_PERCENT',
 'TYPE',
 'SUBJECT']

In [22]:
with open('Data/Corpora/women-processed.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(women)

## Calculate DTM

In [15]:
# count number of unigrams
unigrams = {}
used = []

for article in women:
    for word in article['text-stemmed']:
        if word in unigrams:
                unigrams[word] += 1 
        if word not in unigrams and word not in used:
                unigrams[word] = 1 # add the word to unigrams for the first time
                used.append(word) # add the word to used, which is just a list of the words used
len(set(unigrams)) # the number of the unique unigrams

32493

In [16]:
from heapq import nlargest
import operator

In [17]:
top_uni_nu = nlargest(10000, unigrams.iteritems(), key=operator.itemgetter(1))
top_uni = [x[0] for x in top_uni_nu]

In [18]:
top_uni_nu[-10:-1] # what are the bottom 10 unigrams?

[('prudent', 8),
 ('cock', 8),
 ('ruth', 8),
 ('unaffect', 8),
 ('unquest', 8),
 ('sanitarium', 8),
 ('dam', 8),
 ('texa', 8),
 ('concili', 8)]

In [19]:
# create lists of counts of unigrams per document
uni_dtm = []
for article in women:
    pub = article['PUBLICATION']
    vec = [pub + '-' + article['REGION'] + str(article['YEAR']) + '-' + str(women.index(article)),]
    # make a list of unigrams in that statement and count them
    for i in top_uni:
        count = article['text-stemmed'].count(i)
        vec.append(count)
    uni_dtm.append(vec)   

In [20]:
uni_dtm[4000][:10]

['WP-West2006-4000', 6, 3, 0, 6, 9, 3, 1, 0, 0]

In [21]:
dtm = open('Data/dtm-python.csv', 'w')
row = 'region'
for i in top_uni:
    row = row + ',' + i
dtm.write(row + '\n')

In [22]:
# write rows

for i in range(len(uni_dtm)):
    x = str(uni_dtm[i])
    x = x.replace('[','')
    x = x.replace(']','')
    dtm.write(x + '\n')

In [23]:
dtm.close()