# Getting newspaper data from COCA corpus

In [1]:
import pandas as pd
import lucem_illud_2020
import zipfile
import re

In [2]:
# set filepath
COCA = "/Users/rachelker/Documents/UChic MSCAPP/Curriculum/2019-20 Winter/Computational Content Analysis/Project/Data/Davies Corpora/COCA/"
newspapers_path = COCA + 'text_newspaper_lsp.zip'
newspapers_path

'/Users/rachelker/Documents/UChic MSCAPP/Curriculum/2019-20 Winter/Computational Content Analysis/Project/Data/Davies Corpora/COCA/text_newspaper_lsp.zip'

## Getting texts

In [89]:
def get_newspaper_df(newspapers_path, topic_filter):    
    # get all text
    newspapers_data = load_newspapers(newspapers_path)    
    all_news = []
    [all_news.extend(newspapers_data[k]) for k in newspapers_data.keys()]
        
    # load into dataframe
    df = pd.DataFrame(all_news, columns=['text'])
    # get text_ids
    df['text_id'] = df.apply(lambda x: get_textid(x), axis=1)

    df['text'] = df.apply(lambda x: x['text'].decode('utf-8', errors='ignore'), axis=1)
    df = df[df['text'].str.contains(topic_filter)]
    df.set_index('text_id', inplace=True)
    return df

def get_textid(row):
        text = row['text']
        text = text.decode('utf-8', errors='ignore')
        text_id = re.search('(?<=##)[0-9]+', text).group(0)
        return str(text_id)
    
def load_newspapers(newspapers_path):
    newspaper_raw = zipfile.ZipFile(newspapers_path)
    newspaper_data = {}

    for file in newspaper_raw.namelist():
        newspaper_data[file] = []
        with newspaper_raw.open(file) as f:
            i=0
            for line in f:
                #skip first line of every file
                if i==0:
                    i+=1 
                    continue
                newspaper_data[file].append(line)
#    for k in newspaper_data.keys():
#        print("{}: {}".format(k, len(newspaper_data[k])))
    return newspaper_data

In [116]:
newspaper_df = get_newspaper_df(newspapers_path, '')
# total of 57037 articles

In [90]:
immigra_newspaper_df = get_newspaper_df(newspapers_path, 'immigra')
migra_newspaper_df = get_newspaper_df(newspapers_path, 'migra')

In [93]:
immigra_newspaper_df
# 3335

Unnamed: 0_level_0,text
text_id,Unnamed: 1_level_1
3001109,##3001109 <p> The people who work behind the p...
3001113,##3001113 <p> Following are excerpts from a tr...
3001115,##3001115 <p> It started over some plantains a...
3001128,"##3001128 <p> Sung Soo Kim , a native of South..."
3001901,"##3001901 <p> Lucien Paye , head of the O.E.C...."
...,...
4115318,"##4115318 For the second time in a month , the..."
4115330,##4115330 Sen. Scott Brown ( R ) of Massachuse...
4115348,##4115348 Insights into the minds of Westerner...
4115354,##4115354 An Obama administration announcement...


In [94]:
migra_newspaper_df
# 4841

Unnamed: 0_level_0,text
text_id,Unnamed: 1_level_1
3001109,##3001109 <p> The people who work behind the p...
3001113,##3001113 <p> Following are excerpts from a tr...
3001115,##3001115 <p> It started over some plantains a...
3001128,"##3001128 <p> Sung Soo Kim , a native of South..."
3001901,"##3001901 <p> Lucien Paye , head of the O.E.C...."
...,...
4115348,##4115348 Insights into the minds of Westerner...
4115354,##4115354 An Obama administration announcement...
4115356,##4115356 As Syria devolves into what the UN p...
4115364,##4115364 I am the proud son of a hardworking ...


## Getting metadata

In [9]:
def get_news_sources():
    cols = ['text_id', 'word_count', 'year', 'genre', 'subgen', 'source', 'title', 'publication_info']

    zfile = zipfile.ZipFile(COCA + "sources.zip")
    source = []
    for file in zfile.namelist():
        with zfile.open(file) as f:
            for line in f:
                line = line.decode('utf-8', errors='ignore').split("\t")
                if len(line)>4 and line[3] == 'NEWS':
                    source.append(line)
    
    sources_df = pd.DataFrame(source, columns=cols)
    sources_df.set_index('text_id', inplace=True)
    return sources_df

In [11]:
sources_df = get_news_sources()

In [12]:
sources_df
#80,017 sources

Unnamed: 0_level_0,word_count,year,genre,subgen,source,title,publication_info
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3000001,1179,1990,NEWS,141,NYTimes,Piniella Eager to Put Pinstripes Behind\r\n,
3000002,1153,1990,NEWS,141,NYTimes,Grand Prix Indoor Circuit Goes Down to the Fin...,
3000003,1067,1990,NEWS,141,NYTimes,"Flamingos, Palms and Silence\r\n",
3000004,957,1990,NEWS,141,NYTimes,Arbitration List\r\n,
3000005,1045,1990,NEWS,141,NYTimes,RESULTS PLUS\r\n,
...,...,...,...,...,...,...,...
4198680,827,2017,NEWS,,Chicago Sun-Times,NORMAN CHAD: Celebrating two of sports world's...,
4198681,3010,2017,NEWS,,Charlotte Observer,Has Duke replaced Kentucky as the king of coll...,
4198682,86,2017,NEWS,,Charlotte Observer,Welfare check by police leads to discovery of ...,
4198683,1057,2017,NEWS,,Baltimore Sun,Dozens of Baltimore kids plan to march in supp...,


## Merge text and meta data

In [95]:
# only take articles that have both text and metadata
immigra_merged_df = immigra_newspaper_df.join(sources_df, how='inner')
migra_merged_df = migra_newspaper_df.join(sources_df, how='inner')

In [118]:
all_merged_df = newspaper_df.join(sources_df, how='inner')

In [119]:
len(all_merged_df)
# total: 57,026 articles

57026

In [97]:
print(len(migra_merged_df), len(immigra_merged_df))

4841 3335


In [124]:
migra_merged_df.groupby('year').size()
# about 100-200+ articles a year

year
1990    191
1991    216
1992    183
1993    206
1994    239
1995    183
1996    224
1997    214
1998    191
1999    221
2000    193
2001    212
2002    206
2003    205
2004    215
2005    221
2006    275
2007    246
2008    222
2009    226
2010    223
2011    213
2012    116
dtype: int64

In [125]:
immigra_merged_df.groupby('year').size()
# about 100 a year

year
1990    111
1991    132
1992    118
1993    129
1994    152
1995    124
1996    155
1997    154
1998    135
1999    151
2000    138
2001    150
2002    142
2003    149
2004    158
2005    148
2006    209
2007    183
2008    161
2009    155
2010    159
2011    139
2012     83
dtype: int64

In [126]:
immigra_merged_df.groupby('year').size()/all_merged_df.groupby('year').size()*100

year
1990    3.937567
1991    4.600906
1992    4.761905
1993    5.056840
1994    6.386555
1995    5.059160
1996    6.666667
1997    6.392694
1998    5.681818
1999    6.175869
2000    6.292750
2001    7.085498
2002    6.200873
2003    6.619280
2004    7.019103
2005    6.141079
2006    8.822288
2007    6.421053
2008    5.613668
2009    5.398816
2010    5.254461
2011    4.846583
2012    5.323926
dtype: float64

## Data cleaning

In [193]:
data = immigra_merged_df.reset_index()

In [194]:
def clean_title(row):
    title = row['title']
    remove = ['\r','\n','^']
    for string in remove:
        title = title.replace(string,'')
    return title

def clean_text(row):
    text = row['text']
    text_id = row['text_id']
    remove = ['\r','\n','<p>','@','##{}'.format(text_id)]
    for string in remove:
        text = text.replace(string,'')
    return text

In [195]:
data['title'] = data.apply(lambda x: clean_title(x), axis=1)
data['text'] = data.apply(lambda x: clean_text(x), axis=1)

In [196]:
data

Unnamed: 0,text_id,text,word_count,year,genre,subgen,source,title,publication_info
0,3001109,The people who work behind the pebble-glass ...,1960,1990,NEWS,138,NYTimes,"In School Bureaucracy, Despair at the System",
1,3001113,Following are excerpts from a transcript of ...,2275,1990,NEWS,138,NYTimes,'This City Is Sick of Violence': Dinkins's Add...,
2,3001115,"It started over some plantains and peppers ,...",1840,1990,NEWS,138,NYTimes,Black-Korean Who-Pushed-Whom Festers,
3,3001128,"Sung Soo Kim , a native of South Korea , doe...",1509,1990,NEWS,138,NYTimes,Immigrants Help Others Lost in Maze,
4,3001901,"Lucien Paye , head of the O.E.C.D. , which g...",714,1990,NEWS,142,NYTimes,FOREIGN AFFAIRS;The People Threat,
...,...,...,...,...,...,...,...,...,...
3330,4115318,"For the second time in a month , the US Supre...",1232,2012,NEWS,135,CSMonitor,Arizona immigration law: states vs. Obama at U...,\r\n
3331,4115330,Sen. Scott Brown ( R ) of Massachusetts said ...,1126,2012,NEWS,135,CSMonitor,Brown calls on Harvard to correct record on El...,\r\n
3332,4115348,Insights into the minds of Westerners who hav...,1152,2012,NEWS,135,CSMonitor,American jihadi in Somalia writes an autobiogr...,\r\n
3333,4115354,An Obama administration announcement met by c...,1006,2012,NEWS,135,CSMonitor,Why some illegal immigrants arent celebrating ...,\r\n


In [199]:
data.to_csv('data/immigra_coca_news.csv', index=False)