# Language Feature Analysis - News

### Author 
Stephen Lee

### Goal
Find language features of the following news articles: 
- Fox News
- Vox News
- PBS News

### Date 
3.4.19

## Read Data, Remove Missing Values

In [1]:
import pandas as pd
import os

In [2]:
DATASET_PATH = "/home/stephen/Dropbox/CodeWorkspace/data-sets/Thesis/"

In [3]:
FILE = "articles.csv"

In [4]:
os.chdir(DATASET_PATH)

In [5]:
df = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)

In [6]:
df.head(2)

Unnamed: 0,article id,source,article
0,fox_politics_166,Fox,Video\nBolton warns Venezuela's Maduro to stay...
1,fox_politics_390,Fox,Video\nOcasio-Cortez rallies to stop all fossi...


In [7]:
df.groupby('source').count()

Unnamed: 0_level_0,article id,article
source,Unnamed: 1_level_1,Unnamed: 2_level_1
Fox,731,731
PBS,1752,1752
Vox,2000,1938


In [8]:
df[df['article'].isnull()].head()

Unnamed: 0,article id,source,article
749,vox_politics_1692,Vox,
774,vox_politics_1923,Vox,
809,vox_politics_336,Vox,
821,vox_politics_1347,Vox,
936,vox_politics_1915,Vox,


In [9]:
df = df.dropna()

In [10]:
df.groupby('source').count()

Unnamed: 0_level_0,article id,article
source,Unnamed: 1_level_1,Unnamed: 2_level_1
Fox,731,731
PBS,1752,1752
Vox,1938,1938


#### Check for and remove duplicates

In [12]:
df.groupby("source").describe()

Unnamed: 0_level_0,article,article,article,article,article id,article id,article id,article id
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,731,476,Video\nTrump: The media refuses to acknowledge...,4,731,731,fox_politics_267,1
PBS,1752,1739,"It is messy, tentacled, and increasingly confu...",5,1752,1752,pbs_politics_1088,1
Vox,1938,1027,"Part of The 2018 midterm elections, explained",152,1938,1938,vox_politics_1643,1


In [13]:
df = df.drop_duplicates('article', keep='first')
df.groupby("source").describe()

Unnamed: 0_level_0,article,article,article,article,article id,article id,article id,article id
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,476,476,Video\nMcCabe says he briefed Congress on Trum...,1,476,476,fox_politics_267,1
PBS,1739,1739,ATLANTA — The first debate between Democrat St...,1,1739,1739,pbs_politics_1088,1
Vox,1027,1027,"It’s official: After months of empty threats, ...",1,1027,1027,vox_politics_1315,1


## Find Language Features

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go 
from plotly import tools 
from wordcloud import STOPWORDS 
from collections import defaultdict 
py.init_notebook_mode(connected=True)

In [15]:
fox = df[df['source'] == "Fox"]
vox = df[df['source'] == "Vox"]
pbs = df[df['source'] == "PBS"]

### Define Helpers

#### generate n grams or 'tokens' of various length


In [16]:
def generate_ngrams(txt, n_gram=1):
    token = [t for t in txt.lower().split(' ') if ((t not in STOPWORDS) and (t.isalpha()))]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(g) for g in ngrams]

#### make a horizontal plot

In [17]:
def hz_chart(df, color):
    trace = go.Bar(
        y = df['token'].values[::-1],
        x = df['count'].values[::-1], 
        showlegend=False,
        orientation = 'h', 
        marker=dict(color=color))
    return trace

#### prepare the plot


In [18]:
def plt_freq(txt, n_grams=1, number=15, color='blue'):
    freq = defaultdict(int)
    for q in txt:
        for ngram in generate_ngrams(q, n_grams):
            freq[ngram] += 1
    df_sorted = pd.DataFrame(sorted(freq.items(), key=lambda x: x[1])[::-1])
    df_sorted.columns = ['token', 'count']
    return hz_chart(df_sorted.head(number), color)

#### get ngram frequencies

In [19]:
def ngram_freq(txt, n_grams=1, number=15, color='blue'):
    freq = defaultdict(int)
    for q in txt:
        for ngram in generate_ngrams(q, n_grams):
            freq[ngram] += 1
    df_sorted = pd.DataFrame(sorted(freq.items(), key=lambda x: x[1])[::-1])
    df_sorted.columns = ['token', 'count']
    return df_sorted

#### display plot

In [20]:
def disp_freq_plot(word_freqs, title):
    title = [title]
    fig = tools.make_subplots(rows=1, cols=1, vertical_spacing=0.04, subplot_titles=title)
    fig.append_trace(word_freqs, 1, 1)
    fig['layout'].update(height=900, width=600, title='Word Counts')
    py.iplot(fig, filename='counts')

### Frequent Words (ngram = 1)

In [21]:
# fox_counts = plt_freq(fox['article'], n_grams=1, number=20)
# vox_counts = plt_freq(vox['article'], n_grams=1, number=20)
# pbs_counts = plt_freq(pbs['article'], n_grams=1, number=20)

# disp_freq_plot(fox_counts, "Fox News")

In [22]:
# disp_freq_plot(vox_counts, "Vox News")

In [23]:
# disp_freq_plot(pbs_counts, "PBS News")

In [24]:
fox_ngrams = ngram_freq(fox['article'], n_grams=1, number=20)
vox_ngrams = ngram_freq(vox['article'], n_grams=1, number=20)
pbs_ngrams = ngram_freq(pbs['article'], n_grams=1, number=20)

In [25]:
fox_ngrams.head()

Unnamed: 0,token,count
0,trump,1566
1,said,1437
2,president,1233
3,new,1111
4,house,1106


In [26]:
vox_ngrams.head()

Unnamed: 0,token,count
0,trump,5446
1,tax,5206
2,will,4098
3,people,4013
4,health,4003


In [27]:
pbs_ngrams.head()

Unnamed: 0,token,count
0,trump,7811
1,said,7383
2,president,3997
3,house,3495
4,will,3079


### Frequent Phrases (ngram = 2)

In [28]:
fox_ngrams = ngram_freq(fox['article'], n_grams=2, number=20)
vox_ngrams = ngram_freq(vox['article'], n_grams=2, number=20)
pbs_ngrams = ngram_freq(pbs['article'], n_grams=2, number=20)

In [29]:
fox_ngrams.head()

Unnamed: 0,token,count
0,white house,461
1,new york,246
2,president trump,242
3,green new,178
4,associated press,143


In [30]:
vox_ngrams.head()

Unnamed: 0,token,count
0,health care,1654
1,white house,743
2,trump administration,672
3,donald trump,598
4,tax cuts,479


In [31]:
pbs_ngrams.head()

Unnamed: 0,token,count
0,white house,1683
1,president donald,1297
2,donald trump,1035
3,special counsel,613
4,supreme court,584


### Frequent Phrases (ngram = 3)

In [32]:
fox_ngrams = ngram_freq(fox['article'], n_grams=3, number=20)
vox_ngrams = ngram_freq(vox['article'], n_grams=3, number=20)
pbs_ngrams = ngram_freq(pbs['article'], n_grams=3, number=20)

In [33]:
fox_ngrams.head()

Unnamed: 0,token,count
0,associated press contributed,131
1,green new deal,103
2,house speaker nancy,75
3,state union address,56
4,deputy attorney general,55


In [34]:
vox_ngrams.head()

Unnamed: 0,token,count
0,affordable care act,222
1,president donald trump,157
2,congressional budget office,127
3,health care bill,121
4,new york times,115


In [35]:
pbs_ngrams.head()

Unnamed: 0,token,count
0,president donald trump,785
1,special counsel robert,396
2,washington president donald,258
3,majority leader mitch,179
4,attorney general jeff,139
