# Language Feature Analysis - News

### Author 
Stephen Lee

### Goal
Find language features of the following news articles: 
- Fox News
- Vox News
- PBS News

### Date 
3.4.19

## Read Data, Remove Missing Values

In [1]:
import pandas as pd
import os

In [2]:
DATASET_PATH = "/home/smlee_981/data/"

In [3]:
FILE = 'clean_article_df.csv'

In [4]:
os.chdir(DATASET_PATH)

In [5]:
df = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)

In [6]:
df.head(2)

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3


In [7]:
df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,476,476,476,476
PBS,1739,1739,1739,1739
Vox,1027,1027,1027,1027


In [8]:
df[df['article'].isnull()].head()

Unnamed: 0,article id,source,article,clean_articles,targets


## Find Language Features

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go 
from plotly import tools 
from wordcloud import STOPWORDS 
from collections import defaultdict 
py.init_notebook_mode(connected=True)

In [10]:
fox = df[df['source'] == "Fox"]
vox = df[df['source'] == "Vox"]
pbs = df[df['source'] == "PBS"]

### Define Helpers

#### generate n grams or 'tokens' of various length


In [11]:
def generate_ngrams(txt, n_gram=1):
    token = [t for t in txt.lower().split(' ') if ((t not in STOPWORDS) and (t.isalpha()))]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(g) for g in ngrams]

#### make a horizontal plot

In [12]:
def hz_chart(df, color):
    trace = go.Bar(
        y = df['token'].values[::-1],
        x = df['count'].values[::-1], 
        showlegend=False,
        orientation = 'h', 
        marker=dict(color=color))
    return trace

#### prepare the plot


In [13]:
def plt_freq(txt, n_grams=1, number=15, color='blue'):
    freq = defaultdict(int)
    for q in txt:
        for ngram in generate_ngrams(q, n_grams):
            freq[ngram] += 1
    df_sorted = pd.DataFrame(sorted(freq.items(), key=lambda x: x[1])[::-1])
    df_sorted.columns = ['token', 'count']
    return hz_chart(df_sorted.head(number), color)

#### get ngram frequencies

In [14]:
def ngram_freq(txt, n_grams=1, number=15, color='blue'):
    freq = defaultdict(int)
    for q in txt:
        for ngram in generate_ngrams(q, n_grams):
            freq[ngram] += 1
    df_sorted = pd.DataFrame(sorted(freq.items(), key=lambda x: x[1])[::-1])
    df_sorted.columns = ['token', 'count']
    return df_sorted

#### display plot

In [15]:
def disp_freq_plot(word_freqs, title):
    fig = tools.make_subplots(rows=1, cols=1, vertical_spacing=0.04)
    fig.append_trace(word_freqs, 1, 1)
    fig['layout'].update(height=900, width=600, title=title)
    fig['layout'].update(font=dict(size=20), margin=dict(r=20, l=300, b=75, t=125))
    py.iplot(fig, filename='counts')

### Frequent Words (ngram = 1)

In [16]:
fox_counts = plt_freq(fox['article'], n_grams=1, number=7)
vox_counts = plt_freq(vox['article'], n_grams=1, number=7)
pbs_counts = plt_freq(pbs['article'], n_grams=1, number=7)

disp_freq_plot(fox_counts, "Word Counts - Fox News")

This is the format of your plot grid:
[ (1,1) x1,y1 ]



In [17]:
disp_freq_plot(vox_counts, "Word Counts - Vox News")

This is the format of your plot grid:
[ (1,1) x1,y1 ]



In [18]:
disp_freq_plot(pbs_counts, "Word Counts - PBS News")

This is the format of your plot grid:
[ (1,1) x1,y1 ]



In [64]:
fox_counts = plt_freq(fox['article'], n_grams=2, number=7)
vox_counts = plt_freq(vox['article'], n_grams=2, number=7)
pbs_counts = plt_freq(pbs['article'], n_grams=2, number=7)

disp_freq_plot(fox_counts, "Phrase Counts - Fox News")

This is the format of your plot grid:
[ (1,1) x1,y1 ]



In [65]:
disp_freq_plot(vox_counts, "Phrase Counts - Vox News")

This is the format of your plot grid:
[ (1,1) x1,y1 ]



In [66]:
disp_freq_plot(pbs_counts, "Phrase Counts - PBS News")

This is the format of your plot grid:
[ (1,1) x1,y1 ]



In [34]:
fox_ngrams = ngram_freq(fox['article'], n_grams=1, number=20)
vox_ngrams = ngram_freq(vox['article'], n_grams=1, number=20)
pbs_ngrams = ngram_freq(pbs['article'], n_grams=1, number=20)

fox_ngrams.head(5)

Unnamed: 0,token,count
0,trump,1596
1,said,1437
2,president,1253
3,new,1116
4,house,1110


In [35]:
vox_ngrams.head()

Unnamed: 0,token,count
0,trump,5446
1,tax,5206
2,will,4098
3,people,4013
4,health,4003


In [36]:
pbs_ngrams.head()

Unnamed: 0,token,count
0,trump,7811
1,said,7383
2,president,3997
3,house,3495
4,will,3079


### Frequent Phrases (ngram = 2)

In [37]:
fox_ngrams = ngram_freq(fox['article'], n_grams=2, number=20)
vox_ngrams = ngram_freq(vox['article'], n_grams=2, number=20)
pbs_ngrams = ngram_freq(pbs['article'], n_grams=2, number=20)

In [38]:
fox_ngrams.head()

Unnamed: 0,token,count
0,white house,464
1,president trump,255
2,new york,247
3,green new,179
4,associated press,143


In [39]:
vox_ngrams.head()

Unnamed: 0,token,count
0,health care,1654
1,white house,743
2,trump administration,672
3,donald trump,598
4,health insurance,479


In [40]:
pbs_ngrams.head()

Unnamed: 0,token,count
0,white house,1683
1,president donald,1297
2,donald trump,1035
3,special counsel,613
4,supreme court,584


### Frequent Phrases (ngram = 3)

In [41]:
fox_ngrams = ngram_freq(fox['article'], n_grams=3, number=20)
vox_ngrams = ngram_freq(vox['article'], n_grams=3, number=20)
pbs_ngrams = ngram_freq(pbs['article'], n_grams=3, number=20)

In [42]:
fox_ngrams.head()

Unnamed: 0,token,count
0,associated press contributed,131
1,green new deal,104
2,house speaker nancy,75
3,state union address,56
4,partial government shutdown,55


In [43]:
vox_ngrams.head()

Unnamed: 0,token,count
0,affordable care act,222
1,president donald trump,157
2,congressional budget office,127
3,health care bill,121
4,new york times,115


In [44]:
pbs_ngrams.head()

Unnamed: 0,token,count
0,president donald trump,785
1,special counsel robert,396
2,majority leader mitch,179
3,attorney general jeff,139
4,sarah huckabee sanders,137
