# Descriptive analysis

The dataset is scraped by a parser that retreives the articles. There could be a number of issues that happen by retrieving the data. The following questions are asked that are answered by descriptive analysis.

1. Evaluate parser?
> 1. Are there incorrectly scraped articles that have no content?
> 2. Are there incorrectly scraped articles that have too much content?
> 3. Is there some decision boundary that could be used for low populated articles?
> 4. Is there a difference in size for each publisher?

2. Evaluate content?
> 1. Is there text content that is invalid for each article?
> 2. What kind of text content types do we see

hash, ngram, tdidf, spacy


3. Evaluate representation?
> 1. How strong is the overlap between the bias and the publishers
> 2. Is the dataset skewed for each bias?


**TL DR**

There is no missing data by the retrieval of the parser. However there are a lot of problems indicated by the 

## 1. Setup

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import plotly.plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.linear_model import LinearRegression
import colorlover as cl
from ipywidgets import widgets
from IPython.display import clear_output

init_notebook_mode(connected=True)
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

In [3]:
# Scrape the text
df = pd.read_csv('data/full.csv.gz', 
                compression='gzip', 
                sep='\t',
                encoding='utf-8',
                index_col=0).dropna()

In [4]:
# Populate the dataset with the extra columns like bias/hyperpartisan and publisher
df['hyperpartisan'] = df.hyperpartisan.astype("category").cat.codes
df['bias_code'] = df.bias.astype("category").cat.codes
df['hyperpartisan_code'] = df['hyperpartisan'].apply(str)
df['bias'] = df['bias'].apply(str)
df['publisher'] = df['url'].str.split('/').str[2]

In [5]:
# Add stripped_text and length
df['text_stripped'] = df.textbody.str.strip()
df['text_length'] = df.text_stripped.str.len()


## 2. Evaluate parser

In [6]:
# Sample the dataset 
df_sample = df.sample(n=20000) 

In [7]:
# check if there is a decision boundary in the data based on the size
df_sample = df_sample \
    .sort_values(by='text_length', ascending=True) \
    .reset_index(drop=True)

data = [
    go.Bar(
        x=df_sample.index,
        y=df_sample['text_length']
    )
]

iplot(data)

In [8]:
publishers = df_sample.groupby(['publisher', 'bias', 'bias_code']) \
        .agg({'url':'size', 'text_length':'mean'}) \
        .rename(columns={'url':'count','text_length':'mean'}) \
        .sort_values(by='mean', ascending=True) \
        .reset_index()

colorscale = [
    ['right-center', 'rgb(178,223,138)'], 
    ['right', 'rgb(51,160,44)'], 
    ['least', 'rgb(31,120,180)'], 
    ['left', 'rgb(251,154,153)'], 
    ['left-center', 'rgb(227,26,28)']
]
data = [];
for i, color in enumerate(colorscale): 
    publisher_bias = publishers.loc[publishers['bias'] == color[0]]
    data.append(
        go.Bar(
            name=color[0],
            x=publisher_bias.index,
            y=publisher_bias['mean'],
            text=publisher_bias['publisher'],
            marker=dict(
                color=color[1]
            )
        )
    )
    
layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Distribution of publishers in a bias



In [9]:
publishers = publishers.sort_values(by='count', ascending=True) \
        .reset_index()

data = [];
for i, color in enumerate(colorscale): 
    publisher_bias = publishers.loc[publishers['bias'] == color[0]]
    data.append(
        go.Bar(
            name=color[0],
            x=publisher_bias.index,
            y=publisher_bias['count'],
            text=publisher_bias['publisher'],
            marker=dict(
                color=color[1]
            )
        )
    )
    
layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

# publishers.sort_values(by='count', ascending=False).head(20)

In [10]:
publishers = df.groupby(['publisher', 'bias', 'bias_code']) \
        .agg({'publisher':'size'}) \
        .rename(columns={'publisher':'count'}) \
        .sort_values(by='count', ascending=False) \
        .reset_index()

publishers['publisher_code'] = publishers['publisher'].astype("category").cat.codes

# color_range = cl.to_rgb(cl.interp( cl.scales['11']['div']['RdYlBu'], publishers.shape[0]))

data = [];
for i in range(publishers.shape[0]): 
    publisher_bias = publishers.loc[publishers['publisher_code'] == i]
    data.append(
        go.Bar(
            name=publisher_bias['publisher'].iloc[0],
            x=publisher_bias['bias'],
            y=publisher_bias['count'],
            text=publisher_bias['publisher'],
            textposition='inside',
            hoverinfo='none'
        )
    )

layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [11]:
publishers = df.groupby(['publisher', 'bias', 'bias_code', 'hyperpartisan']) \
        .agg({'url':'size', 'text_length':'mean'}) \
        .rename(columns={'url':'count','text_length':'mean'}) \
        .sort_values(by='count', ascending=False) \
        .reset_index()

publishers['publisher_code'] = publishers['publisher'].astype("category").cat.codes

data = [];
for i in range(publishers.shape[0]): 
    publisher_bias = publishers.loc[publishers['publisher_code'] == i]
    data.append(
        go.Bar(
            name=publisher_bias['publisher'].iloc[0],
            x=publisher_bias['hyperpartisan'],
            y=publisher_bias['count'],
            text=publisher_bias['publisher'],
            textposition='auto',
            hoverinfo='none'
        )
    )

layout = go.Layout(barmode='stack')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

### 3. Evaluate text

In [12]:
# print(randrange(publisher_1.shape[0]))?
random_article
samples = []
def init():  
    random_article = df.sample(n=1).reset_index(drop=True)
    print("Contains advertisement?")
    btn_yes = widgets.Button(description="Yes")
    btn_no = widgets.Button(description="No")
    display(btn_yes)
    display(btn_no)
    btn_yes.on_click(on_btn_yes)
    btn_no.on_click(on_btn_no)

    print(random_article.loc[0, 'id'])
    print(random_article.loc[0, 'publisher'])
    print(random_article.loc[0, 'url'])
    print(random_article.loc[0, 'textbody'])
    
def on_btn_no(b):
    clear_output()
    print("no")
    init()
    
def on_btn_yes(b):
    clear_output()
    samples.append(random_article.loc[0, 'id'])
    print("appended")
    init()
    
init()


NameError: name 'random_article' is not defined

In [None]:
publisher_3 = df_sample.loc[df_sample['text_length'] < 300].reset_index(drop=True)
for i in range(publisher_3.shape[0]):
    print(publisher_3.loc[i]['textbody'].rstrip().lstrip() + "\n")

## 3. Evaluate distribution