# Dataset Analysis

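This notebook summarizes the input dataset of news articles labeled FAKE or REAL: the number of articles, word-count statistics, and the most frequent words in each class. It then converts the labels to binary values and writes the result to a new CSV file.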

First, we import the required packages.

In [1]:
import csv
import re
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction import text

Read the dataset into a dataframe. Remove any rows with empty text values.

In [2]:
# Load only the columns used in this notebook. A malformed line raises an
# error by default, so the `error_bad_lines=True` argument (deprecated and later
# removed from pandas) is not needed.
dataset = pd.read_csv('fake_or_real_news.csv', usecols=['id', 'text', 'label'])

# Treat empty or whitespace-only text as missing. The result is assigned back
# to the column: calling replace(..., inplace=True) on a column selection does
# not update the frame when pandas Copy-on-Write is active.
dataset['text'] = dataset['text'].replace(r'^\s*$', np.nan, regex=True)
dataset = dataset.dropna(subset=['text'])

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Total number of articles: " + str(len(dataset)))

Total number of articles: 6299


### Extract some information from the dataset.

#### Setup: Build the set of stop words to exclude from the top-word counts

In [140]:
# Tokens excluded from the top-word rankings: scikit-learn's English stop-word
# list extended with a few punctuation marks and dash variants.
extra_stop_tokens = ['.', ',', '-', '–', '—', '--']
stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_tokens)

#### Fake Articles

In [141]:
# Select the articles labeled FAKE.
fake_corpus = dataset[dataset['label'] == 'FAKE']

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Total number of fake articles: " + str(len(fake_corpus)))
print("\n")

# Words per article, splitting on whitespace (stop words are still included).
print("Number of words statistics: ")
fake_word_counts = fake_corpus['text'].apply(lambda article: len(article.split()))
print(fake_word_counts.describe())
print("\n")

print("Top 25 words: ")
# Delete ASCII punctuation with a regex. The two-argument form
# str.translate(None, chars) exists only for Python 2 byte strings; the regex
# removes the same characters on both Python 2 and Python 3.
punctuation_re = re.compile('[' + re.escape(string.punctuation) + ']')
fake_tokens = (
    fake_corpus['text']
    .apply(lambda article: punctuation_re.sub('', article))
    .str.lower()
    .str.split()
)
# Flatten the per-article token lists, skipping stop words.
fake_vocabulary = [
    token
    for article_tokens in fake_tokens
    for token in article_tokens
    if token not in stop_words
]
pd.Series(fake_vocabulary).value_counts().nlargest(25)

Total number of fake articles: 3128


Number of words statistics: 
count     3128.000000
mean       686.938299
std        961.676838
min          1.000000
25%        217.750000
50%        427.500000
75%        835.000000
max      20891.000000
Name: text, dtype: float64


Top 25 words: 


trump         5620
clinton       5579
people        5072
hillary       4074
said          3974
just          3535
new           3447
like          3182
election      2894
world         2794
time          2782
state         2657
2016          2589
government    2564
president     2421
war           2329
american      2326
states        2219
it’s          2147
years         2146
campaign      2064
media         2027
know          1937
obama         1891
donald        1830
dtype: int64

#### Real Articles

In [137]:
# Select the articles labeled REAL.
real_corpus = dataset[dataset['label'] == 'REAL']

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Total number of real articles: " + str(len(real_corpus)))
print("\n")

# Words per article, splitting on whitespace (stop words are still included).
print("Number of words statistics: ")
real_word_counts = real_corpus['text'].apply(lambda article: len(article.split()))
print(real_word_counts.describe())
print("\n")

print("Top 25 words: ")
# Delete ASCII punctuation with a regex. The two-argument form
# str.translate(None, chars) exists only for Python 2 byte strings; the regex
# removes the same characters on both Python 2 and Python 3.
punctuation_re = re.compile('[' + re.escape(string.punctuation) + ']')
real_tokens = (
    real_corpus['text']
    .apply(lambda article: punctuation_re.sub('', article))
    .str.lower()
    .str.split()
)
# Flatten the per-article token lists, skipping stop words.
real_vocabulary = [
    token
    for article_tokens in real_tokens
    for token in article_tokens
    if token not in stop_words
]
pd.Series(real_vocabulary).value_counts().nlargest(25)

Total number of real articles: 3171


Number of words statistics: 
count    3171.000000
mean      870.101230
std       719.419773
min         7.000000
25%       449.000000
50%       769.000000
75%      1119.000000
max      7599.000000
Name: text, dtype: float64


Top 25 words: 


said            16972
trump           12383
clinton          8633
people           5891
new              5695
president        5523
state            5471
campaign         5279
republican       4814
obama            4593
states           3964
just             3793
like             3778
time             3556
party            3488
sanders          3461
house            3407
political        3292
voters           3107
republicans      3099
presidential     3031
percent          3020
democratic       2996
going            2919
told             2839
dtype: int64

### Convert the labels to binary

In [3]:
# Encode the class labels as integers: FAKE -> 1, REAL -> 0.
label = {'FAKE': 1, 'REAL': 0}

# Assign the result back to the column: calling replace(..., inplace=True) on
# a column selection does not update the frame when pandas Copy-on-Write is
# active. Values that are not keys of `label` are left unchanged.
dataset['label'] = dataset['label'].replace(label)

# Preview the first rows only instead of rendering the whole frame.
dataset.head(10)

Unnamed: 0,id,text,label
0,8476,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,It's primary day in New York and front-runners...,0
5,6903,"\nI’m not an immigrant, but my grandparents ...",1
6,7341,"Share This Baylee Luciani (left), Screenshot o...",1
7,95,A Czech stockbroker who saved more than 650 Je...,0
8,4869,Hillary Clinton and Donald Trump made some ina...,0
9,2909,Iranian negotiators reportedly have made a las...,0


### Write to file

In [4]:
# Rename the label column to "class" (index labels are cast to str as well),
# then save the id, text and class columns to CSV without the index.
output_columns = ['id', 'text', 'class']
dataset = dataset.rename(columns={'label': 'class'}, index=str)
dataset.to_csv('gm_fake_or_real.csv', columns=output_columns, index=False)