# Dataset Analysis

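This notebook summarizes the input dataset (article counts, word-count statistics, and the most frequent words for each class) and writes class-balanced copies of the data to new CSV files.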

First, we import the required packages.

In [2]:
import csv
import re
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction import text

Read the dataset into a dataframe. Remove any rows with empty text values.

In [3]:
# Load only the three columns this notebook uses. `error_bad_lines` is not
# passed: True was its default, and the keyword no longer exists in current
# pandas, so spelling it out only makes the cell fail on newer versions.
dataset = pd.read_csv('fake_or_real_news.csv', usecols=['id', 'text', 'label'])

# A text consisting of exactly one space is treated as empty: map it to NaN so
# that dropna() removes the row. The result is assigned back instead of calling
# replace(..., inplace=True) on the selected column, which has no effect on
# `dataset` whenever the selection is a copy.
dataset['text'] = dataset['text'].replace(' ', np.nan)
dataset = dataset.dropna(subset=['text'])

print("Total number of articles: " + str(len(dataset)))

Total number of articles: 6299


### Extract some information from the dataset.

#### Setup: Get the stop words to exclude when extracting the most frequent words

In [4]:
# scikit-learn's English stop-word list, extended with a few punctuation and
# dash tokens that should not be counted as words.
stop_words = text.ENGLISH_STOP_WORDS | {'.', ',', '-', '–', '—', '--'}

#### Fake Articles

In [5]:
# Rows labelled FAKE.
fake_corpus = dataset[dataset['label'] == 'FAKE']

print("Total number of fake articles: " + str(len(fake_corpus)))
print("\n")

# Per-article word counts, where a word is any whitespace-delimited token.
print("Number of words statistics: ")
word_counts = fake_corpus['text'].apply(lambda article: len(article.split()))
print(word_counts.describe())
print("\n")

print("Top 25 words: ")
# Remove the characters in string.punctuation with a regex. The two-argument
# str.translate(None, chars) form only exists for Python 2 byte strings and
# raises TypeError on unicode text; the regex deletes the same characters for
# both string types.
punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
words = fake_corpus['text'].apply(lambda article: punctuation_re.sub('', article))
# Lower-case, tokenize on whitespace, then drop stop words.
words = words.str.lower().str.split()
words = words.apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])
# Flatten the per-article token lists into one list for counting.
flat = []
for article_tokens in words:
    flat.extend(article_tokens)
pd.Series(flat).value_counts().nlargest(25)

Total number of fake articles: 3128


Number of words statistics: 
count     3128.000000
mean       686.938299
std        961.676838
min          1.000000
25%        217.750000
50%        427.500000
75%        835.000000
max      20891.000000
Name: text, dtype: float64


Top 25 words: 


trump         5620
clinton       5579
people        5072
hillary       4074
said          3974
just          3535
new           3447
like          3182
election      2894
world         2794
time          2782
state         2657
2016          2589
government    2564
president     2421
war           2329
american      2326
states        2219
it’s          2147
years         2146
campaign      2064
media         2027
know          1937
obama         1891
donald        1830
dtype: int64

#### Real Articles

In [6]:
# Rows labelled REAL.
real_corpus = dataset[dataset['label'] == 'REAL']

print("Total number of real articles: " + str(len(real_corpus)))
print("\n")

# Per-article word counts, where a word is any whitespace-delimited token.
print("Number of words statistics: ")
word_counts = real_corpus['text'].apply(lambda article: len(article.split()))
print(word_counts.describe())
print("\n")

print("Top 25 words: ")
# Remove the characters in string.punctuation with a regex. The two-argument
# str.translate(None, chars) form only exists for Python 2 byte strings and
# raises TypeError on unicode text; the regex deletes the same characters for
# both string types.
punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
words = real_corpus['text'].apply(lambda article: punctuation_re.sub('', article))
# Lower-case, tokenize on whitespace, then drop stop words.
words = words.str.lower().str.split()
words = words.apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])
# Flatten the per-article token lists into one list for counting.
flat = []
for article_tokens in words:
    flat.extend(article_tokens)
pd.Series(flat).value_counts().nlargest(25)

Total number of real articles: 3171


Number of words statistics: 
count    3171.000000
mean      870.101230
std       719.419773
min         7.000000
25%       449.000000
50%       769.000000
75%      1119.000000
max      7599.000000
Name: text, dtype: float64


Top 25 words: 


said            16972
trump           12383
clinton          8633
people           5891
new              5695
president        5523
state            5471
campaign         5279
republican       4814
obama            4593
states           3964
just             3793
like             3778
time             3556
party            3488
sanders          3461
house            3407
political        3292
voters           3107
republicans      3099
presidential     3031
percent          3020
democratic       2996
going            2919
told             2839
dtype: int64

### Equalize number of samples for each class

Randomly remove articles from the class with more examples so that both classes end up with the same number of articles.

In [8]:
# Fixed seed so the balanced subset and its row order are the same on every run.
RANDOM_SEED = 42

# Draw the same number of rows from each class. Sampling (instead of slicing
# the first N rows) makes the removal random, and taking the minimum of both
# sizes means it does not matter which class is the larger one.
n_per_class = min(len(fake_corpus), len(real_corpus))
combined = [
    fake_corpus.sample(n=n_per_class, random_state=RANDOM_SEED),
    real_corpus.sample(n=n_per_class, random_state=RANDOM_SEED),
]

# Concatenate the two classes, then shuffle the rows and renumber the index.
final = pd.concat(combined)
final = final.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
print("Total number of articles (final): " + str(len(final)))

Total number of articles (final): 6256


### Convert the labels to binary

In [9]:
# Mapping from the text labels to the binary encoding (FAKE -> 1, REAL -> 0).
label = {'FAKE': 1, 'REAL': 0}

# Assign the replaced column back rather than calling replace(..., inplace=True)
# on final["label"]: the in-place call on a selected column does not update
# `final` when that selection is a copy.
final['label'] = final['label'].replace(label)

# Show a preview instead of rendering the whole frame.
final.head(10)

Unnamed: 0,id,text,label
0,8230,Report Copyright Violation OFFICIAL ONLINE NAT...,1
1,2169,"Back in the early days of the Great Recession,...",0
2,4752,The first and only vice-presidential debate of...,0
3,3963,"Washington (CNN) The cool, calm, clear thinkin...",0
4,2125,President Obama earned a double-barreled rebuk...,0
5,5990,Urban Population Booms Will Make Climate Chang...,1
6,6051,Get short URL 0 3 0 0 Buddy Blackwell asked hi...,1
7,8552,Can nuclear war break out on the Korean Penins...,1
8,3566,BEIRUT (AP) — Islamic State militants have acc...,0
9,2261,I’m not among those Republicans who have “evol...,0


### Write to file

In [10]:
# Convert the index labels to strings and expose the label column as "class".
final = final.rename(index=str, columns={'label': 'class'})

# Write only the id, text and class columns, without the index.
final[['id', 'text', 'class']].to_csv('gm_fake_or_real_equal.csv', index=False)

### Equalize LIWC data

In [11]:
# Load the LIWC feature table. This rebinds `dataset`, so from here on the
# name no longer refers to the article text data loaded earlier.
dataset = pd.read_csv('gm_test_liwc_all.csv')

In [13]:
# Preview the first rows only; rendering the whole table adds bulk to the
# notebook without adding information.
dataset.head(10)

Unnamed: 0,id,tone,sixLtr,clout,wps,analytic,wc,dic,authentic,family,...,Apostro,i,posemo,ingest,motion,swear,Comma,time,reward,class
0,8476,1.346360,0.233796,65.315500,14.727273,82.093610,1296,0.794753,13.949903,0.000000,...,0.024691,0.000000,0.016204,0.001543,0.023148,0.000772,0.013117,0.043210,0.006944,1
1,10294,37.486140,0.222958,75.322710,18.120000,89.013950,453,0.754967,8.608158,0.002208,...,0.017660,0.000000,0.024283,0.002208,0.022075,0.002208,0.041943,0.061810,0.008830,1
2,3608,22.181538,0.238979,69.515750,21.550000,93.690840,431,0.779582,30.056124,0.002320,...,0.013921,0.004640,0.013921,0.000000,0.025522,0.000000,0.044084,0.076566,0.006961,0
3,10142,34.262894,0.333333,66.166664,22.666666,97.627144,408,0.617647,7.835681,0.000000,...,0.034314,0.000000,0.024510,0.000000,0.012255,0.004902,0.068627,0.056373,0.004902,1
4,875,85.501690,0.214286,69.031944,15.333333,92.522575,322,0.736025,19.493101,0.000000,...,0.040373,0.003106,0.052795,0.000000,0.027950,0.000000,0.052795,0.062112,0.031056,0
5,6903,18.258707,0.215837,50.851425,22.805826,81.970260,2349,0.833120,68.950270,0.004683,...,0.008514,0.065560,0.022137,0.001703,0.011920,0.000851,0.075777,0.049383,0.005960,1
6,7341,13.573038,0.197842,84.742380,30.888890,84.491560,556,0.874101,27.689127,0.005396,...,0.025180,0.012590,0.016187,0.000000,0.026978,0.000000,0.059353,0.048561,0.023381,1
7,95,25.774195,0.308943,79.194670,24.600000,99.000000,123,0.658537,45.520530,0.000000,...,0.016260,0.000000,0.000000,0.000000,0.048780,0.000000,0.048780,0.073171,0.000000,0
8,4869,29.309557,0.229758,62.792946,22.046728,92.707070,2359,0.763459,21.974830,0.000000,...,0.023739,0.002967,0.028826,0.001272,0.019924,0.000000,0.050869,0.066978,0.004239,0
9,2909,28.081482,0.259722,52.777780,21.176470,97.917244,720,0.754167,18.830828,0.000000,...,0.006944,0.000000,0.018056,0.000000,0.019444,0.000000,0.040278,0.073611,0.001389,0


In [15]:
# Split the LIWC rows on the "class" column: 1 goes to fake_corpus and
# 0 to real_corpus.
fake_corpus = dataset.loc[dataset['class'] == 1]
real_corpus = dataset.loc[dataset['class'] == 0]

print("Total number of real articles: " + str(len(real_corpus)))
print("Total number of fake articles: " + str(len(fake_corpus)))

Total number of real articles: 3171
Total number of fake articles: 3123


In [16]:
# Fixed seed so the balanced subset and its row order are the same on every run.
RANDOM_SEED = 42

# Draw the same number of rows from each class. Sampling (instead of slicing
# the first N rows) makes the removal random, and taking the minimum of both
# sizes means it does not matter which class is the larger one.
n_per_class = min(len(fake_corpus), len(real_corpus))
combined = [
    fake_corpus.sample(n=n_per_class, random_state=RANDOM_SEED),
    real_corpus.sample(n=n_per_class, random_state=RANDOM_SEED),
]

# Concatenate the two classes, then shuffle the rows and renumber the index.
final = pd.concat(combined)
final = final.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
print("Total number of articles (final): " + str(len(final)))

Total number of articles (final): 6246


In [18]:
# Convert the index labels to strings. The column mapping only has an effect
# if the frame has a column named "label".
# NOTE(review): the LIWC table is already filtered on a "class" column above,
# so this rename may be a no-op - confirm before removing it.
final = final.rename(index=str, columns={'label': 'class'})

# Write every column of the balanced LIWC table, without the index.
final.to_csv('gm_test_liwc_all_equal.csv', index=False)