In [1]:
import pandas as pd
# Lift pandas' display truncation so frames are rendered with every row and column.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
import torchtext

In [3]:
# Locations of the two raw CSV files, relative to the notebook's directory.
fake_fp, real_fp = '../data/Fake.csv', '../data/True.csv'

In [4]:
# Read the fake-news CSV into a DataFrame and preview its first five rows.
fake = pd.read_csv(filepath_or_buffer=fake_fp)
fake.head(5)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
# Size check: (rows, columns) of the fake-news frame.
fake.shape

(23481, 4)

In [7]:
# Keep only the rows of `fake` whose title contains this headline text.
fake.loc[fake['title'].str.contains('Donald Trump Sends Out Embarrassing New Year’s Eve Message')]

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"


In [8]:
# Share of fake articles per subject (fractions instead of raw counts).
fake['subject'].value_counts(normalize=True)

News               0.385418
politics           0.291342
left-news          0.189898
Government News    0.066863
US_News            0.033346
Middle-east        0.033133
Name: subject, dtype: float64

In [9]:
# Same (rows, columns) check on `fake`, repeated after the exploratory cells.
fake.shape

(23481, 4)

In [10]:
# Read the real-news CSV into a DataFrame and preview its first five rows.
real = pd.read_csv(filepath_or_buffer=real_fp)
real.head(5)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [11]:
# Size check: (rows, columns) of the real-news frame.
real.shape

(21417, 4)

In [12]:
# Tag each source frame with its class before merging them:
# 0 marks rows from the fake file, 1 marks rows from the real file.
fake['label'] = 0
real['label'] = 1

In [13]:
# Stack fake on top of real and renumber the rows 0..n-1, then preview the result.
df = pd.concat(objs=[fake, real], ignore_index=True)
df.head(5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [14]:
# Size check: (rows, columns) of the combined frame.
df.shape

(44898, 5)

In [15]:
# Class balance of the combined frame, as fractions per label.
df['label'].value_counts(normalize=True)

0    0.522985
1    0.477015
Name: label, dtype: float64

In [16]:
# Narrow the frame to the article body and its label, then confirm the new shape.
df = df.loc[:, ['text', 'label']]
df.shape

(44898, 2)

In [17]:
# Character count of every article body, plus its mean to two decimals.
text_lengths = df['text'].str.len()
print(f'average text length: {text_lengths.mean():0.2f}')

average text length: 2469.11


In [18]:
# Summary statistics (count, mean, std, quartiles, min/max) of the character lengths.
text_lengths.describe()

count    44898.000000
mean      2469.109693
std       2171.617091
min          1.000000
25%       1234.000000
50%       2186.000000
75%       3105.000000
max      51794.000000
Name: text, dtype: float64

In [18]:
# Word count per article: split on whitespace, count the tokens,
# then summarise the distribution.
df['text'].str.split().str.len().describe()

count    44898.000000
mean       405.282284
std        351.265595
min          0.000000
25%        203.000000
50%        362.000000
75%        513.000000
max       8135.000000
Name: text, dtype: float64

In [22]:
# Largest index label in the combined frame.
df.index.max()

44897

In [25]:
# Print 10 randomly chosen article bodies to eyeball the raw text.
# `.values[...]` indexes by position, so positions are drawn from [0, len(df)).
# np.random.randint excludes `high`; the previous bound of df.index.max()
# meant the last row could never be selected.
rand_idxs = np.random.randint(0, len(df), size=10)
for txt in df.text.values[rand_idxs]:
    print(txt, '\n')

MEXICO CITY (Reuters) - For a country still in mourning and counting its dead from Tuesday s devastating earthquake, Frida the Navy rescue dog has emerged as a source of inspiration and pride in the search for survivors. Known for her custom-made doggy goggles and boots, the 7-year-old yellow Labrador has become a social media star, appearing on one man s tattoo and the unofficial image of a 500-peso note. Some posts have wildly exaggerated her feats. In her career, Frida has located 12 people alive beneath the rubble plus more than 40 others dead, more than any other Mexican rescue dog, the Navy said. She has put her nose to work in disasters such as an earthquake in Ecuador last year, another one in southern Mexico two weeks ago, a landslide in Guatemala in 2015 and a Mexico City gas explosion in 2013. Her credits for the most recent disaster has not been tallied yet, said Navy Captain Armando Segura, because she is still busy trying to save lives. The 7.1-magnitude quake has killed 

In [14]:
from sklearn.model_selection import train_test_split

# Features are the raw article text; the target is the 0/1 label column.
X, y = df['text'], df['label']
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Number of training texts produced by the split.
X_train.shape

(35918,)

In [16]:
# Number of held-out test texts produced by the split.
X_test.shape

(8980,)

In [17]:
# Put the training texts and their labels side by side in one frame, then preview it.
train = pd.concat([X_train, y_train], axis='columns')
train.head(5)

Unnamed: 0,text,label
36335,ATHENS (Reuters) - Turkish President Tayyip Er...,1
12384,"Ted Cruz would be fair, honest and most of all...",0
24419,WASHINGTON (Reuters) - White House Chief of St...,1
24740,DUBAI (Reuters) - Saudi Arabia welcomed the ne...,1
27039,"SIGONELLA, Italy (Reuters) - U.S. President Do...",1


In [18]:
# Put the held-out texts and their labels side by side in one frame, then preview it.
test = pd.concat([X_test, y_test], axis='columns')
test.head(5)

Unnamed: 0,text,label
22216,"21st Century Wire says Ben Stein, reputable pr...",0
27917,WASHINGTON (Reuters) - U.S. President Donald T...,1
25007,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
1377,"On Monday, Donald Trump once again embarrassed...",0
32476,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [20]:
# Class balance within the training split, as fractions per label.
train['label'].value_counts(normalize=True)

0    0.521967
1    0.478033
Name: label, dtype: float64

In [21]:
# Class balance within the test split, as fractions per label.
test['label'].value_counts(normalize=True)

0    0.52706
1    0.47294
Name: label, dtype: float64

In [22]:
# Disabled export of the train/test frames to CSV (presumably commented out so a
# re-run does not overwrite the saved files — TODO confirm).
# NOTE(review): to_csv is called here without index=False, so the row index would
# be written as an extra leading column; pass index=False if only text/label
# should be stored.
# train_fp = '../data/train.csv'
# test_fp = '../data/test.csv'
# train.to_csv(train_fp)
# test.to_csv(test_fp)

In [9]:
# Reload the previously saved split from disk (train first, then test).
# NOTE(review): the shapes printed below show 3 columns, presumably because the
# CSVs also contain the saved row index — confirm before relying on column positions.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [10]:
# (rows, columns) of the test frame as read back from CSV.
test.shape

(8980, 3)

In [11]:
# (rows, columns) of the training frame as read back from CSV.
train.shape

(35918, 3)