### Install fastText 

In [17]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np
import fasttext

### Download dataset

In [88]:
!wget https://s3.amazonaws.com/assets.datacamp.com/blog_assets/fake_or_real_news.csv

--2019-08-16 15:57:59--  https://s3.amazonaws.com/assets.datacamp.com/blog_assets/fake_or_real_news.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.138.101
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.138.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30696129 (29M) [text/csv]
Saving to: ‘fake_or_real_news.csv’


2019-08-16 16:01:02 (165 KB/s) - ‘fake_or_real_news.csv’ saved [30696129/30696129]



### Read data

In [89]:
df = pd.read_csv('fake_or_real_news.csv')
df = df[['title', 'text', 'label']]

In [90]:
df.shape

(6335, 3)

In [91]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Split data into train and test set

In [92]:
df_train, df_test = train_test_split(df[['title', 'text', 'label']], test_size=0.33, random_state=42)

### Save data in fastText format

In [93]:
def save_in_fastText_format(filename, data_frame):
    f = open(filename + ".txt", "w")
    for text, label in zip(data_frame.text, data_frame.label):
        f.writelines("__label__" + label + " " + text + "\n")
    f.close()

In [94]:
save_in_fastText_format("train", df_train)
save_in_fastText_format("test", df_test)

### Training model

In [95]:
model = fasttext.train_supervised('train.txt')

In [96]:
print(model.labels)

['__label__REAL', '__label__FAKE']


### Test model

In [97]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

print_results(*model.test('test.txt'))

N	2091
P@1	0.891
R@1	0.891
