In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from clean_tweet import remove_url, change_characters_tweet, is_small_tweet

In [2]:
# Load the raw tweet spreadsheet into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_excel('data/twitter_fakenews_USElections_2016.xlsx')

In [3]:
# (rows, columns) of the raw dataset.
df.shape

(9001, 19)

In [4]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,is_fake_news_1,is_fake_news_2,fake_news_category_1,fake_news_category_2,tweet_id,created_at,retweet_count,text,user_screen_name,user_verified,user_friends_count,user_followers_count,user_favourites_count,tweet_source,geo_coordinates_available,num_hashtags,num_mentions,num_urls,num_media
0,False,False,0,0,258641295487156224,Wed Oct 17 18:51:04 +0000 2012,1210,@realDonaldTrump YOU SHOULD BE PRESIDENT FOR S...,GHOSTofMEATBALL,False,396,804,91,TweetDeck,0,0,2,0,0
1,False,False,0,0,261135127055327233,Wed Oct 24 16:00:40 +0000 2012,1656,I literally pose half naked for a living and u...,chrissyteigen,True,4024,3983853,20973,Twitter for iPhone,0,0,2,0,0
2,False,False,0,0,264033382076407808,Thu Nov 01 15:57:18 +0000 2012,4688,@realDonaldTrump you are full of shit!,RalphGilles,True,759,29163,3298,Twitter for iPhone,0,0,2,0,0
3,False,False,0,0,265895586660757505,Tue Nov 06 19:17:02 +0000 2012,10418,@realDonaldTrump you're fucking retarded,TimmyWait,False,839,397,506,Twitter Web Client,0,0,2,0,0
4,False,False,0,0,265895723445411841,Tue Nov 06 19:17:35 +0000 2012,1930,@realDonaldTrump You are the stupidest man on ...,mattcale52,False,1184,1350,3364,Twitter for iPhone,0,0,2,0,0


#### Number of tweets

In [5]:
# One row per tweet, so this is the total number of tweets.
len(df)

9001

In [7]:
def cleanup_tweet(tweet):
    """Return `tweet` after the two cleaning steps from `clean_tweet`.

    The text is first passed through `remove_url` and the result is then
    passed through `change_characters_tweet`; the output of that second
    helper is returned unchanged.
    """
    return change_characters_tweet(remove_url(tweet))

In [10]:
def build_dataset(df, dest_path):
    """Write one `<BOS> text <EOS>` line per entry of `df` to `dest_path`.

    Parameters
    ----------
    df : object exposing ``.tolist()`` (called with a pandas Series in this notebook)
        The texts to export. Each item is converted with ``str()``.
    dest_path : str
        Destination file. It is created or overwritten, encoded as UTF-8.

    Notes
    -----
    Every item is stripped and each whitespace character (including
    embedded newlines and tabs) is replaced by a single space, so that
    one item always occupies exactly one line of the output file.
    """
    bos_token = '<BOS>'
    eos_token = '<EOS>'

    lines = []
    for summary in df.tolist():
        summary = str(summary).strip()
        # Character-by-character replacement: runs of whitespace are NOT collapsed.
        summary = re.sub(r"\s", " ", summary)
        lines.append(bos_token + ' ' + summary + ' ' + eos_token + '\n')

    # The context manager guarantees the file is flushed and closed before
    # any later cell tries to read it back.
    with open(dest_path, 'w', encoding='utf8') as f:
        f.write(''.join(lines))

### First Fine-Tuning (with all the data, fake or not)

##### We are only interested in the text of each tweet, and we need to wrap it in `<BOS>` / `<EOS>` tokens

In [6]:
# Keep only the tweet text column; the metadata columns are not used below.
tweets_content = df["text"]

In [8]:
# Clean every tweet and keep only those that `is_small_tweet` does not reject.
# `map` is lazy, so each tweet is cleaned and then checked before the next one.
clean_tweets = pd.Series(
    [
        cleaned
        for cleaned in map(cleanup_tweet, tweets_content)
        if not is_small_tweet(cleaned)
    ]
)

In [9]:
# Number of tweets that survived cleaning and the small-tweet filter.
len(clean_tweets)

8733

##### 268 tweets have been removed (9001 → 8733)

#### Now let's build the datasets

In [11]:
# Split ratios: 90% of the tweets go to train+validation (the rest is the test
# set), then 80% of that portion is used for training (the rest for validation).
train_test_ratio, train_valid_ratio = 0.9, 0.8

df_full_train, df_test = train_test_split(
    clean_tweets,
    train_size=train_test_ratio,
    random_state=1,  # fixed seed so the split is reproducible
)
df_train, df_valid = train_test_split(
    df_full_train,
    train_size=train_valid_ratio,
    random_state=1,
)

In [15]:
# Export each split of the full corpus to its own text file.
for split_series, split_path in (
    (df_train, r'input\all_tweets\US_train_cleaned.txt'),
    (df_valid, r'input\all_tweets\US_valid_cleaned.txt'),
    (df_test, r'input\all_tweets\US_test_cleaned.txt'),
):
    build_dataset(split_series, split_path)

In [16]:
# Sanity check: print the training file that was just written.
# Read it from the same location build_dataset wrote to (not from the working
# directory), and close the handle when done.
with open(r'input\all_tweets\US_train_cleaned.txt', "r", encoding="utf8") as f:
    print(f.read())

<BOS> Tu vois ce moment où l'Amérique détruit son propre pays mais le Canada est prêt à nettoyer leur inhumanité ?… <EOS>
<BOS> Let's shout it from the rooftops, fellow Patriots...  'WHAT DID THE PRESIDENT KNOW, AND WHEN DID HE KNOW IT?'  @realDonaldTrump <EOS>
<BOS> Les USA ça va changer comme a Poudlard quand le ministère de la magie a viré Dumbledore pour mettre Dolores Ombrage #ElectionDay <EOS>
<BOS> Trump says voter fraud may be goin' on under our very noses. @realDonaldTrump <EOS>
<BOS> Throwback to the beautiful moment Barack Obama was elected 8 years ago #ElectionNight <EOS>
<BOS> So my sister told my dad she voted for Donald lmaoooo #ElectionDay <EOS>
<BOS> 'Just listen to what you heard' best advice of the night for our nation. Thank you @HillaryClinton #debatenight <EOS>
<BOS> In his 'apology,' @realDonaldTrump pretends his behavior in the leaked tapes was an anomaly. No, it is a pattern. <EOS>
<BOS> Elida worked for @realDonaldTrump for six years. She has nothing but love 

### Second Fine-Tuning (with only non-fake tweets)

##### We have two columns with a manual classification for every tweet. Each takes one of three values:
- FALSE: The tweet is not fake
- TRUE: The tweet is fake
- UNKNOWN: Impossible to classify with confidence

In [48]:
# Label distribution in the first manual classification column.
df['is_fake_news_1'].value_counts()

False      8264
True        404
UNKNOWN     333
Name: is_fake_news_1, dtype: int64

In [50]:
# Label distribution in the second manual classification column.
df['is_fake_news_2'].value_counts()

False      6897
True       1729
UNKNOWN     375
Name: is_fake_news_2, dtype: int64

##### Given the distribution of values in both columns, we will use is_fake_news_2 to separate the tweets, because its classes are better balanced

In [51]:
# Keep only the rows labelled False (not fake) in is_fake_news_2; rows labelled
# True or UNKNOWN are left out. .copy() makes the subset independent of df.
df_no_fake_tweets = df[df['is_fake_news_2']==False].copy()

In [53]:
# Row count should match the number of False labels reported by value_counts().
df_no_fake_tweets.shape

(6897, 19)

##### We are only interested in the text of the tweet

In [57]:
# Text column of the non-fake subset.
no_fake_tweets_content = df_no_fake_tweets["text"]

In [58]:
# Clean every non-fake tweet and drop the ones `is_small_tweet` flags.
# `map` is lazy, so each tweet is cleaned and then checked before the next one.
clean_no_fake_tweets = pd.Series(
    [
        cleaned
        for cleaned in map(cleanup_tweet, no_fake_tweets_content)
        if not is_small_tweet(cleaned)
    ]
)

In [59]:
# Number of non-fake tweets that survived cleaning and the small-tweet filter.
len(clean_no_fake_tweets)

6719

##### 178 tweets have been removed (6897 → 6719)

In [60]:
# Split ratios: 90% of the tweets go to train+validation (the rest is the test
# set), then 80% of that portion is used for training (the rest for validation).
train_test_ratio, train_valid_ratio = 0.9, 0.8

df_full_train, df_test = train_test_split(
    clean_no_fake_tweets,
    train_size=train_test_ratio,
    random_state=1,  # fixed seed so the split is reproducible
)
df_train, df_valid = train_test_split(
    df_full_train,
    train_size=train_valid_ratio,
    random_state=1,
)

In [69]:
# Export each split of the non-fake corpus to its own text file.
for split_series, split_path in (
    (df_train, r'input\no_fake_tweets\US_train_no_fake.txt'),
    (df_valid, r'input\no_fake_tweets\US_valid_no_fake.txt'),
    (df_test, r'input\no_fake_tweets\US_test_no_fake.txt'),
):
    build_dataset(split_series, split_path)

In [70]:
# Sanity check: print the non-fake training file that was just written.
# The context manager closes the handle once the content has been printed.
with open(r"input\no_fake_tweets\US_train_no_fake.txt", "r", encoding="utf8") as f:
    print(f.read())

<BOS> Im not defending his words. Why are you defending Bills ACTIONS and how Bill and Hillary smeared woman? <EOS>
<BOS> @realDonaldTrump QUICK POLL:  What are Trump's Tweets really? <EOS>
<BOS> Allow me to settle the @mCuban vs. @realDonaldTrump feud: Cuban would win Celebrity Apprentice; Trump would never get funded on Shark Tank. <EOS>
<BOS> '@fivestarr6028: Yes! Hubby and I voted 4 @realDonaldTrump already!  Thank you. <EOS>
<BOS> 'How stupid is our country' - @realDonaldTrump  Stupid enough to make you a presidential nominee I suppose #Debate <EOS>
<BOS> #Retweet if you have a problem with this. 🙄 I know I do, @POTUS & @HillaryClinton. #NeverHillary… <EOS>
<BOS> The idea that @realDonaldTrump will still own massive business interests as @POTUS isn’t only unethical – it puts us all at risk. <EOS>
<BOS> 'I am asking you to DREAM BIG...we are just 6 days away from the change you’ve been waiting for your entire life.'… <EOS>
<BOS> Never underestimate the power of stupid people in lar

### Third Fine-Tuning (with only fake tweets)

In [71]:
# Keep only the rows labelled True (fake) in is_fake_news_2; rows labelled
# False or UNKNOWN are left out. .copy() makes the subset independent of df.
df_fake_tweets = df[df['is_fake_news_2']==True].copy()

In [72]:
# Row count should match the number of True labels reported by value_counts().
df_fake_tweets.shape

(1729, 19)

##### We are only interested in the text of the tweet

In [73]:
# Text column of the fake subset.
fake_tweets_content = df_fake_tweets["text"]

In [74]:
# Clean every fake tweet and drop the ones `is_small_tweet` flags.
# `map` is lazy, so each tweet is cleaned and then checked before the next one.
clean_fake_tweets = pd.Series(
    [
        cleaned
        for cleaned in map(cleanup_tweet, fake_tweets_content)
        if not is_small_tweet(cleaned)
    ]
)

In [75]:
# Number of fake tweets that survived cleaning and the small-tweet filter.
len(clean_fake_tweets)

1667

##### 62 tweets have been removed

In [76]:
# Split ratios: 90% of the tweets go to train+validation (the rest is the test
# set), then 80% of that portion is used for training (the rest for validation).
train_test_ratio, train_valid_ratio = 0.9, 0.8

df_full_train, df_test = train_test_split(
    clean_fake_tweets,
    train_size=train_test_ratio,
    random_state=1,  # fixed seed so the split is reproducible
)
df_train, df_valid = train_test_split(
    df_full_train,
    train_size=train_valid_ratio,
    random_state=1,
)

In [77]:
# Export each split of the fake corpus to its own text file.
for split_series, split_path in (
    (df_train, r'input\fake_tweets\US_train_fake.txt'),
    (df_valid, r'input\fake_tweets\US_valid_fake.txt'),
    (df_test, r'input\fake_tweets\US_test_fake.txt'),
):
    build_dataset(split_series, split_path)

In [81]:
# Sanity check: print the fake-tweet training file that was just written.
# The context manager closes the handle once the content has been printed.
with open(r"input\fake_tweets\US_train_fake.txt", "r", encoding="utf8") as f:
    print(f.read())

<BOS> joyeux hunger games et puisse le sort vous être favorable #ElectionNight <EOS>
<BOS> One of @realDonaldTrump’s 1st exec actions combines 2 of his favorite things:  ✓ Silencing anyone who disagrees wit… <EOS>
<BOS> America right now... 😭 #ElectionNight <EOS>
<BOS> Question of the Day: Do you think the press owes @realdonaldtrump an apology? #Hannity <EOS>
<BOS> And Obama's ICE knew where they were & allowed them to roam freely. Thank you @realDonaldTrump for bringing back la… <EOS>
<BOS> sending $100 to everyone who retweeted if Hillary wins. go #ElectionNight <EOS>
<BOS> '@NeilTurner_: @realDonaldTrump Cruz & Rubio are scared! WATCH -> <EOS>
<BOS> BREAKING: @HillaryClinton is LEADING in polls in important battleground states Qatar, Iraq, Iran, Saudi Arabia  #RememberWhenTrump was?Never <EOS>
<BOS> Sources: 99% chance foreign intel agencies breached @HillaryClinton private server. <EOS>
<BOS> ALSO...There is another.  @realDonaldTrump bought 2nd portrait of himself w/ charity mone