In [2]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stop_words = list(ENGLISH_STOP_WORDS)

import pprint
import re
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [3]:
# Load both CSVs from the working directory.
# df: fake_or_real_news.csv, with its first column used as the index.
# kaggle: fake.csv, read with the default integer index.
df= pd.read_csv('fake_or_real_news.csv', index_col=0)
kaggle= pd.read_csv('fake.csv')

In [4]:
df.shape, kaggle.shape

((6335, 3), (12999, 20))

In [None]:
#Cleaning for analysis

In [31]:
for i in kaggle['published'][:10]:
    print(i[11:16])

21:41
08:47
01:41
05:22
21:56
16:31
19:40
01:19
23:54
02:43


In [32]:
kaggle['published'].sample(2)

2643     2016-11-09T13:48:46.757+02:00
10320    2016-11-03T21:25:00.000+02:00
Name: published, dtype: object

In [51]:
# NOTE: get_time is defined in a later cell of this notebook; on a fresh kernel
# that cell has to run before this one. Only the HH:MM text is compared, the
# UTC offset in the timestamp is not taken into account.
get_time(kaggle['published'][2643])>get_time(kaggle['published'][10320])

False

In [49]:
get_time(kaggle['published'][10320])

datetime.time(21, 25)

'13:48'

In [43]:
# NOTE(review): scratch cell — reads a `time` string that no visible cell assigns
# (presumably an 'HH:MM' slice such as '13:48'; verify) and rebinds the same name
# to the parsed datetime.time.
time=datetime.strptime(time, '%H:%M').time()

In [45]:
# NOTE(review): scratch cell — the recorded output (a datetime.time) implies `time`
# was a datetime.datetime when this ran. After the cell above, `time` is already a
# datetime.time, which has no .time() method, so this fails when run in order.
time.time()

datetime.time(13, 48)

In [46]:
def get_time(time):
    '''Return the clock time (hour and minute) of a ``published`` timestamp.

    Characters 11-15 of the string (the ``HH:MM`` part of a value such as
    ``2016-11-09T13:48:46.757+02:00``) are parsed into a ``datetime.time``.
    Seconds and the UTC offset are not read.
    '''
    hour_minute = time[11:16]
    return datetime.strptime(hour_minute, '%H:%M').time()

In [None]:
[]

In [52]:
kaggle.sample(3)

Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
8575,01b407046f36ececa2279603595bdf9105a31e67,0,Gillian,2016-11-11T20:27:36.711+02:00,The Art of Jumping Timelines,Leave a reply Tom Kenyon \nThe Hathors – Altho...,english,2016-11-11T20:27:36.711+02:00,shiftfrequency.com,US,,The Art of Jumping Timelines,0.0,http://www.shiftfrequency.com/wp-content/uploa...,0,1,0,0,0,bs
8082,4457f1f51bf1f520d045e472247782fc5d927aa9,0,admin,2016-10-27T14:20:06.260+03:00,Jews ‘blamed for Holocaust’ at House of Lords ...,Jews ‘blamed for Holocaust’ at House of Lords ...,english,2016-10-27T14:20:06.260+03:00,rinf.com,US,,Jews ‘blamed for Holocaust’ at House of Lords ...,0.0,http://rinf.com/alt-news/wp-content/uploads/20...,0,1,0,0,0,bs
4601,3b83ca9258f3963db7613986384601947a14e15e,0,Mr. Wendal,2016-10-27T21:50:31.139+03:00,Homeless Trump Supporter Posts As GUARD For Tr...,0 comments \nWOW! Liberals can’t even let a do...,english,2016-10-27T21:50:31.139+03:00,ihavethetruth.com,US,67400.0,Homeless Trump Supporter Posts As GUARD For Tr...,0.0,https://ihavethetruth.com/wp-content/uploads/2...,0,1,0,0,0,bs


In [5]:
# NOTE: `.unique` is referenced but never called, so this cell displays the bound
# method's repr (which happens to embed the value_counts() Series) rather than a
# computed result.
kaggle['type'].value_counts().unique

<bound method Series.unique of bs            11492
bias            443
conspiracy      430
hate            246
satire          146
state           121
junksci         102
fake             19
Name: type, dtype: int64>

In [None]:
# Rename the target column 'type' -> 'label'. rename() keeps the column in place
# and is a no-op when the cell is re-run, whereas copy-then-drop raised a
# KeyError on the second execution.
kaggle = kaggle.rename(columns={"type": "label"})

In [None]:
# Integer-encode the Kaggle labels; le1 is kept so codes can be mapped back to names.
# NOTE: the codes reflect `label` as it is at this point. A later cell folds
# 'fake' into 'bs' without recomputing numerical_label.
le1 = LabelEncoder()
kaggle['numerical_label']=le1.fit_transform(kaggle['label'])
# Replace the index with a fresh 0..n-1 range (the old index is discarded).
kaggle = kaggle.reset_index(drop=True)

In [None]:
kaggle.sample(3)

In [None]:
df.sample(7)

In [None]:
# Class balance of the labels (the stray, uncalled `.unique` attribute was dropped).
df['label'].value_counts()

In [None]:
# Integer-encode df's labels; le2 is kept so codes can be mapped back to names.
le2 = LabelEncoder()
df['binary_label']=le2.fit_transform(df['label'])
# df was loaded with index_col=0; replace that index with a fresh 0..n-1 range.
df = df.reset_index(drop=True)

In [None]:
def cleaner(text):
    """Normalize raw article text into lowercase alphanumeric tokens.

    Steps, in order:
      1. Replace common punctuation with a space so the words around it stay apart.
      2. Lowercase, then delete every character that is not a-z, 0-9 or whitespace.
      3. Collapse each run of whitespace into a single space.

    Parameters
    ----------
    text : str or any
        The text to clean. Non-string values (e.g. NaN for a missing article)
        are returned unchanged so the function can be used with ``Series.apply``
        on columns that contain missing values.

    Returns
    -------
    str or the original non-string value
    """
    # Explicit pass-through replaces the former bare `except: pass`, which hid
    # every error rather than just the "not a string" case.
    if not isinstance(text, str):
        return text

    # Raw strings: the patterns contain backslash escapes (\/, \^, \*, \+, \s)
    # that are invalid escape sequences in ordinary string literals.
    text = re.sub(r'[.,\/#!$%\^&\*;:{}\+=<>_`~()]', ' ', text)
    # Keep whitespace (\s) here: the previous pattern only kept the space
    # character, so newlines/tabs were deleted and adjacent words were glued
    # together before the whitespace collapse below could separate them.
    text = re.sub(r'[^a-z0-9\s]', '', text.lower())
    text = re.sub(r'\s+', ' ', text)
    return text

In [None]:
df["text"]=df["text"].apply(cleaner)

In [None]:
df.sample(7)

### Ideas to drive from EDA:
* Look at global token counts
  * Ngrams (individually 2,3,4,5,6.  Then Range 2,6)
* Look at subsets for labels (+ ngrams)
* Look at New datasets (kaggle)
* Look at specific time periods (do some research first)
* Look at what is distinct between classes / labels
* Make this modular (consider OO python)
  * Swap out vectorizers
* Start plotting everything

In [None]:

# stop_words.extend(["number", "like", "just"]) #after round one
# stop_words.extend(["donald","trump", "hillary", "clinton", "obama", "number", "like"])
#"united", "states", "white", "house"

In [None]:
# Corpus-specific stop words (duplicate "number"/"like" entries removed).
my_stops = ["donald", "trump", "clinton", "obama", "number", "like", "just"]


In [None]:
class TokenAnalysis(object):
    """Vectorize a labeled text corpus and explore its most frequent tokens.

    The constructor fits a vectorizer on ``df["text"]`` and stores the result as
    a dense token table ``X_text`` (one row per document, one column per token),
    indexed by ``df["label"]`` so rows can be filtered or grouped by label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"text"`` and a ``"label"`` column.
    Vectorizer : class, optional
        Vectorizer class (not instance) accepting ``stop_words``, ``max_df`` and
        ``ngram_range`` keyword arguments. Defaults to ``CountVectorizer``.
    max_df : float or int, optional
        Passed to the vectorizer; defaults to ``DEFAULT_MAX_DF``.
    n_gram_range : tuple, optional
        Passed to the vectorizer as ``ngram_range`` when given.
    stop_words : list of str, optional
        Stop words; defaults to scikit-learn's ``ENGLISH_STOP_WORDS``.

    Notes
    -----
    ``CountVectorizer`` and ``ENGLISH_STOP_WORDS`` are taken from the notebook's
    top-level imports. ``X_text`` is built with ``toarray()``, so memory grows
    with documents x vocabulary size.
    """

    # Document-frequency ceiling used when ``max_df`` is not supplied.
    DEFAULT_MAX_DF = 0.7
    # Number of tokens returned/plotted when ``num`` is not supplied.
    DEFAULT_NUM = 10

    def __init__(self, df, Vectorizer=None, max_df=None, n_gram_range=None, stop_words=None):
        self.text = df["text"]
        self.label = df["label"]

        self.max_df = max_df
        self.ngram_range = n_gram_range
        self.vec = Vectorizer

        if stop_words is None or (isinstance(stop_words, str) and stop_words == "english"):
            self.stop_words = list(ENGLISH_STOP_WORDS)
        else:
            # Copy so add_stops() never mutates the list the caller passed in.
            self.stop_words = list(stop_words)

        self._fit()

    # ------------------------------------------------------------------ helpers
    def _build_vectorizer(self):
        """Instantiate the vectorizer from the stored settings.

        ``max_df`` and ``ngram_range`` are honoured for every vectorizer class
        (previously ``max_df`` was ignored for custom classes and
        ``n_gram_range`` was ignored for the default ``CountVectorizer``).
        """
        vectorizer_cls = CountVectorizer if self.vec is None else self.vec
        params = {
            "stop_words": self.stop_words,
            "max_df": self.DEFAULT_MAX_DF if self.max_df is None else self.max_df,
        }
        if self.ngram_range is not None:
            params["ngram_range"] = self.ngram_range
        return vectorizer_cls(**params)

    def _feature_names(self):
        """Return the fitted vocabulary, across scikit-learn versions."""
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only for old installations.
        getter = getattr(self.vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return getter()

    def _fit(self):
        """(Re)build the vectorizer and the label-indexed token table."""
        self.vectorizer = self._build_vectorizer()
        self.X = self.vectorizer.fit_transform(self.text)
        self.X_text = pd.DataFrame(self.X.toarray(), columns=self._feature_names())
        self.X_text.index = self.label

    def _label_peak(self, label):
        """Return the largest per-token total among documents with ``label``."""
        top = self.top_tokens(1, label)
        return top.iloc[0] if len(top) else 0

    def _plot_label_panels(self, labeled_df, nrows, ncols, figsize, exclude_from_scale=()):
        """Draw one bar chart per label (column of ``labeled_df``).

        Labels not listed in ``exclude_from_scale`` share a y-axis limit set
        10% above the highest per-label token total; excluded labels keep
        matplotlib's automatic limit.
        """
        labels = list(labeled_df.columns)
        if not labels:
            return

        if isinstance(exclude_from_scale, str):
            exclude_from_scale = (exclude_from_scale,)
        excluded = set(exclude_from_scale)

        ylim = max((self._label_peak(lab) for lab in labels if lab not in excluded), default=0)

        # squeeze=False always yields a 2-D array of axes, so a single label
        # no longer breaks the indexing.
        fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
        for ax, label in zip(axes.ravel(), labels):
            labeled_df[label].plot(kind="bar", ax=ax)
            ax.set_xticklabels(labeled_df.index, rotation=30)
            ax.set_title("Freq of top tokens in {} Articles".format(label))
            if label not in excluded and ylim > 0:
                ax.set_ylim([0, ylim * 1.10])

        plt.show()

    # --------------------------------------------------------------- public API
    def top_tokens(self, num=None, label=None):
        """Return the ``num`` highest-scoring tokens as a descending Series.

        Parameters
        ----------
        num : int, optional
            Number of tokens to return (default ``DEFAULT_NUM``).
        label : optional
            When given, only documents with this label are summed; otherwise
            the whole corpus is used.
        """
        if num is None:
            num = self.DEFAULT_NUM
        rows = self.X_text if label is None else self.X_text[self.X_text.index == label]
        return rows.sum().sort_values(ascending=False).iloc[:num]

    def top_tokens_label_freq(self, num=None):
        """Return per-label totals of the global top ``num`` tokens.

        The result has one row per label and one column per token, with the
        columns ordered by global rank.
        """
        top = self.top_tokens(num)
        return self.X_text.groupby(self.X_text.index)[list(top.index)].sum()

    def plot_top_tokens(self, num=None, label=None):
        """Bar-plot the top tokens globally, or for one ``label`` when given."""
        top = self.top_tokens(num, label)
        title_label = "GLOBAL" if label is None else label

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.barplot(x=top.index, y=top, ax=ax)
        ax.set_title("Top {} {} tokens".format(len(top), title_label))
        ax.set_xticklabels(top.index, rotation=30)
        plt.show()

    def plot_top_tokens_by_binary_label(self, num=None):
        """Plot the global top tokens side by side, one panel per label.

        All panels share the same y-axis limit so the labels are comparable.
        """
        labeled_df = self.top_tokens_label_freq(num).T
        self._plot_label_panels(
            labeled_df, nrows=1, ncols=max(len(labeled_df.columns), 1), figsize=(15, 8)
        )

    def plot_top_tokens_by_label(self, num=None, exclude_from_scale=("bs",), figsize=(10, 75)):
        """Plot the global top tokens in stacked panels, one per label.

        Parameters
        ----------
        num : int, optional
            Number of global top tokens to show (default ``DEFAULT_NUM``).
        exclude_from_scale : iterable of labels, optional
            Labels left out of the shared y-axis limit and drawn with their own
            automatic scale. Defaults to ``("bs",)``, the label this notebook
            previously hard-coded.
        figsize : tuple, optional
            Size of the whole figure.
        """
        labeled_df = self.top_tokens_label_freq(num).T
        self._plot_label_panels(
            labeled_df,
            nrows=max(len(labeled_df.columns), 1),
            ncols=1,
            figsize=figsize,
            exclude_from_scale=exclude_from_scale,
        )

    def add_stops(self, words):
        """Add one stop word (str) or several (list/tuple/set) and refit."""
        if isinstance(words, (list, tuple, set, frozenset)):
            self.stop_words.extend(words)
        else:
            self.stop_words.append(words)

        self._fit()
        
        
        
        

In [None]:
#### Kaggle token analysis

In [None]:
# Class balance of the Kaggle labels (the stray, uncalled `.unique` attribute was dropped).
kaggle['label'].value_counts()

In [None]:
kaggle['text'].isnull().sum()

In [None]:
def get_date(date):
    '''Parse the leading ``YYYY-MM-DD`` of a timestamp string.

    Only the first ten characters are read; the result is a ``datetime`` at
    midnight of that day.
    '''
    return datetime.strptime(date[:10], '%Y-%m-%d')
    

In [None]:
# Parse the calendar date (first 10 characters, YYYY-MM-DD) of every 'published'
# stamp in one vectorized call; time of day and UTC offset are dropped, so each
# value is midnight of the publication day. Missing stamps become NaT.
kaggle["date"] = pd.to_datetime(kaggle["published"].str[:10], format="%Y-%m-%d")
kaggle["month"] = kaggle["date"].dt.month
kaggle["day"] = kaggle["date"].dt.day
kaggle["year"] = kaggle["date"].dt.year

In [None]:
#clean text
kaggle["text"]=kaggle["text"].apply(cleaner)
kaggle["text"] = kaggle["text"].fillna('')

In [None]:
#add fake to bs
kaggle.loc[kaggle['label'] == "fake", 'label'] = "bs"

In [None]:
# Class balance after folding 'fake' into 'bs' (the stray, uncalled `.unique` was dropped).
kaggle['label'].value_counts()

In [None]:
# Fixed seed so the 1000-row subsample is the same on every Restart & Run All.
kag_sub = kaggle.sample(1000, random_state=42)

In [None]:
# Show both ends of the range in the same format (previously a tuple mixing a
# str and a Timestamp repr was printed).
print("Date Range: ", kaggle["date"].min(), "to", kaggle["date"].max())

In [None]:
#feature engineering: make social reach score from social reach cols
social_reach_cols=["replies_count", 'participants_count', 'likes', 'comments','shares']
feature_engineer_df=kaggle[social_reach_cols]



In [None]:

# Min-max scale each social-reach column to [0, 1] in a single fit_transform.
# The previous loop assigned the full (n_rows x 5) scaled array to every single
# column, which raises a ValueError. Building the frame from `kaggle` keeps the
# cell idempotent on re-run.
scaler = MinMaxScaler()
feature_engineer_df = pd.DataFrame(
    scaler.fit_transform(kaggle[social_reach_cols]),
    columns=social_reach_cols,
    index=kaggle.index,
)

# Social reach score = sum of the five scaled columns. skipna=False mirrors the
# chained `+`: a row missing any component gets NaN instead of a partial sum.
kaggle["social_reach_score"] = feature_engineer_df.sum(axis=1, skipna=False)
kaggle.sample(3)

In [None]:
#Dataframes to model


#1 4 distinct labels
MODEL_LABELS = ["bs", "bias", "conspiracy", "satire"]
kaggle = kaggle[kaggle["label"].isin(MODEL_LABELS)]


#2 Important date ranges between 10-26 and 11-25
# Half-open [start, next_start) windows so every row lands in exactly one
# period. Previously 2016-11-10 matched both the election and the
# after-election masks; it now belongs to the election window only.
COMEY_START = pd.Timestamp("2016-10-28")
ELECTION_START = pd.Timestamp("2016-11-08")
AFTER_ELECTION_START = pd.Timestamp("2016-11-11")

before_comey_mask = kaggle["date"] < COMEY_START
comey_mask = (kaggle["date"] >= COMEY_START) & (kaggle["date"] < ELECTION_START)
election_mask = (kaggle["date"] >= ELECTION_START) & (kaggle["date"] < AFTER_ELECTION_START)
after_election_mask = kaggle["date"] >= AFTER_ELECTION_START

bc_df = kaggle[before_comey_mask]
comey_df = kaggle[comey_mask]
election_df = kaggle[election_mask]
ae_df = kaggle[after_election_mask]






In [None]:
##### Before Comey letter 

In [None]:
CV_bc = TokenAnalysis(bc_df)
TF_1_ngram_bc = TokenAnalysis(bc_df, TfidfVectorizer)
TF_2_ngram_bc = TokenAnalysis(bc_df, TfidfVectorizer, n_gram_range=(2,2))
TF_3_ngram_bc = TokenAnalysis(bc_df, TfidfVectorizer, n_gram_range=(3,3))


In [None]:
# Top 10 GLOBAL tokens
CV_bc.plot_top_tokens()

In [None]:
# Top 10 GLOBAL tokens for each label

CV_bc.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
CV_bc.plot_top_tokens(label="bs")

# Top 10 bias tokens
CV_bc.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
CV_bc.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
CV_bc.plot_top_tokens(label="satire")

In [None]:
# Top 10 GLOBAL tokens
TF_2_ngram_bc.plot_top_tokens()

# Top 10 GLOBAL tokens for each label
TF_2_ngram_bc.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
TF_2_ngram_bc.plot_top_tokens(label="bs")

# Top 10 bias tokens
TF_2_ngram_bc.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
TF_2_ngram_bc.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
TF_2_ngram_bc.plot_top_tokens(label="satire")

In [None]:
# Top 10 GLOBAL tokens
TF_3_ngram_bc.plot_top_tokens()

# Top 10 GLOBAL tokens for each label
TF_3_ngram_bc.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
TF_3_ngram_bc.plot_top_tokens(label="bs")

# Top 10 bias tokens
TF_3_ngram_bc.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
TF_3_ngram_bc.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
TF_3_ngram_bc.plot_top_tokens(label="satire")

In [None]:
##### During Comey letter

In [None]:
CV_comey = TokenAnalysis(comey_df)
TF_2_ngram_comey = TokenAnalysis(comey_df, TfidfVectorizer, n_gram_range=(2,2))
TF_3_ngram_comey = TokenAnalysis(comey_df, TfidfVectorizer, n_gram_range=(3,3))


In [None]:
# Top 10 GLOBAL tokens
CV_comey.plot_top_tokens()

In [None]:
# Top 10 GLOBAL tokens for each label

CV_comey.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
CV_comey.plot_top_tokens(label="bs")

# Top 10 bias tokens
CV_comey.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
CV_comey.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
CV_comey.plot_top_tokens(label="satire")

In [None]:
# Top 10 GLOBAL tokens
TF_2_ngram_comey.plot_top_tokens()

# Top 10 GLOBAL tokens for each label
TF_2_ngram_comey.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
TF_2_ngram_comey.plot_top_tokens(label="bs")

# Top 10 bias tokens
TF_2_ngram_comey.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
TF_2_ngram_comey.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
TF_2_ngram_comey.plot_top_tokens(label="satire")

In [None]:
# Top 10 GLOBAL tokens
TF_3_ngram_comey.plot_top_tokens()

# Top 10 GLOBAL tokens for each label
TF_3_ngram_comey.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
TF_3_ngram_comey.plot_top_tokens(label="bs")

# Top 10 bias tokens
TF_3_ngram_comey.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
TF_3_ngram_comey.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
TF_3_ngram_comey.plot_top_tokens(label="satire")

In [None]:
##### During Election

In [None]:
CV_election = TokenAnalysis(election_df)
TF_2_ngram_election = TokenAnalysis(election_df, TfidfVectorizer, n_gram_range=(2,2))
TF_3_ngram_election = TokenAnalysis(election_df, TfidfVectorizer, n_gram_range=(3,3))


In [None]:
# Top 10 GLOBAL tokens
CV_election.plot_top_tokens()

In [None]:
# Top 10 GLOBAL tokens for each label

CV_election.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
CV_election.plot_top_tokens(label="bs")

# Top 10 bias tokens
CV_election.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
CV_election.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
CV_election.plot_top_tokens(label="satire")

In [None]:
# Top 10 GLOBAL tokens
TF_2_ngram_election.plot_top_tokens()

# Top 10 GLOBAL tokens for each label
TF_2_ngram_election.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
TF_2_ngram_election.plot_top_tokens(label="bs")

# Top 10 bias tokens
TF_2_ngram_election.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
TF_2_ngram_election.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
TF_2_ngram_election.plot_top_tokens(label="satire")

In [None]:
# Top 10 GLOBAL tokens
TF_3_ngram_election.plot_top_tokens()

# Top 10 GLOBAL tokens for each label
TF_3_ngram_election.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
TF_3_ngram_election.plot_top_tokens(label="bs")

# Top 10 bias tokens
TF_3_ngram_election.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
TF_3_ngram_election.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
TF_3_ngram_election.plot_top_tokens(label="satire")

In [None]:
##### After election

In [None]:
CV_ae = TokenAnalysis(ae_df)
TF_2_ngram_ae = TokenAnalysis(ae_df, TfidfVectorizer, n_gram_range=(2,2))
TF_3_ngram_ae = TokenAnalysis(ae_df, TfidfVectorizer, n_gram_range=(3,3))


In [None]:
# Top 10 GLOBAL tokens
CV_ae.plot_top_tokens()

In [None]:
# Top 10 GLOBAL tokens for each label

CV_ae.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
CV_ae.plot_top_tokens(label="bs")

# Top 10 bias tokens
CV_ae.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
CV_ae.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
CV_ae.plot_top_tokens(label="satire")

In [None]:
# Top 10 GLOBAL tokens
TF_2_ngram_ae.plot_top_tokens()

# Top 10 GLOBAL tokens for each label
TF_2_ngram_ae.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
TF_2_ngram_ae.plot_top_tokens(label="bs")

# Top 10 bias tokens
TF_2_ngram_ae.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
TF_2_ngram_ae.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
TF_2_ngram_ae.plot_top_tokens(label="satire")

In [None]:
# Top 10 GLOBAL tokens
TF_3_ngram_ae.plot_top_tokens()

# Top 10 GLOBAL tokens for each label
TF_3_ngram_ae.plot_top_tokens_by_label()

In [None]:
# Top 10 bs tokens
TF_3_ngram_ae.plot_top_tokens(label="bs")

# Top 10 bias tokens
TF_3_ngram_ae.plot_top_tokens(label="bias")

In [None]:
# Top 10 conspiracy tokens
TF_3_ngram_ae.plot_top_tokens(label="conspiracy")

# Top 10 satire tokens
TF_3_ngram_ae.plot_top_tokens(label="satire")

In [None]:
# Initialize the `tfidf_vectorizer` for bigrams only (ngram_range=(2,2))
# tfidf_vectorizer2 = TfidfVectorizer(stop_words=stop_words, max_df=0.7, ngram_range=(2,2)) 

# # Fit and transform the training data 
# tfidf_train2 = tfidf_vectorizer2.fit_transform(X_train) 

# # Transform the test set 
# tfidf_test2 = tfidf_vectorizer2.transform(X_test)

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score

In [None]:
# clf = MultinomialNB()

# clf.fit(count_train, y_train)
# pred = clf.predict(count_test)
# score = accuracy_score(y_test, pred)
# print("accuracy:   %0.3f" % score)

In [None]:
# clf = MultinomialNB()

# clf.fit(tfidf_train, y_train)
# pred = clf.predict(tfidf_test)
# score = accuracy_score(y_test, pred)
# print("accuracy:   %0.3f" % score)