In [48]:
import os, sys, json, re, itertools

import nltk, sklearn, pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

lemmatizer = WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()

path = "Downloads/fake-news/"

train = "train.csv"
test = "test.csv"
submit = "submit.csv"

token_pat = re.compile(r"[^\W_]+")

tag_map = {
    "J": wn.ADJ,
    "V": wn.VERB,
    "R": wn.ADV,
}


def get_wordnet_tag(pos):
    return tag_map.get(pos[0], wn.NOUN)


def load_data(fn):
    fn = os.path.join(path, fn)
    df = pd.read_csv(fn)
    return df


def get_stopwords():
    return list(itertools.chain(*[tokenize_text(stopword) for stopword in stopwords.words("english")]))


def tokenize_text(text, stem=True):
    # tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
    # tokens = tokenizer.tokenize(text)
    
    # using simple regex tokens which only detects digits/characters
    # tokens = token_pat.findall(text)
    
    tokens = word_tokenize(text)
    if stem:
        return [stemmer.stem(token) for token in tokens]
    processed_tokens = []
    for token, pos in pos_tag(tokens):
        # lemma = lemmatizer.lemmatize(token, get_wordnet_tag(pos))
        processed_token = stemmer.stem(token)  # lemma
        # print(token, processed_token)
        processed_tokens.append(processed_token)
    
    return processed_tokens




def get_featurizer():
    vectorizer = TfidfVectorizer(
        strip_accents="ascii",
        lowercase=True,
        # tokenizer=tokenize_text,
        stop_words=get_stopwords(),  # stopwords.words("english"),
        ngram_range=(1, 1), 
        max_features=50000
    )
    def featurize(df, train=True, labels=True):
        if train:
            X = vectorizer.fit_transform(df["text"])
        else:
            X = vectorizer.transform(df["text"])
        if not labels:
            return X
        y = df["label"]

        return X, y
    return featurize


def clean_df(df, col="text"):
    return df[~df[col].isna()]
    

In [49]:
train_df = load_data(train)
train_df = clean_df(train_df)
# print("df size:", len(df))
# df = df.dropna()
# print("df dropped size:", len(df))
test_df = load_data(test)
test_df = clean_df(test_df)

In [50]:
train_df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [38]:
test_df

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [43]:
# tokenize_text("tshifhdsif sdfknsdlkfn sdlfnkls this is good goods giving give giver  lover love give")

In [51]:
featurize = get_featurizer()
train_X, train_y = featurize(train_df)
# test_X, test_y = featurize(test_df, train=False)

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2, random_state=42, stratify=train_y)

classifier = DecisionTreeClassifier()

classifier.fit(X_train, y_train)

pred_y = classifier.predict(X_test)

f1 = f1_score(y_test, pred_y)

f1

0.8788894207754907

In [56]:
classifier = GradientBoostingClassifier()

classifier.fit(train_X, train_y)

# pred_y = classifier.predict(X_test)
# f1 = f1_score(y_test, pred_y)
# f1

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [57]:
ttest_X = featurize(test_df, train=False, labels=False)
pred_y = classifier.predict(ttest_X)
test_df["predicted"] = pred_y

In [58]:
test_df[test_df["predicted"] == 1]

Unnamed: 0,id,title,author,text,predicted
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
5,20805,Trump is USA's antique hero. Clinton will be n...,,Trump is USA's antique hero. Clinton will be n...,1
7,20807,Weekly Featured Profile – Randy Shannon,Trevor Loudon,You are here: Home / *Articles of the Bound* /...,1
...,...,...,...,...,...
5186,25986,The Fed Plans for the Next Crisis,Ron Paul,\nIn her recent address at the Jackson Hole ...,1
5189,25989,Amazon extends olive branch to Megyn Kelly ove...,Staff Writer,11.23.2016 @4:17 PM EST More Establishment tie...,1
5192,25992,Earth To Ammosexuals: NRA Admits No One Is Com...,Natalie Dailey,Earth To Ammosexuals: NRA Admits No One Is Com...,1
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,1


In [54]:
test_df[test_df["predicted"] == 1]

Unnamed: 0,id,title,author,text,predicted
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
5,20805,Trump is USA's antique hero. Clinton will be n...,,Trump is USA's antique hero. Clinton will be n...,1
7,20807,Weekly Featured Profile – Randy Shannon,Trevor Loudon,You are here: Home / *Articles of the Bound* /...,1
...,...,...,...,...,...
5186,25986,The Fed Plans for the Next Crisis,Ron Paul,\nIn her recent address at the Jackson Hole ...,1
5189,25989,Amazon extends olive branch to Megyn Kelly ove...,Staff Writer,11.23.2016 @4:17 PM EST More Establishment tie...,1
5192,25992,Earth To Ammosexuals: NRA Admits No One Is Com...,Natalie Dailey,Earth To Ammosexuals: NRA Admits No One Is Com...,1
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,1


In [55]:
len(test_df)

5193

In [61]:
test_df[test_df["predicted"] == 0]

Unnamed: 0,id,title,author,text,predicted
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",0
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",0
15,20815,"Thieves Take a Chunk of Change, All 221 Pounds...",Melissa Eddy,"BERLIN — You could never palm it, flip it o...",0
16,20816,"New England Patriots’ Owner, Still Sore at N.F...",Ken Belson and Ben Shpigel,"FOXBOROUGH, Mass. — The N. F. L. likes port...",0
...,...,...,...,...,...
5193,25993,Toyota Recalls 1.4 Million Vehicles as 2nd Maj...,Hiroko Tabuchi and Jonathan Soble,Public concern about faulty automobile airbags...,0
5194,25994,Trump on If ’Tapes’ Exist of Comey Conversatio...,Pam Key,Pres. Trump on if “tapes” exist of his convers...,0
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,0
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,0


# Creating Submit.csv

In [67]:
submit_df = test_df.rename(columns={"predicted": "label"})
print(submit_df.columns)
submit_df = submit_df[["id", "label"]]


submit_df.to_csv(os.path.join(path, submit), sep=",", index=False)
submit_df

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')


Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1
...,...,...
5195,25995,1
5196,25996,0
5197,25997,0
5198,25998,1
