In [171]:
import numpy as np
import pandas as pd 
import spacy
import string
import re
from nltk.corpus import stopwords

In [172]:
# spaCy English pipeline used below for tokenization and entity types.
# Install the model once from a shell:
#     python -m spacy download en_core_web_sm
# The legacy "en" shortcut link is tried first so existing spaCy 2 setups keep
# loading exactly the model they used before; spaCy 3 removed shortcut links and
# raises OSError for "en", in which case the full package name is loaded.
try:
    parser = spacy.load("en")
except OSError:
    parser = spacy.load("en_core_web_sm")

In [173]:
# Compiled once when the cell runs instead of on every clean_text() call.
_MENTION_RE = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
_EMAIL_RE = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)


def clean_text(text):
    """Normalize one raw document before tokenization.

    Steps, in order: trim and flatten line breaks to spaces, replace
    ``@handle`` mentions with ``@MENTION``, replace e-mail addresses with
    ``<EMAIL>``, decode a few HTML entities, and drop known site boilerplate.

    Parameters
    ----------
    text : str or None
        Raw document. ``None`` and float NaN (a missing cell in a pandas
        column) are treated as an empty document.

    Returns
    -------
    str
        The cleaned text; ``""`` for a missing document.
    """
    # Missing values would otherwise fail on .strip() with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Get rid of newlines.
    text = text.strip().replace("\n", " ").replace("\r", " ")

    # Replace @mentions first. This also rewrites the domain of an e-mail
    # address (a@b.com -> a@MENTION.com), which the e-mail pattern still matches.
    text = _MENTION_RE.sub("@MENTION", text)

    # Replace e-mail addresses.
    text = _EMAIL_RE.sub("<EMAIL>", text)

    # Replace HTML entities.
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # Strip boilerplate phrases seen in the corpus.
    text = text.replace("(Before It's News)", "")
    text = text.replace("% of readers think this story is Fact. Add your two cents.", "")
    # Explore more patterns inside the text and add them here.
    return text


In [174]:
# Words removed after tokenization: NLTK's English stop-word list (extend as needed).
STOPLIST = set(stopwords.words('english')) # add more stopword
# Symbols removed after tokenization: every ASCII punctuation character plus a
# few dash runs, an ellipsis and typographic quotes. Extend after more research.
SYMBOLS = list(string.punctuation) + ["-----", "---", "...", "“", "”", "’", "…"]

# Custom tokenizer built on spaCy.
def pretokenizer_clean(texts):
    """Tokenize one document with spaCy and return it as a cleaned string.

    Every token is lower-cased and stripped. A token that carries a
    named-entity type is replaced by a ``<TYPE>`` placeholder built from
    ``tok.ent_type_``. Tokens found in ``STOPLIST`` or ``SYMBOLS`` and
    empty/whitespace tokens are then dropped.

    Parameters
    ----------
    texts : str
        A single document (despite the plural name), passed to ``parser``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. If building the token
        list fails, the result is ``"<UNK>"``.
    """
    doc = parser(texts)
    try:
        tokens = [
            tok.text.lower().strip() if tok.ent_type_ == "" else "<{}>".format(tok.ent_type_)
            for tok in doc
        ]
    except Exception:
        # This used to be a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit and made the long run hard to stop.
        print('error occured')
        tokens = ["<UNK>"]

    # One pass replaces the separate stop-word and symbol filters and the
    # repeated ``while x in tokens: tokens.remove(x)`` loops (each quadratic).
    blanks = ("", " ", "\n", "\n\n")
    tokens = [
        tok for tok in tokens
        if tok not in STOPLIST and tok not in SYMBOLS and tok not in blanks
    ]

    return " ".join(tokens)

In [175]:
# Complete dataset; needed here only to regenerate the train/test split below.
df_all = pd.read_pickle('data/all_data.pkl')
# Run the commented block once, and only when the underlying data has changed.
# It draws 70% of the rows for training, keeps the remaining rows for testing,
# and pickles both parts so the grid search can read the same files.
# NOTE(review): sample() is called without random_state, so every regeneration
# yields a different split -- consider fixing a seed.
# df_train = df_all.sample(frac=0.7)
# df_test = df_all.loc[~df_all.index.isin(df_train.index), :]
# # pickle these for grid search
# df_train.to_pickle('data/data_train.pkl')
# df_test.to_pickle('data/data_test.pkl')


In [176]:
# Load the persisted split, then separate the 'content' column (model input)
# from the 'label' column (target) for each part.
df_train, df_test = (
    pd.read_pickle(path)
    for path in ('data/data_train.pkl', 'data/data_test.pkl')
)

X_train, y_train = df_train['content'], df_train['label']
X_test, y_test = df_test['content'], df_test['label']

In [177]:
# Number of training documents; X_train is still the 'content' column of df_train here.
X_train.shape

(6796,)

In [178]:
# Regex-based clean-up of every document; from here on X_train and X_test are
# plain Python lists of strings rather than the original columns.
X_train = list(map(clean_text, X_train))
X_test = list(map(clean_text, X_test))

In [180]:
# Try the spaCy tokenizer on a single document before starting the long full pass.
one = pretokenizer_clean(X_train[0])

In [181]:
# Slow step (about 30 minutes): run the spaCy tokenizer over every document.
# Each document becomes one space-joined string of kept tokens.
X_train = list(map(pretokenizer_clean, X_train))
X_test = list(map(pretokenizer_clean, X_test))

In [182]:
# ENGLISH_STOP_WORDS is imported from sklearn.feature_extraction.text because
# the sklearn.feature_extraction.stop_words module was deprecated and later
# removed; the .text module exposes the same frozenset in old and new releases.
# The name is kept importable in case other cells use it.
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Unigrams and bigrams; ignore terms present in more than 85% of the documents
# or in fewer than 2 documents; cap the vocabulary at 5000 features.
# stop_words="english" selects scikit-learn's built-in list (ENGLISH_STOP_WORDS).
# Recent releases validate this argument as a string or a list, so the frozenset
# itself is no longer passed.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_df=0.85,
    min_df=2,
    max_features=5000,
)

In [183]:
from timeit import default_timer as timer
import time
# Two clocks are started side by side; both measure the same vectorization step.
start = timer()
t0 = time.time()
# Fit the vectorizer on the training documents only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# ... and reuse the fitted vectorizer, without refitting, for the test documents.
X_test_tfidf = tfidf_vectorizer.transform(X_test)
t1 = time.time()

total = t1-t0
# Stop the timeit-based timer.
elapsed_time = timer() - start # in seconds
print(elapsed_time)

print("total time ", total)

9.972166464023758
total time  9.972394943237305


In [184]:
# (documents, features); the feature count is capped by max_features=5000.
X_train_tfidf.shape

(6796, 5000)

In [185]:
# Check the container type of the TF-IDF matrix.
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [186]:
from sklearn.linear_model import LogisticRegression
# Hyper-parameters come from an earlier grid search (L1 penalty, C = 10).
# solver='liblinear' is stated explicitly: it was the implicit default when this
# notebook was first run, but newer scikit-learn releases default to 'lbfgs',
# which does not support the L1 penalty and raises an error.
lr = LogisticRegression(penalty='l1', C=10, solver='liblinear')
# Train on the TF-IDF training matrix.
lr.fit(X_train_tfidf, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [187]:
# Predicted labels for the held-out test documents.
y_test_predict_lr = lr.predict(X_test_tfidf)

In [188]:
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score

# Test-set metrics for the logistic-regression model, each scaled to percent.
lr_acc = 100 * accuracy_score(y_test, y_test_predict_lr)
lr_F1 = 100 * f1_score(y_test, y_test_predict_lr)
lr_precision = 100 * precision_score(y_test, y_test_predict_lr)
lr_recall = 100 * recall_score(y_test, y_test_predict_lr)

print("Logistic regression accuracy and F1 score \n")
# "{:.5}" limits each value to five significant digits.
print("\n".join(
    template.format(score)
    for template, score in (
        ("Accuracy {:.5}", lr_acc),
        ("F1 {:.5}", lr_F1),
        ("Precision {:.5}", lr_precision),
        ("Recall {:.5}", lr_recall),
    )
))

Logistic regression accuracy and F1 score 

Accuracy 89.32
F1 83.692
Precision 84.444
Recall 82.952


In [194]:
import pickle


def save_object(obj, filename):
    """Pickle ``obj`` into ``filename`` using the highest available protocol.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. Nothing is returned.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)

In [226]:
# Persist the TF-IDF matrices so later work can skip the vectorization step.
save_object(X_train_tfidf, 'data/X_train_tfidf.pkl')
save_object(X_test_tfidf, 'data/X_test_tfidf.pkl')

In [227]:
# Persist the matching label columns next to the feature matrices.
save_object(y_train, 'data/y_train_spacy.pkl')
save_object(y_test, 'data/y_test_spacy.pkl')

In [229]:
# Persist the documents as they stand after clean_text + pretokenizer_clean,
# so the slow spaCy pass does not have to be repeated.
save_object(X_train, 'data/X_train_spacy.pkl')
save_object(X_test, 'data/X_test_spacy.pkl')

In [231]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (available since 1.0) replaces it
# and returns an array, which is converted so a plain list is still pickled.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
save_object(feature_names, 'data/tfidf_featurenames_spacy.pkl')


In [None]:
# Read the pickled training matrix back from disk.
# The handle is deliberately not called `input`: bound at cell level, that name
# would shadow the built-in input() for the rest of the kernel session.
# pickle.load can execute arbitrary code, so only load files this notebook wrote.
with open('data/X_train_tfidf.pkl', 'rb') as fh:
    saved_xtrain = pickle.load(fh)

In [189]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; prefer get_feature_names_out() and keep a plain list.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()

In [230]:
# Peek at ten vocabulary entries; both single words and two-word n-grams appear.
feature_names[100:110]

['additional class',
 'additional information',
 'additional reporting',
 'additionally',
 'address',
 'addresses',
 'addressing',
 'adds',
 'adjusted',
 'adjustment']

In [190]:
# First row of the fitted coefficient matrix: one weight per TF-IDF feature.
# NOTE(review): presumably a binary problem, so coef_ has a single row -- confirm.
coeff = lr.coef_[0]

In [191]:
# Indices of the 30 largest coefficients, largest first
# (argsort is ascending, so it is reversed before taking the first 30).
top_n_index = np.argsort(coeff)[::-1][:30]

In [192]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; prefer get_feature_names_out() and keep a plain list.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
# Pair each selected column index with its term and its coefficient.
top_feat_tuple = [(feature_names[i], coeff[i]) for i in top_n_index]

In [193]:
# Show the (term, coefficient) pairs, largest coefficient first.
top_feat_tuple

[('http', 30.40274696359508),
 ('subscribe org', 28.91593141667666),
 ('post appeared', 24.075247732257935),
 ('article', 23.534869065678073),
 ('appeared ordinal', 22.713944886038636),
 ('originally published', 21.239162700965448),
 ('embedded content', 21.08176913887025),
 ('average cardinal', 18.44430605535015),
 ('wrong', 16.543722144762654),
 ('post', 16.031163248874066),
 ('bullish', 15.810049600853297),
 ('certainly', 15.255951826697286),
 ('vs', 15.070715248894407),
 ('copyright', 14.904340759721523),
 ('spiritual', 14.847342707158452),
 ('essays', 14.801456777513922),
 ('ultimately', 14.48958528963117),
 ('correct', 14.047396169186069),
 ('reading', 13.719835085792017),
 ('figure', 13.657053190569743),
 ('morning', 12.75193108030634),
 ('appears', 12.48842920615319),
 ('spirit', 12.466586242603228),
 ('finally', 12.422041837713445),
 ('exposed', 11.983662727441743),
 ('scenario', 11.658998959492983),
 ('proof', 11.571683735018052),
 ('middle', 11.468704576979368),
 ('terrorist