In [81]:
import numpy as np
import pandas as pd 
import spacy
import string
import re
from nltk.corpus import stopwords

In [80]:
# out-of-the-box: download best-matching default model
# python -m spacy download en
parser = spacy.load("en")

In [23]:
def clean_text(text):
        # get rid of newlines
        text = text.strip().replace("\n", " ").replace("\r", " ")
        
        # replce mentions wth @
        mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        text = mentionFinder.sub("@MENTION", text)

        # replace emails and also @ mention
        emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        text = emailFinder.sub("<EMAIL>", text)

        # replace HTML symbols
        text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
        
        ## Exlpore more patterns inside text and add it here
        return text


In [40]:
# stopword that we dont want 
STOPLIST = set(stopwords.words('english')) # add more stopword
# symbols that we dont want, research add more later 
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "’", "’s", "'s"]

# Lets create a custome tokenizer using spacy
def spacy_simple_tokenizer(texts):
    tokens = parser(texts)
    lemmas = []
    try: 
        lemmas = [tok.text.lower().strip() if tok.ent_type_ == "" else "<{}>".format(tok.ent_type_) for tok in tokens]
    except:
        print('error occured')
        lemmas.append("<UNK>")
    
    tokens  = lemmas
    # For named entity we hve to replace them with their positional index
    #tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]
    
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")
                
    all_tokens = " ".join(tokens)
   
    return str(all_tokens)            

In [51]:
df_train = pd.read_pickle('data/data_train.pkl')
df_test = pd.read_pickle('data/data_test.pkl')

X_train = df_train['content']
y_train = df_train['label']
X_test = df_test['content']
y_test = df_test['label']

In [60]:
X_train.shape

(8744,)

In [53]:
X_train = [clean_text(text) for text in X_train]

In [56]:
X_train.head()

4075     Cbrain A/S : \n\n* The Ministry of Immigration...
100      (Before It's News)  \n\nFor the last several m...
8141     Story highlights Injured firefighter chats wit...
880      Z Capital Group said its private equity manage...
11936    (Before It's News)  \n \nWhen Youngest Junior ...
Name: content, dtype: object

In [57]:
only_one_df = X_train[4075]
#only_one_df = "this is a cat, and a cat walked over to mountain and said what a lovely day is, and she was in China.\n Is India a nice place. "

In [58]:
one = spacy_simple_tokenizer(only_one_df)

In [59]:
one

'cbrain <ORG> <ORG> <ORG> <ORG> <ORG> <ORG> <ORG> <ORG> signed <GPE> agreement <GPE> deliver <PRODUCT> case document management <ORG> delivered cloud service saas <ORG> <FAC> <FAC> <GPE> <NORP> agency <ORG> services <PERSON> source text eikon <ORG> company coverage <PERSON> <PERSON>'

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer  = TfidfVectorizer(tokenizer= spacy_simple_tokenizer, ngram_range=(1,2), max_df= 0.85, min_df= 2, max_features=3000)

In [62]:
# started at 8.46
from timeit import default_timer as timer
import time
start = timer()
t0 = time.time()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_validation_tfidf = tfidf_vectorizer.transform(X_test)
t1 = time.time()

total = t1-t0
# STOP MY TIMER
elapsed_time = timer() - start # in seconds
print(elapsed_time)

print("total time ", total)

NameError: name 'start' is not defined

In [64]:
X_train_tfidf.shape

(8744, 2883)

In [66]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [68]:
from sklearn.linear_model import LogisticRegression
# using optimal parameters from grid search (l1 and 10)
lr = LogisticRegression(penalty='l1', C = 10)
# train our model
lr.fit(X_train_tfidf, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [70]:
y_test_predict_lr = lr.predict(X_validation_tfidf)

In [72]:
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score
lr_acc = accuracy_score(y_test, y_test_predict_lr) *  100 
lr_F1 = f1_score(y_test, y_test_predict_lr) * 100
lr_precision = precision_score(y_test, y_test_predict_lr) * 100
lr_recall = recall_score(y_test, y_test_predict_lr) * 100
print ("Logistic regression accuracy and F1 score \n")
print ("Accuracy {:.5}".format(lr_acc))
print ("F1 {:.5}".format(lr_F1))
print ("Precision {:.5}".format(lr_precision))
print ("Recall {:.5}".format(lr_recall))

Logistic regression accuracy and F1 score 

Accuracy 93.436
F1 86.195
Precision 88.377
Recall 84.118


In [75]:
import pickle
def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

In [76]:
save_object(X_train_tfidf, 'data/X_train_tfidf.pkl')
save_object(X_validation_tfidf, 'data/X_test_tfidf.pkl')

In [79]:
save_object(y_train, 'data/y_train_spacy.pkl')
save_object(y_test, 'data/y_test_spacy.pkl')

In [None]:
with open('data/X_train_tfidf.pkl', 'rb') as input:
    saved_xtrain = pickle.load(input)