In [2]:
import os
import pandas as pd
import spacy
from pickle import dump
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the training corpus through the project helper class.
# NOTE(review): `_loadArticles` is underscore-prefixed (private by convention) on
# TopicTrain; its behaviour is not visible here — the preview below shows the
# resulting frame has `category` and `article` columns.
from utils_articles import TopicTrain
articles_df = TopicTrain()._loadArticles('data')
# Preview the first rows as a sanity check on the load.
articles_df.head()

Unnamed: 0,category,article
0,business,Ad sales boost Time Warner profit Quarterly p...
1,business,Dollar gains on Greenspan speech The dollar h...
2,business,Yukos unit buyer faces loan claim The owners ...
3,business,High fuel prices hit BA's profits British Air...
4,business,Pernod takeover talk lifts Domecq Shares in U...


In [4]:
def spacy_tokenizer(doc):
    """Tokenizer callable for the TF-IDF step of the pipeline.

    Runs ``doc`` through the notebook-level spaCy pipeline ``nlp`` (it is not
    loaded here, so ``nlp`` must be defined before this function is called),
    lemmatizes and lowercases every non-whitespace token, then drops English
    stop words and punctuation.

    Args:
        doc(str): text to tokenize.
    Returns:
        list: lowercased lemmas that survived the filtering, in document order.
    """
    # string.punctuation is a plain str, so the membership test further down
    # is a substring check against that string.
    punct_chars = string.punctuation
    stop_vocab = spacy.lang.en.stop_words.STOP_WORDS

    kept = []
    for tok in nlp(doc):
        # Whitespace-only tokens carry no content.
        if tok.is_space:
            continue
        lemma = tok.lemma_.lower()
        # Filtering is applied to the lowercased lemma, not the raw token text.
        if lemma in stop_vocab or lemma in punct_chars:
            continue
        kept.append(lemma)
    return kept

In [5]:
# spaCy pipeline used by `spacy_tokenizer`, which looks it up as the global `nlp`.
nlp = spacy.load('en_core_web_sm')

# Fixed seed so the random forest — and therefore the fitted model and its
# predictions — is reproducible across Restart & Run All.
RANDOM_SEED = 42

# TF-IDF features (custom spaCy tokenizer; min_df=3 drops terms that occur in
# fewer than 3 documents) feeding a random-forest classifier.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=spacy_tokenizer, min_df=3)),
    ('clf', RandomForestClassifier(random_state=RANDOM_SEED)),
])

In [6]:
# Fit the whole pipeline (vectorizer, then classifier) on the article texts,
# using the `category` column as the target label.
text_clf.fit(articles_df['article'], articles_df['category'])



In [9]:
import pathlib

# Persist the fitted pipeline under ./model, relative to the current working directory.
model_path = os.path.join(str(pathlib.Path().absolute()), "model")
# Create the target directory if it is missing instead of failing inside open().
os.makedirs(model_path, exist_ok=True)
model_file = os.path.join(model_path, "rm_tfidf.pkl")
print(model_file)
# The context manager guarantees the handle is flushed and closed even if
# pickling raises. Note that pickle stores the `spacy_tokenizer` function by
# reference, so it must be defined wherever this file is loaded.
with open(model_file, 'wb') as model_fh:
    dump(text_clf, model_fh)

/usr/src/nlp/articles/model/rm_tfidf.pkl


# Loading and predicting

In [13]:
import os
import pathlib
import joblib

# Same location the training section writes to: ./model/rm_tfidf.pkl under the
# current working directory.
model_path = os.path.join(str(pathlib.Path().absolute()), "model")
model_file = os.path.join(model_path, "rm_tfidf.pkl")
# SECURITY: unpickling executes arbitrary code — only load model files that you
# produced yourself or otherwise trust.
# NOTE(review): the pickled pipeline presumably references `spacy_tokenizer`
# (and its global `nlp`) by name, so both must already be defined in this
# kernel before loading — confirm on a fresh kernel.
model = joblib.load(model_file)

In [10]:
import os
import pandas as pd
def _loadArticles(path):
    """Read every ``.txt`` file under ``path`` into a two-column DataFrame.

    The directory tree is walked recursively. Each file becomes one row whose
    ``category`` is the name of the directory that directly contains the file
    and whose ``article`` is the file's text with its lines joined by a single
    space and newline characters removed.

    Args:
        path (str): root directory to scan.

    Returns:
        pandas.DataFrame: columns ``['category', 'article']``, one row per
        file. Directories are visited in sorted order and files are sorted
        within each directory, so row order is reproducible. The frame is
        empty (but still has both columns) when no ``.txt`` file is found.
    """
    cat_article = []
    for subdir, dirs, files in os.walk(path):
        # os.walk yields entries in arbitrary filesystem order; sorting `dirs`
        # in place fixes the traversal order for the remaining walk.
        dirs.sort()
        # normpath strips a trailing separator so 'data/' still yields 'data',
        # and basename honours the platform's separator rather than only '/'.
        category = os.path.basename(os.path.normpath(subdir))
        for file in sorted(files):
            # Match the extension only; a substring test would also pick up
            # names such as 'notes.txt.bak'.
            if not file.endswith('.txt'):
                continue
            # The context manager closes the handle even if reading raises.
            with open(os.path.join(subdir, file), 'r') as f:
                lines = f.readlines()
            article = ' '.join(lines).replace('\n', '')
            # list of lists: [category, article]
            cat_article.append([category, article])
    # Passing `columns` to the constructor keeps the schema for an empty
    # result, where assigning `.columns` afterwards would raise ValueError.
    return pd.DataFrame(cat_article, columns=['category', 'article'])

In [11]:
# Load the articles to classify. As the preview shows, the `category` column
# here just holds the folder name ('test_articles'), not a topic label.
target_art = _loadArticles('test_articles')
target_art.head()

Unnamed: 0,category,article
0,test_articles,"Barclays boss Jes Staley is ""shell-shocked, an..."
1,test_articles,"For nearly seven months, through the regular s..."
2,test_articles,Tennessee Titans running back Derrick Henry su...
3,test_articles,"There will still be a Facebook, and an Instagr..."


In [12]:
import numpy as np
# Pick the article at positional index 1 as the text to classify.
inp = target_art['article'].values[1]
# Wrapping the string in np.array yields a 0-d unicode array (see output below).
# NOTE(review): within the visible notebook `inp_arr` is only displayed; the
# prediction cell passes `[inp]` instead.
inp_arr = np.array(inp)
inp_arr

array("For nearly seven months, through the regular season and the early rounds of the playoffs, the Houston Astros featured the best offense in baseball. It was the highest-scoring, best-balanced, most unsolvable attack in the majors.  The Astros hit for power. They hit for average. They walked the fine line between discipline and aggression: They were the best two-strike hitting team in the big leagues. In an era when so much of the battle is decided by the team that wins the strike zone, the Astros' batsmen generally won the strike zone.",
      dtype='<U539')

In [14]:
# The single article is wrapped in a list so the model receives a collection of
# documents rather than one bare string.
pred = model.predict([inp])
proba = model.predict_proba([inp])
print(pred)
# Probabilities for the one input row.
# NOTE(review): columns presumably follow `model.classes_` order — confirm
# before mapping values to labels.
proba[0]

['sport']


array([0.06, 0.18, 0.05, 0.71, 0.  ])

In [17]:
# Run the training routine packaged in the project module.
# NOTE(review): presumably the module-level counterpart of the training cells
# earlier in this notebook — confirm in utils_articles.
from utils_articles import TopicTrain
TopicTrain().train()

