In [None]:
# For an unseen article, we use the prebuilt models to build its features.

In [8]:
from nltk.corpus import stopwords
import gensim
import pickle
import spacy
from datetime import datetime, timedelta
import pandas as pd
from textblob import TextBlob
from textstat.textstat import textstat

In [9]:
def text_to_features(article_content, crtd_date, category):
    """Turn one unseen article into a feature dictionary using the prebuilt models.

    Every model artefact is read from ``./lda_model_data/`` on each call.

    Parameters
    ----------
    article_content : str
        Raw text of the article.
    crtd_date : str
        Creation time formatted as ``%Y-%m-%d %H:%M:%S``
        (ex: "2018-12-11 00:19:31").
    category : str
        Category slug of the article. It is collapsed into a coarse bucket
        stored under the ``"cat"`` key; unknown slugs become ``"other"``.

    Returns
    -------
    dict
        * one entry per LDA topic, keyed by the topic id as a string, holding
          the topic probability of the article;
        * ``"msaav10"``: mean ``upv3day`` of the most similar stored articles
          created within the look-back window before ``crtd_date``
          (``nan`` when no stored article falls inside that window);
        * ``"weekday"`` and ``"hour"`` taken from ``crtd_date``;
        * ``"num_of_sentences"``, ``"length"``, ``"polarity"``,
          ``"subjectivity"``, ``"num_of_identities"``, ``"other_entities"``;
        * ``"cat"``: the coarse category bucket.

    Raises
    ------
    ValueError
        If ``crtd_date`` does not match ``%Y-%m-%d %H:%M:%S``.
    """

    def unseen_to_vect(text):
        """Return the LDA topic distribution of ``text`` as ``{topic_id_str: prob}``."""
        trigram_model = gensim.models.phrases.Phraser.load("./lda_model_data/trigram_model")
        bigram_model = gensim.models.phrases.Phraser.load("./lda_model_data/bigram_model")
        # A set gives constant-time membership tests while filtering tokens.
        stop_words = set(stopwords.words('english'))
        stop_words.update(['from', 'subject', 're', 'edu', 'use'])
        # One-time download: python3 -m spacy download en
        # Only the tagger is needed here, so the parser and NER are disabled for speed.
        nlp = spacy.load('en', disable=['parser', 'ner'])

        def remove_stopwords(token_list):
            return [word for word in token_list if word not in stop_words]

        def lemmatization(token_list, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
            """Keep the lemmas of tokens whose POS tag is allowed (https://spacy.io/api/annotation)."""
            doc = nlp(" ".join(token_list))
            return [token.lemma_ for token in doc if token.pos_ in allowed_postags]

        tok = list(gensim.utils.simple_preprocess(str(text), deacc=True))
        tok = remove_stopwords(tok)
        tok = bigram_model[tok]
        tok = trigram_model[tok]
        tok = lemmatization(tok)

        # NOTE: pickle.load can execute arbitrary code stored in the file; only
        # load artefacts that were produced by a trusted training run.
        # The context manager closes the file even if unpickling fails.
        with open("./lda_model_data/id2word_dict.pickle", 'rb') as dict_file:
            id2word = pickle.load(dict_file)
        optimal_model = gensim.models.ldamodel.LdaModel.load("./lda_model_data/lda15top")

        # Words the LDA model has never seen are dropped so that inference on
        # the unseen article works without errors.
        vocab = set(id2word.values())
        tok = [word for word in tok if word in vocab]
        id_freq = id2word.doc2bow(tok)
        topic_probs = optimal_model.get_document_topics(id_freq, minimum_probability=0.0)
        return {str(topic_id): prob for topic_id, prob in dict(topic_probs).items()}

    def unseen_msaav10(text, crtd_date, upto_days=30, max_neighbours=10, topn=80000):
        """Average ``upv3day`` over the most similar recent articles.

        ``crtd_date`` is a string formatted as ``%Y-%m-%d %H:%M:%S``.
        ``upto_days`` is the length (in days) of the look-back window,
        ``max_neighbours`` the number of in-window neighbours to average and
        ``topn`` how many doc2vec neighbours are scanned at most.
        """
        created = datetime.strptime(crtd_date, '%Y-%m-%d %H:%M:%S')
        window_start = created - timedelta(days=upto_days)
        model = gensim.models.doc2vec.Doc2Vec.load("./lda_model_data/doc2vec_model")
        tokens = list(gensim.utils.simple_preprocess(str(text), deacc=True))
        inferred = model.infer_vector(tokens, alpha=0.025, min_alpha=0.001, steps=55)
        neighbours = model.docvecs.most_similar(positive=[inferred], topn=topn)
        data = pd.read_pickle("./lda_model_data/data.pickle")

        total = 0
        count = 0
        # The doc2vec tags are aligned with the index of ``data``.
        for tag, _similarity in neighbours:
            published = data.crtd_date[tag]
            # Only articles strictly inside (created - upto_days, created) count.
            if window_start < published < created:
                total += data.upv3day[tag]
                count += 1
                if count == max_neighbours:
                    break

        # Without any in-window neighbour there is nothing to average; report
        # the value as missing instead of dividing by zero.
        average = total / count if count else float("nan")
        return {'msaav10': average, "weekday": created.weekday(), "hour": created.hour}

    def stats(text):
        """Return surface, sentiment and named-entity statistics of ``text``."""
        blob = TextBlob(text)
        nlp = spacy.load('en', disable=["parser"])
        identity_labels = ("PERSON", "NORP", "ORG", "GPE")
        other_labels = ("FAC", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART")
        entity_labels = [ent.label_ for ent in nlp(text).ents]
        return {
            "num_of_sentences": textstat.sentence_count(text),
            "length": len(text),
            "polarity": blob.sentiment.polarity,
            "subjectivity": blob.sentiment.subjectivity,
            "num_of_identities": sum(label in identity_labels for label in entity_labels),
            "other_entities": sum(label in other_labels for label in entity_labels),
        }

    features = unseen_to_vect(article_content)
    features.update(unseen_msaav10(article_content, crtd_date))
    features.update(stats(article_content))

    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated, which previously hid "bathinda" and "patiala".
    regional_slugs = (
        "himachal-pradesh", "uttar-pradesh", "punjab", "haryana", "madhya-pradesh",
        "jalandhar", "chandigarh", "uttrakhand", "ludhiana", "amritsar", "jharkhand",
        "gurdaspur", "firozepur", "hoshiarpur", "bathinda",
        "patiala", "barnala", "faridkot", "kanpurthala", "nawanshahr", "ambala",
        "faridabad", "panipat", "khanna", "yamunanagar", "sonipat", "gurgaon",
        "kurukshetra", "karnal", "jind", "sirsa", "bhiwani",
        "kaithal", "rphtak", "fatehabad", "new-delhi", "jammu-kashmir",
        # NOTE(review): "rphtak" and "kanpurthala" look like typos; the likely
        # intended slugs are accepted as well - confirm against the real data.
        "rohtak", "kapurthala",
    )
    sports_slugs = ("sports", "cricket")
    # Categories that are kept under their own name.
    passthrough_slugs = (
        "national", "entertainment", "business", "international",
        "education-and-jobs", "nari", "dharm", "life-style", "blogs",
    )

    if category in regional_slugs:
        features["cat"] = "regional"
    elif category in sports_slugs:
        features["cat"] = "sports"
    elif category in passthrough_slugs:
        features["cat"] = category
    else:
        features["cat"] = "other"

    return features


In [12]:
# Sample article text used to exercise the feature pipeline.
x="NEW DELHI: Finance Minister Nirmala Sitharaman presents the first budget of Modi 2.0 government at 11 am today. In February's interim budget, the then acting finance minister Piyush Goyal had provided several income tax sops to the middle-class and introduced zero tax liability for those in the ₹5 lakh income bracket. The interim budget had also increased standard deduction to ₹50,000 from ₹40,000 for the salaried class."

# The call is the last expression of the cell, so the returned feature dict is displayed.
text_to_features(article_content=x, crtd_date="2019-05-25 10:10:01", category="sports")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


{'0': 0.02268612,
 '1': 0.028077157,
 '10': 0.011487752,
 '11': 0.67987967,
 '12': 0.016253043,
 '13': 0.030137703,
 '14': 0.038450055,
 '2': 0.013336904,
 '3': 0.026404655,
 '4': 0.014157324,
 '5': 0.021940619,
 '6': 0.015841777,
 '7': 0.038382106,
 '8': 0.011957343,
 '9': 0.031007748,
 'cat': 'sports',
 'hour': 10,
 'length': 29,
 'msaav10': 972.7,
 'num_of_identities': 0,
 'num_of_sentences': 1,
 'other_entities': 0,
 'polarity': 0.7,
 'subjectivity': 0.6000000000000001,
 'weekday': 5}