Feature Engineering Notebook

In [1]:
!pip install transformers --quiet
!pip install flair --quiet

In [2]:
from google.colab import drive
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import os
import matplotlib.pyplot as plt
from transformers import BertTokenizer, TFBertModel
from transformers import logging
from tensorflow.keras.utils import to_categorical
from flair.models import TextClassifier
from flair.data import Sentence
from flair.nn import Classifier
from transformers import pipeline
import time

In [3]:
#import pre_trained models from Higgingface
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = TFBertModel.from_pretrained('bert-base-cased')
key_words_pipe = pipeline("token-classification", model="yanekyuk/bert-keyword-extractor")
topic_pipe = pipeline("text-classification", model="ishaansharma/topic-detector")
sentiment_pipe = pipeline("text-classification", model="shashanksrinath/News_Sentiment_Analysis")
topic_pipe_v2 = pipeline("text-classification", model="wesleyacheng/news-topic-classification-with-bert")

#get sentiment analysis from flair
sia = TextClassifier.load('en-sentiment')
def flair_prediction(x):
    sentence = Sentence(x)
    sia.predict(sentence)
    score = sentence.labels[0]
    if "POSITIVE" in str(score):
        return "pos"
    elif "NEGATIVE" in str(score):
        return "neg"
    else:
        return "neu"
#get NER pre_trained model from flair
tagger = Classifier.load('ner-ontonotes')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

2023-08-05 02:32:02,595 SequenceTagger predicts: Dictionary with 75 tags: O, S-PERSON, B-PERSON, E-PERSON, I-PERSON, S-GPE, B-GPE, E-GPE, I-GPE, S-ORG, B-ORG, E-ORG, I-ORG, S-DATE, B-DATE, E-DATE, I-DATE, S-CARDINAL, B-CARDINAL, E-CARDINAL, I-CARDINAL, S-NORP, B-NORP, E-NORP, I-NORP, S-MONEY, B-MONEY, E-MONEY, I-MONEY, S-PERCENT, B-PERCENT, E-PERCENT, I-PERCENT, S-ORDINAL, B-ORDINAL, E-ORDINAL, I-ORDINAL, S-LOC, B-LOC, E-LOC, I-LOC, S-TIME, B-TIME, E-TIME, I-TIME, S-WORK_OF_ART, B-WORK_OF_ART, E-WORK_OF_ART, I-WORK_OF_ART, S-FAC


In [4]:
drive.mount('/content/gdrive')
!cp '/content/gdrive/MyDrive/W266/data/news_popularity_full.csv' -d /content
!cp '/content/gdrive/MyDrive/W266/data/OnlineNewsPopularity.csv' -d /content

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
# Orginal dataset
df = pd.read_csv('OnlineNewsPopularity.csv')
news_df = pd.read_csv('news_popularity_full.csv').dropna()
df[' share_mod'] = np.log(df[' shares'])
threshold = df[' share_mod'].mean()
df['share_cat'] = df[' share_mod'].apply(lambda x: 1 if x > threshold else 0 ) # change no. of shares to categorical varaibles
merged_df = pd.merge(df,news_df,how='inner',on='url')
merged_df.groupby('share_cat')['share_cat'].count()

share_cat
0    23479
1    16120
Name: share_cat, dtype: int64

In [6]:
#For demo porpose, dataset will cut into 10 rows
merged_df = merged_df.head(10)

In [7]:
# NER extraction step from article body
# This is just an example with 10 rows, the full run will take about 3.5 hours to complete with using T4 GPU from colab.
body_ner_results_dict = {}
for index,sent in enumerate(merged_df['body'][:10]):
  try:
      sentence = Sentence(sent)
      tagger.predict(sentence)
      if index % 2 == 0:
          print(str(index)+ ' complted')

      org_lst = []
      org_score_lst = []

      for span in sentence.get_spans('ner'):
        if span.tag == 'ORG':
          org_lst.append(span.text)
          org_score_lst.append(span.score)
        else:
          org_lst.append('N/A')
          org_score_lst.append(0)

      max_index = org_score_lst.index(max(org_score_lst))
      org = org_lst[max_index]

      event_lst = []
      event_score_lst = []
      for span in sentence.get_spans('ner'):
        if span.tag == 'EVENT':
          event_lst.append(span.text)
          event_score_lst.append(span.score)
        else:
          event_lst.append('N/A')
          event_score_lst.append(0)

      max_index = event_score_lst.index(max(event_score_lst))
      event = event_lst[max_index]

      person_lst = []
      person_score_lst = []
      for span in sentence.get_spans('ner'):
        if span.tag == 'PERSON':
          person_lst.append(span.text)
          person_score_lst.append(span.score)
        else:
          person_lst.append('N/A')
          person_score_lst.append(0)

      max_index = person_score_lst.index(max(person_score_lst))
      person = person_lst[max_index]


      date_lst = []
      date_score_lst = []
      for span in sentence.get_spans('ner'):
        if span.tag == 'DATE':
          date_lst.append(span.text)
          date_score_lst.append(span.score)
        else:
          date_lst.append('N/A')
          date_score_lst.append(0)

      max_index = date_score_lst.index(max(date_score_lst))
      date = date_lst[max_index]


      loc_lst = []
      loc_score_lst = []
      for span in sentence.get_spans('ner'):
        if span.tag == 'LOC':
          loc_lst.append(span.text)
          loc_score_lst.append(span.score)
        else:
          loc_lst.append('N/A')
          loc_score_lst.append(0)

      max_index = loc_score_lst.index(max(loc_score_lst))
      loc = loc_lst[max_index]

      result = {'org':org,
                'event':event,
                'person':person,
                'date':date,
                'loc':loc
                }
      body_ner_results_dict[index] = result
  except Exception as e:
        # Handle the error here (you can print a message or do nothing)
        print(f"An error occurred for index {index}: {e}")
        continue
#'DATE' 'PERSON' 'LOC'

0 complted
2 complted
4 complted
6 complted
8 complted


In [8]:
ner_df = pd.DataFrame(body_ner_results_dict).T
ner_df.head()

Unnamed: 0,org,event,person,date,loc
0,Prime,,,Monday,
1,Samsung,the International CES,,Monday,
2,Apple,,Eddy Cue,December,
3,NASA,the BCS Championship,Kevin Ford,,
4,Pix & Flix,,,Monday,


In [9]:
#get section from pre_trined model
#Below is an example with 10 rows result
section = []
for i in merged_df['headline'][:10]:
  section.append(topic_pipe_v2(i)[0]['label'])

In [10]:
#get sentiment from headline and body using flair pre_trained model
headline_sent = merged_df['headline'][:10].apply(flair_prediction)
body_sent = merged_df['body'][:10].apply(flair_prediction)

In [11]:
#get top 3 key words from pre_trained model
key_words = []
start_time = time.time()
for index, i in enumerate(merged_df['headline'][:10]):
    if index % 2 == 0:
      print(str(index)+ ' complted')
    result = key_words_pipe(i)
    sorted_data = sorted(result, key=lambda x: x['score'], reverse=True)
    keep_words = [data['word'] for data in sorted_data[:3]]
    key_words.append(keep_words)
end_time = time.time()
execution_time = end_time - start_time
print(f"Loop ran for {execution_time:.5f} seconds")

0 complted
2 complted
4 complted
6 complted
8 complted
Loop ran for 0.93324 seconds


In [12]:
#get topic from pre_trained model
topic = []
for i in merged_df['headline'][:10]:
  topic.append(topic_pipe(i)[0]['label'])

In [13]:
#get topic from pre_trained model V2
topic_v2 = []
for i in merged_df['headline'][:10]:
  topic_v2.append(topic_pipe_v2(i)[0]['label'])

In [14]:
#get sentiment result from pre_trained model on hagging face
sent_v2 = []
for i in merged_df['headline'][:10]:
  sent_v2.append(sentiment_pipe(i)[0]['label'])


In [15]:
#pre-processing key_words:
key_words_df = pd.DataFrame(key_words)
new_column_names = {
    0: 'key_word_1',
    1: 'key_word_2',
    2: 'key_word_3'
}
key_words_df = key_words_df.rename(columns=new_column_names).fillna('na')

In [16]:
merged_df['topic'] = topic
merged_df['headline_sent'] = headline_sent
merged_df['body_sent'] = body_sent
merged_df['topic_v2'] = topic_v2
merged_df['sent_v2'] = sent_v2
merged_df = pd.merge(merged_df,ner_df,left_index=True, right_index=True, how='inner')
merged_df = pd.merge(merged_df,key_words_df, left_index=True, right_index=True)

In [17]:
res = merged_df.groupby('topic')['share_cat'].sum().sort_values(ascending=False)
merged_df = pd.merge(merged_df,res,left_on='topic', right_index=True, how='left').rename(columns={'share_cat_y':'topic_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)
res_1 = merged_df.groupby('org')['share_cat'].sum().sort_values(ascending=False)
merged_df = pd.merge(merged_df,res_1,left_on='org', right_index=True, how='left').rename(columns={'share_cat_y':'org_shares',
                                                                                                     'share_cat_x':'share_cat'}).fillna(0)
res_2 = merged_df.groupby('event')['share_cat'].sum().sort_values(ascending=False)
merged_df = pd.merge(merged_df,res_2,left_on='event', right_index=True, how='left').rename(columns={'share_cat_y':'event_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)
res_3 = merged_df.groupby('person')['share_cat'].sum().sort_values(ascending=False)
merged_df = pd.merge(merged_df,res_3,left_on='person', right_index=True, how='left').rename(columns={'share_cat_y':'person_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)
res_4 = merged_df.groupby('date')['share_cat'].sum().sort_values(ascending=False)
merged_df = pd.merge(merged_df,res_4,left_on='date', right_index=True, how='left').rename(columns={'share_cat_y':'date_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)
res_5 = merged_df.groupby('loc')['share_cat'].sum().sort_values(ascending=False)
merged_df = pd.merge(merged_df,res_5,left_on='loc', right_index=True, how='left').rename(columns={'share_cat_y':'loc_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)
res_6 = merged_df.groupby('topic_v2')['share_cat'].sum().sort_values(ascending=False)
merged_df = pd.merge(merged_df,res,left_on='topic_v2', right_index=True, how='left').rename(columns={'share_cat_y':'topic_v2_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)
res_7 = merged_df.groupby('key_word_1')['share_cat'].sum().sort_values(ascending=False)
res_8 = merged_df.groupby('key_word_2')['share_cat'].sum().sort_values(ascending=False)
res_9 = merged_df.groupby('key_word_3')['share_cat'].sum().sort_values(ascending=False)

key_words_shares_df = pd.merge(pd.merge(pd.merge(key_words_df,res_7,left_index=True, right_index=True, how='left')
,res_8,left_index=True, right_index=True, how='left')
,res_9,left_index=True, right_index=True, how='left').rename(columns={'share_cat':'kw_1_shares',
                                                                                 'share_cat_x':'kw_2_shares',
                                                                                 'share_cat_y':'kw_3_shares'}).fillna(0).drop(['key_word_1',
                                                                                                                                     'key_word_2',
                                                                                                                                     'key_word_3'], axis = 1)


In [18]:

merged_df = pd.merge(merged_df,res_7,left_on='key_word_1', right_index=True, how='left').rename(columns={'share_cat_y':'kw_1_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)
merged_df = pd.merge(merged_df,res_8,left_on='key_word_2', right_index=True, how='left').rename(columns={'share_cat_y':'kw_2_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)
merged_df = pd.merge(merged_df,res_9,left_on='key_word_3', right_index=True, how='left').rename(columns={'share_cat_y':'kw_3_shares',
                                                                                                      'share_cat_x':'share_cat'}).fillna(0)




In [19]:
#replace na with median shares
merged_df['kw_1_shares'].replace(merged_df['kw_1_shares'].max(), merged_df['kw_1_shares'].median(), inplace=True)
merged_df['kw_2_shares'].replace(merged_df['kw_2_shares'].max(), merged_df['kw_2_shares'].median(), inplace=True)
merged_df['kw_3_shares'].replace(merged_df['kw_3_shares'].max(), merged_df['kw_3_shares'].median(), inplace=True)
merged_df['org_shares'].replace(merged_df['org_shares'].max(), merged_df['org_shares'].median(), inplace=True)
merged_df['event_shares'].replace(merged_df['event_shares'].max(), merged_df['event_shares'].median(), inplace=True)
merged_df['person_shares'].replace(merged_df['person_shares'].max(), merged_df['person_shares'].median(), inplace=True)
merged_df['date_shares'].replace(merged_df['date_shares'].max(), merged_df['date_shares'].median(), inplace=True)
merged_df['loc_shares'].replace(merged_df['loc_shares'].max(), merged_df['loc_shares'].median(), inplace=True)
merged_df['topic_shares'].replace(merged_df['topic_shares'].max(), merged_df['topic_shares'].median(), inplace=True)
merged_df['topic_v2_shares'].replace(merged_df['topic_v2_shares'].max(), merged_df['topic_v2_shares'].median(), inplace=True)

In [21]:
merged_df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,topic_shares,org_shares,event_shares,person_shares,date_shares,loc_shares,topic_v2_shares,kw_1_shares,kw_2_shares,kw_3_shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0,0,1,1,0,1,0.0,0,0,0
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0,0,0,1,0,1,0.0,0,0,0
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0,0,1,0,0,1,0.0,0,0,0
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0,0,0,0,0,1,0.0,0,0,0
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0,0,1,1,0,1,0.0,0,0,0


In [20]:
#save processed df to google drive for modeling
#file_path = '/content/gdrive/MyDrive/W266/data/feature_eng_df.csv'
#merged_df.to_csv(file_path, index=False)