**Feature Engineering**

In [0]:
import pickle
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

In [2]:
path_df = "drive/My Drive/Colab Notebooks/article_dataset3.pickle"

with open(path_df, 'rb') as data:
    df = pickle.load(data)

df.head()

Unnamed: 0,Title,Article,Category,id,Article_length
0,title,It's more important than ever that organizatio...,Advertising\n,1,5055.0
1,title,Native advertising: The refreshing and unobtru...,Advertising\n,1,5324.0
2,title,There’s so much to learn with social media adv...,Advertising\n,1,14023.0
3,title,How big is your social media budget? I’ve hear...,Advertising\n,1,7562.0
4,title,Q: I've never really done much advertising for...,Advertising\n,1,2422.0


In [3]:
df.loc[1]['Article']

"Native advertising: The refreshing and unobtrusive form of advertising that syncs beautifully with web content and is receiving the attention of many publishers. The traditional advertising system has been dismantled by the heightened interest of the likes of adblockers. Therefore, an increasing number of publishers and advertisers are now using native ads on their websites and blogs to augment their existing ad revenues. The biggest thing that works for native ads is their ability to flow seamlessly within the blog/website content to the extent that it doesn’t really look like advertising, at all.\xa0Additionally, native ads enhance the appeal of clickability by making content more relatable and interesting. It seems that the increase of native ads in important publications, and on various niches, means that they are here to stay.  Here are eight native ad platforms that are creating a global buzz. Redirect.com is at the top of the list for a good reason. Allowing customers to both b

**Text Cleaning and Preparation**

In [0]:
df['Article_Parsed_1'] = df['Article'].str.replace("\r", " ")
df['Article_Parsed_1'] = df['Article_Parsed_1'].str.replace("\n", " ")
df['Article_Parsed_1'] = df['Article_Parsed_1'].str.replace("    ", " ")
df['Article_Parsed_1'] = df['Article_Parsed_1'].str.replace('"', '')

Downcase all text

In [0]:
df['Article_Parsed_2'] = df['Article_Parsed_1'].str.lower()

Remove punctuation signs

In [0]:
punctuation_signs = list("?:!.,;")
df['Article_Parsed_3'] = df['Article_Parsed_2']

for punct_sign in punctuation_signs:
    df['Article_Parsed_3'] = df['Article_Parsed_3'].str.replace(punct_sign, '')

Remove possessive pronouns

In [0]:
df['Article_Parsed_4'] = df['Article_Parsed_3'].str.replace("'s", "")

Applying Lemmatization

In [8]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
wordnet_lemmatizer = WordNetLemmatizer()

nrows = len(df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['Article_Parsed_4']
    text_words = str(text).split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)

In [0]:
df['Article_Parsed_5'] = lemmatized_text_list

Remove stop words

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
stop_words = list(stopwords.words('english'))

In [13]:
stop_words[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [0]:
df['Article_Parsed_6'] = df['Article_Parsed_5']

for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    df['Article_Parsed_6'] = df['Article_Parsed_6'].str.replace(regex_stopword, '')

In [15]:
df.head(1)

Unnamed: 0,Title,Article,Category,id,Article_length,Article_Parsed_1,Article_Parsed_2,Article_Parsed_3,Article_Parsed_4,Article_Parsed_5,Article_Parsed_6
0,title,It's more important than ever that organizatio...,Advertising\n,1,5055.0,It's more important than ever that organizatio...,it's more important than ever that organizatio...,it's more important than ever that organizatio...,it more important than ever that organizations...,it more important than ever that organizations...,important ever organizations harness powe...


In [0]:
list_columns = ["Title", "Article", "Category", "Article_Parsed_6"]
df = df[list_columns]

df = df.rename(columns={'Article_Parsed_6': 'Article_Parsed'})

In [17]:
df.head()

Unnamed: 0,Title,Article,Category,Article_Parsed
0,title,It's more important than ever that organizatio...,Advertising\n,important ever organizations harness powe...
1,title,Native advertising: The refreshing and unobtru...,Advertising\n,native advertise refresh unobtrusive form a...
2,title,There’s so much to learn with social media adv...,Advertising\n,’ much learn social media advertise ’ diff...
3,title,How big is your social media budget? I’ve hear...,Advertising\n,big social media budget ’ hear company sp...
4,title,Q: I've never really done much advertising for...,Advertising\n,q ' never really much advertise business ' ...


**Label Coding**

In [0]:
category_codes = {
    'Advertising\n': 0,
    'Entrepreneurship\n': 1,
    'Accounting\n': 2,
    'Audit\n': 3,
    'Banking\n': 4,
    'Corporate Law\n': 5,
    'Finance\n': 6,
    'Ecommerce\n': 7,
    'Ethics\n': 8,
    'Human Resource\n': 9,
    'Insurance\n': 10,
    'Investing\n': 11,
    'Logistics\n': 12,
    'Marketing\n': 13,
    'Negotiation\n': 14,
    'Real Estate\n': 15,
    'Sales\n': 16,
    'Startup\n': 17,
    'Technology\n': 18,
    'Trading\n': 19,
    'Writing\n': 20
}

In [0]:
df['Category_Code'] = df['Category']
df = df.replace({'Category_Code':category_codes})

In [20]:
df.head()

Unnamed: 0,Title,Article,Category,Article_Parsed,Category_Code
0,title,It's more important than ever that organizatio...,Advertising\n,important ever organizations harness powe...,0
1,title,Native advertising: The refreshing and unobtru...,Advertising\n,native advertise refresh unobtrusive form a...,0
2,title,There’s so much to learn with social media adv...,Advertising\n,’ much learn social media advertise ’ diff...,0
3,title,How big is your social media budget? I’ve hear...,Advertising\n,big social media budget ’ hear company sp...,0
4,title,Q: I've never really done much advertising for...,Advertising\n,q ' never really much advertise business ' ...,0


**Train-Test Split**

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df['Article_Parsed'], df['Category_Code'], test_size=0.15, random_state=8)

We'll use TF-IDF Vectors as features.

We have to define the different parameters:

ngram_range: We want to consider both unigrams and bigrams.
max_df: When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
min_df: When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
max_features: If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

In [0]:
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

In [23]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(1093, 300)
(194, 300)


In [24]:
for Product, category_id in sorted(category_codes.items()):
    features_chi2 = chi2(features_train, labels_train == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'Accounting
' category:
  . Most correlated unigrams:
. report
. cash
. tax
. software
. account
  . Most correlated bigrams:
. real estate
. social media

# 'Advertising
' category:
  . Most correlated unigrams:
. target
. audience
. ad
. ads
. advertise
  . Most correlated bigrams:
. real estate
. social media

# 'Audit
' category:
  . Most correlated unigrams:
. financial
. firm
. account
. tax
. audit
  . Most correlated bigrams:
. real estate
. social media

# 'Banking
' category:
  . Most correlated unigrams:
. digital
. financial
. loan
. credit
. bank
  . Most correlated bigrams:
. social media
. real estate

# 'Corporate Law
' category:
  . Most correlated unigrams:
. state
. contract
. corporate
. legal
. law
  . Most correlated bigrams:
. social media
. real estate

# 'Ecommerce
' category:
  . Most correlated unigrams:
. website
. purchase
. facebook
. products
. online
  . Most correlated bigrams:
. real estate
. social media

# 'Entrepreneurship
' category:
  . Most cor

**Save Files**

In [0]:
# X_train
with open('drive/My Drive/Colab Notebooks/X_train.pickle', 'wb') as output:
    pickle.dump(X_train, output)
    
# X_test    
with open('drive/My Drive/Colab Notebooks/X_test.pickle', 'wb') as output:
    pickle.dump(X_test, output)
    
# y_train
with open('drive/My Drive/Colab Notebooks/y_train.pickle', 'wb') as output:
    pickle.dump(y_train, output)
    
# y_test
with open('drive/My Drive/Colab Notebooks/y_test.pickle', 'wb') as output:
    pickle.dump(y_test, output)
    
# df
with open('drive/My Drive/Colab Notebooks/df.pickle', 'wb') as output:
    pickle.dump(df, output)
    
# features_train
with open('drive/My Drive/Colab Notebooks/features_train.pickle', 'wb') as output:
    pickle.dump(features_train, output)

# labels_train
with open('drive/My Drive/Colab Notebooks/labels_train.pickle', 'wb') as output:
    pickle.dump(labels_train, output)

# features_test
with open('drive/My Drive/Colab Notebooks/features_test.pickle', 'wb') as output:
    pickle.dump(features_test, output)

# labels_test
with open('drive/My Drive/Colab Notebooks/labels_test.pickle', 'wb') as output:
    pickle.dump(labels_test, output)
    
# TF-IDF object
with open('drive/My Drive/Colab Notebooks/tfidf.pickle', 'wb') as output:
    pickle.dump(tfidf, output)