# Text Feature Engineering

In [2]:
import re
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
# nltk.download('wordnet')

### Split the data in a training set to apply transformations

In [4]:
df = pd.read_csv('csv_files/9_1_all_books_df.csv')

In [5]:
# all the preprocessing is being done on the TRAIN set....

X_train, X_test, y_train, y_test = model_selection.train_test_split(df['description'], 
                                                                    df['best_seller'], 
                                                                    test_size = .2,
                                                                    random_state = 42)

In [6]:
# T_train on it's own is a series
type(X_train)
# X_train[1]

pandas.core.series.Series

In [88]:
len(X_train)

1098

In [87]:
X_train

array(['an exquisite memoir about how to live and love every day with death in the room from poet nina riggs mother of two young son and the direct descendant of ralph waldo emerson in the tradition of',
       "the vision want to be human and what's more human than family so he head back to the begin to the laboratory where ultron create him and mold him into a weapon the place where he first rebel against his give destiny and imagine that he could be more that he could be a man there he build them a wife virginia two teenage twin viv and vin they look",
       "an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis",
       ...,
       'a move story of love friendship grief heal and the magical

## Cleaning text
- punctuation
- lowercase
- lemmas

# one hot encoding categories
#### this will happen in another notebook
- genres
- format
- publisher
- month (most likely not)

# vectorize the description
- look at html format
- key words
- take out NYT reference

### cleaning html tags
- as well as exploring vectorizing in pandas.

In [7]:
# no_html = df.description[3].replace('<i>', '')
# no_html
df.description[1371]
# this is what we want to act on
# df.description.values

# use regrex to clean the html tags
# no_reg_html = re.sub('<.{1,9}>',' ', df.description[1372])
# no_reg_html
# vec_html = re.sub('<.{1,9}>',' ', df.description.values)

'<b>Nonpareil science writer David Quammen explains how recent discoveries in molecular biology can change our understanding of evolution and life’s history, with powerful implications for human health and even our own human nature. </b>'

#### function to take out the tags

In [8]:
def no_html(text):
    return re.sub('<.{1,9}>',' ', text)

#### function to remove years and numbers in general

In [9]:
def no_nums(text):
    return re.sub('\d+', ' ', text)

#### function to remove punc

In [10]:
def no_punc(text):
    return re.sub('\.|-|\(|\)|\"|,|\?', ' ', text)

In [11]:
X_train[2]

'Northern California, during the violent end of the 1960s. At the start of summer, a lonely and thoughtful teenager, Evie Boyd, sees a group of girls in the park, and is immediately caught by their freedom, their careless dress, their dangerous aura of abandon. Soon, Evie is in thrall to Suzanne, a mesmerizing older girl, and is drawn into the circle of a soon-to-be infamou'

In [12]:
no_punc(X_train[2])

'Northern California  during the violent end of the 1960s  At the start of summer  a lonely and thoughtful teenager  Evie Boyd  sees a group of girls in the park  and is immediately caught by their freedom  their careless dress  their dangerous aura of abandon  Soon  Evie is in thrall to Suzanne  a mesmerizing older girl  and is drawn into the circle of a soon to be infamou'

#### function to remove cases

In [13]:
def no_upper(text):
    return text.lower()

#### method 1 np.vectorize

In [14]:
no_html_v = np.vectorize(no_html)

In [15]:
# checking the function
no_html_v([X_train[1371],df.description[1372] ])

array([' Nonpareil science writer David Quammen explains how recent discoveries in molecular biology can change our understanding of evolution and life’s history, with powerful implications for human health and even our own human nature.  ',
       ' NAMED A BEST BOOK OF 2018 BY  THE NEW YORK TIMES  "Somehow Casey Gerald has pulled off the most urgently political, most deeply personal, and most engagingly spiritual statement of our time by just looking outside his window and inside himself. Extraordinary." - Marlon James "Staccato prose and peripatetic storytelling combine the cadences of the Bible with an urgency reminisc '],
      dtype='<U381')

In [16]:
X_train = no_html_v(X_train)

In [17]:
# some series magic... originally made the function for a dataframe
# X_train["no_html"] = no_html_v(X_train.values)

In [18]:
# X_train = X_train['no_html']

In [19]:
X_train

array(['An exquisite memoir about how to live--and love--every day with "death in the room," from poet Nina Riggs, mother of two young sons and the direct descendant of Ralph Waldo Emerson, in the tradition of ',
       "The Vision wants to be human, and what's more human than family? So he heads back to the beginning, to the laboratory where Ultron created him and molded him into a weapon. The place where he first rebelled against his given destiny and imagined that he could be more -that he could be a man. There, he builds them. A wife, Virginia. Two teenage twins, Viv and Vin. They look",
       ' "An intelligent explanation of the mechanisms that produced the crisis and the response to it...One of the great strengths of Tooze\'s book is to demonstrate the deeply intertwined nature of the European and American financial systems."   --The New York Times Book Review       From a prizewinning economic historian, an eye-opening reinterpretation of the 2008 economic crisis ( ',
       ..

#### apply numpy function

In [20]:
no_nums_v = np.vectorize(no_nums)

In [21]:
X_train = no_nums_v(X_train)

In [22]:
no_punc_v = np.vectorize(no_punc)

In [23]:
X_train = no_punc_v(X_train)

In [24]:
no_upper_v = np.vectorize(no_upper)

In [25]:
X_train = no_upper_v(X_train)

#### method 2 pandas apply, this doesn't not apply to X_train

In [26]:
# df['no_html_apply'] = df.description.apply(no_html)

# df.head()

### POS for lemmas

In [27]:
nltk.pos_tag(['5'])[0][1][0]
# ADJ = adjective
# NOUN = noun
# VERB = verb
# ADV = adverb

'C'

In [28]:
lemmatizer = WordNetLemmatizer()
# n = noun
# v = verb
# a = adjective
# r = adverb

In [29]:
def mapping_pos(word):
    if word.isdigit():
        return ' '
    else:
        tag = nltk.pos_tag([word])[0][1][0]
        if tag == 'J':
            return 'a'
        elif tag == 'V':
            return 'v'
        elif tag == 'R':
            return 'r'
        else:
            return 'n'

In [30]:
mapping_pos('5')

' '

In [31]:
lemmatizer.lemmatize('bigger', mapping_pos('bigger'))
lemmatizer.lemmatize('twin', mapping_pos('twin'))

'twin'

In [32]:
kids = ['girls', 'boys',' and']
kids_string = 'girls boys and'
size = ['bigger', 'biggest']
size_string = 'bigger biggest'
def lemmtize_it(sentences):
    # split the string of sentences
    sentence = sentences.split(' ')
    cleaned = [lemmatizer.lemmatize(i, mapping_pos(i)) for i in sentence if i is not '']
    return " ".join(cleaned)



In [33]:
lemmtize_it(size_string)

'big big'

In [34]:
lemmtize_it(X_train[2])
# X_train[0].split(' ')

"an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis"

In [35]:
X_train[0]

'an exquisite memoir about how to live  and love  every day with  death in the room   from poet nina riggs  mother of two young sons and the direct descendant of ralph waldo emerson  in the tradition of '

In [36]:
[lemmatizer.lemmatize(i, mapping_pos(i)) for i in kids]

['girl', 'boy', ' and']

In [37]:
# doing this processing before transforming with the count vectorizer
# class Nyt_LemmaTokenizer(object):
#     def __call__(self, sentences):
#         sentence = sentences.split(' ')
#         return [lemmatizer.lemmatize(i, mapping_pos(i)) for i in sentence]

In [38]:
# lemmtize_it(X_train[2])

In [39]:
X_train[0]

'an exquisite memoir about how to live  and love  every day with  death in the room   from poet nina riggs  mother of two young sons and the direct descendant of ralph waldo emerson  in the tradition of '

In [40]:
v_lemmtize_it =  np.vectorize(lemmtize_it)

In [41]:
X_train = v_lemmtize_it(X_train)

## Stop words

In [42]:
# this base of stop words is a frozen set
# text.ENGLISH_STOP_WORDS


In [43]:
nyt_stop_words = ['new', 'york', 'bestseller', 'besteller', 'bestselling']

In [44]:
stop_words = text.ENGLISH_STOP_WORDS.union(nyt_stop_words)

In [45]:
stop_words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

# Vectorizing the text

In [46]:
cv = CountVectorizer(stop_words=stop_words)

In [47]:
X_train

array(['an exquisite memoir about how to live and love every day with death in the room from poet nina riggs mother of two young son and the direct descendant of ralph waldo emerson in the tradition of',
       "the vision want to be human and what's more human than family so he head back to the begin to the laboratory where ultron create him and mold him into a weapon the place where he first rebel against his give destiny and imagine that he could be more that he could be a man there he build them a wife virginia two teenage twin viv and vin they look",
       "an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis",
       ...,
       'a move story of love friendship grief heal and the magical

In [48]:
v_X_train = cv.fit_transform(X_train)

In [49]:
# act on the CountVectorizer object to get feature names
# this is the first past, with numbers and no stop words
len(cv.get_feature_names())
# number of words 7162

5842

In [50]:
# second pass with stop words
len(cv.get_feature_names())
# number of words 6902

5842

In [51]:
# third pass, no numbers and stop words
print(len(cv.get_feature_names()))
# cv.get_feature_names()
# number of words 6799

5842


In [52]:
# fourth pass will use lemmas
print(len(cv.get_feature_names()))
# number of words 5847 

5842


In [53]:
# accompanied
lemmatizer.lemmatize('brilliantly', 'a')
# lemmtize_it('brilliantly')

'brilliantly'

In [54]:
# fourth pass will use lemmas
print(len(cv.get_feature_names()))

5842


In [55]:
# fourth pass will use lemmas
print(len(cv.get_feature_names()))
# 5847 

5842


In [56]:
cv.get_feature_names()

['aaron',
 'ab',
 'abandon',
 'abandonment',
 'abby',
 'abc',
 'abduct',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abou',
 'abraham',
 'abroad',
 'absconds',
 'absence',
 'absolutely',
 'absurd',
 'absurdity',
 'absurdly',
 'abuser',
 'academic',
 'academies',
 'academy',
 'accelerate',
 'accept',
 'acceptance',
 'accepted',
 'access',
 'accident',
 'acclaim',
 'accolade',
 'accompany',
 'accomplish',
 'accord',
 'account',
 'accustom',
 'ace',
 'acevedo',
 'achieve',
 'achievement',
 'acknowledge',
 'acne',
 'acquire',
 'act',
 'action',
 'activism',
 'activist',
 'actor',
 'actress',
 'actually',
 'acutely',
 'ad',
 'ada',
 'adam',
 'adapt',
 'adapts',
 'add',
 'addict',
 'addiction',
 'addictive',
 'address',
 'adhd',
 'adjei',
 'adjust',
 'administration',
 'admiral',
 'admire',
 'admit',
 'adolescence',
 'adolescent',
 'adorable',
 'adrienne',
 'adult',
 'adulthood',
 'adulting',
 'advance',
 'advanced',
 'advancement',
 'adventure',
 'adventurer',
 'adversity',
 'advice',
 '

In [57]:
# feature names before lemmas
len(cv.get_feature_names())
cv.get_feature_names()

['aaron',
 'ab',
 'abandon',
 'abandonment',
 'abby',
 'abc',
 'abduct',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abou',
 'abraham',
 'abroad',
 'absconds',
 'absence',
 'absolutely',
 'absurd',
 'absurdity',
 'absurdly',
 'abuser',
 'academic',
 'academies',
 'academy',
 'accelerate',
 'accept',
 'acceptance',
 'accepted',
 'access',
 'accident',
 'acclaim',
 'accolade',
 'accompany',
 'accomplish',
 'accord',
 'account',
 'accustom',
 'ace',
 'acevedo',
 'achieve',
 'achievement',
 'acknowledge',
 'acne',
 'acquire',
 'act',
 'action',
 'activism',
 'activist',
 'actor',
 'actress',
 'actually',
 'acutely',
 'ad',
 'ada',
 'adam',
 'adapt',
 'adapts',
 'add',
 'addict',
 'addiction',
 'addictive',
 'address',
 'adhd',
 'adjei',
 'adjust',
 'administration',
 'admiral',
 'admire',
 'admit',
 'adolescence',
 'adolescent',
 'adorable',
 'adrienne',
 'adult',
 'adulthood',
 'adulting',
 'advance',
 'advanced',
 'advancement',
 'adventure',
 'adventurer',
 'adversity',
 'advice',
 '

# testing out a sentence to vectorize

In [58]:
X_train[0:5]

array(['an exquisite memoir about how to live and love every day with death in the room from poet nina riggs mother of two young son and the direct descendant of ralph waldo emerson in the tradition of',
       "the vision want to be human and what's more human than family so he head back to the begin to the laboratory where ultron create him and mold him into a weapon the place where he first rebel against his give destiny and imagine that he could be more that he could be a man there he build them a wife virginia two teenage twin viv and vin they look",
       "an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis",
       'in the next installment of the new york time bestselling throne of gla

In [59]:
X_train[0:5]

array(['an exquisite memoir about how to live and love every day with death in the room from poet nina riggs mother of two young son and the direct descendant of ralph waldo emerson in the tradition of',
       "the vision want to be human and what's more human than family so he head back to the begin to the laboratory where ultron create him and mold him into a weapon the place where he first rebel against his give destiny and imagine that he could be more that he could be a man there he build them a wife virginia two teenage twin viv and vin they look",
       "an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis",
       'in the next installment of the new york time bestselling throne of gla

In [60]:
type(X_train[2])

numpy.str_

In [61]:
X_train[2]

"an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis"

In [62]:
X_train[2]

"an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis"

In [63]:
X_train[0]

'an exquisite memoir about how to live and love every day with death in the room from poet nina riggs mother of two young son and the direct descendant of ralph waldo emerson in the tradition of'

In [64]:
# this doesn't work... need to process charaters before
lemmtize_it(X_train[0])

'an exquisite memoir about how to live and love every day with death in the room from poet nina riggs mother of two young son and the direct descendant of ralph waldo emerson in the tradition of'

In [65]:
re.sub
# X_train[0].replace('.')

<function re.sub(pattern, repl, string, count=0, flags=0)>

## testing lemmas and POS

In [85]:
# accompanied
# lemmatizer.lemmatize('accessibility', wordnet.ADJ)

In [67]:

# print(wordnet.morphy("accessibility",wordnet.ADJ))

In [68]:
for ss in wn.synsets('accessibility'):
    print(ss.lemmas()[0].pertainyms())
    thingy = ss.lemmas()[0].pertainyms()

[]
[]


In [69]:

thingy

[]

In [70]:
for ss in wn.synsets('accessibility'):
    print(ss)#[0].pertainyms())
#     thingy = ss.lemmas()[0].pertainyms()
    print(ss.lemmas())
    thingq = ss.lemmas()[1]

Synset('handiness.n.02')
[Lemma('handiness.n.02.handiness'), Lemma('handiness.n.02.accessibility'), Lemma('handiness.n.02.availability'), Lemma('handiness.n.02.availableness')]
Synset('approachability.n.01')
[Lemma('approachability.n.01.approachability'), Lemma('approachability.n.01.accessibility')]


In [71]:
thingq

Lemma('approachability.n.01.accessibility')

In [72]:
ss.lemmas()

[Lemma('approachability.n.01.approachability'),
 Lemma('approachability.n.01.accessibility')]

In [73]:
wn.lemmas('accessibility')[0].derivationally_related_forms()[0].name()

'accessible'

In [74]:
# wn.lemmas('brilliantly', 'b')#.derivationally_related_forms()
lemmatizer.lemmatize('angrily', 'v')

'angrily'

## to dos

drop uncommon words

#### review endings of Adverbs
- ly

- visualize counts and train dataframe
- put processing in .py file (this will be in a new file)


In [75]:
wn.synsets('angrily')[0].lemmas()[0].pertainyms()[0].name()

'angry'

In [76]:
# for adverbs and maybe adjectives
# wn.synsets('absurdly')[0].lemmas()[0].pertainyms()[0].name()
wn.synsets('absolutely')[0].lemmas()[0].pertainyms()[0].name()
# works for 
# acutely adverb
# absurdly adjective
# absolutely abverb RB in Penn

# think about acknowledged

'absolute'

In [77]:
cv.get_feature_names()

['aaron',
 'ab',
 'abandon',
 'abandonment',
 'abby',
 'abc',
 'abduct',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abou',
 'abraham',
 'abroad',
 'absconds',
 'absence',
 'absolutely',
 'absurd',
 'absurdity',
 'absurdly',
 'abuser',
 'academic',
 'academies',
 'academy',
 'accelerate',
 'accept',
 'acceptance',
 'accepted',
 'access',
 'accident',
 'acclaim',
 'accolade',
 'accompany',
 'accomplish',
 'accord',
 'account',
 'accustom',
 'ace',
 'acevedo',
 'achieve',
 'achievement',
 'acknowledge',
 'acne',
 'acquire',
 'act',
 'action',
 'activism',
 'activist',
 'actor',
 'actress',
 'actually',
 'acutely',
 'ad',
 'ada',
 'adam',
 'adapt',
 'adapts',
 'add',
 'addict',
 'addiction',
 'addictive',
 'address',
 'adhd',
 'adjei',
 'adjust',
 'administration',
 'admiral',
 'admire',
 'admit',
 'adolescence',
 'adolescent',
 'adorable',
 'adrienne',
 'adult',
 'adulthood',
 'adulting',
 'advance',
 'advanced',
 'advancement',
 'adventure',
 'adventurer',
 'adversity',
 'advice',
 '

In [78]:
# wordnet tags
#  ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'

In [79]:
# do 
nltk.pos_tag(nltk.word_tokenize(' archeological'))

[('archeological', 'JJ')]

# Save the dataframe

In [80]:
print(v_X_train.shape)
type(v_X_train)

(1098, 5842)


scipy.sparse.csr.csr_matrix

In [81]:
X_df = pd.DataFrame(v_X_train.toarray(),columns=cv.get_feature_names())

In [82]:
X_df.head()

Unnamed: 0,aaron,ab,abandon,abandonment,abby,abc,abduct,abducted,ability,able,...,zelda,zest,zeus,zillion,zimmer,zoe,zone,zoom,zorie,zoroastrian
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
print(y_train.shape)
type(y_train)

(1098,)


pandas.core.series.Series

In [86]:
# X_df.to_csv('csv_files/X_train_nlp.csv', encoding='utf-8', index=False)
# y_train.to_csv('csv_files/y_train_nlp.csv', encoding='utf-8', index=False)