# Text Feature Engineering

In [106]:
import re
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.wordnet import WordNetLemmatizer

### Split the data into training and test sets before applying transformations

In [2]:
# Load the books table; its 'description' and 'best_seller' columns feed the train/test split below.
df = pd.read_csv('csv_files/9_1_all_books_df.csv')

In [52]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# Features are the raw description text, the target is the best_seller flag.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df['description'],
    df['best_seller'],
    test_size=0.2,
    random_state=42,
)

In [104]:
# X_train on its own is a Series right after the split (a single column was passed in).
# NOTE(review): the saved output says ndarray because this cell was last run
# after the cleaning cells further down had already rebound X_train to the
# result of the np.vectorize wrappers — re-run in order to confirm.
type(X_train)

numpy.ndarray

# one hot encoding categories
- genres
- format
- publisher
- month (most likely not)

# vectorize the description
- look at html format
- key words
- take out NYT reference

### cleaning html tags
- as well as exploring vectorizing in pandas.

In [25]:
# Inspect one raw description: this is the kind of text we want to act on.
# (All rows at once: df.description.values)
df.description[1371]

# Earlier single-row experiments, kept for reference only. The stray bare
# `no_html` expression that used to sit in this cell was removed: its
# assignment is commented out, so on a fresh kernel it raised NameError.
# no_html = df.description[3].replace('<i>', '')
#
# use a regex to clean the html tags
# no_reg_html = re.sub('<.{1,9}>',' ', df.description[1372])
# no_reg_html
#
# re.sub expects a single string, so it cannot be called on the whole array:
# vec_html = re.sub('<.{1,9}>',' ', df.description.values)

'<b>Nonpareil science writer David Quammen explains how recent discoveries in molecular biology can change our understanding of evolution and life’s history, with powerful implications for human health and even our own human nature. </b>'

#### function to take out the tags

In [57]:
def no_html(text):
    """Replace every HTML tag in ``text`` with a single space.

    A tag is ``<`` or ``</`` followed by a letter and then anything up to
    the next ``>`` that is not itself an angle bracket.  Compared with the
    previous ``<.{1,9}>`` pattern this:

    * keeps short text that sits between two tags (``<b>x</b>`` -> ``' x '``
      instead of swallowing the ``x``),
    * removes tags of any length, e.g. ``<a href="...">``,
    * leaves ordinary prose such as ``a < b and c > d`` untouched.

    Parameters
    ----------
    text : str
        A single description string.

    Returns
    -------
    str
        ``text`` with each tag replaced by one space (no whitespace
        collapsing is done here).
    """
    # [^<>]* stops at the first closing bracket, so one match can never span
    # two tags or the text between them.
    return re.sub(r'</?[A-Za-z][^<>]*>', ' ', text)

#### function to remove years and numbers in general

In [88]:
def no_nums(text):
    """Replace every run of digits in ``text`` with a single space.

    Used to drop years and other numbers before vectorizing so they do not
    become vocabulary entries.

    Parameters
    ----------
    text : str
        A single description string.

    Returns
    -------
    str
        ``text`` with each maximal digit run replaced by one space.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'\d+', ' ', text)

#### method 1 np.vectorize

In [58]:
# Wrap no_html so it can be called on a whole array/list of strings at once.
# np.vectorize is a convenience wrapper (it still loops in Python), not a speed-up.
no_html_v = np.vectorize(no_html)

In [59]:
# Spot-check the vectorized cleaner on two descriptions, looked up by index label.
no_html_v([X_train[1371], df.description[1372]])

array([' Nonpareil science writer David Quammen explains how recent discoveries in molecular biology can change our understanding of evolution and life’s history, with powerful implications for human health and even our own human nature.  ',
       ' NAMED A BEST BOOK OF 2018 BY  THE NEW YORK TIMES  "Somehow Casey Gerald has pulled off the most urgently political, most deeply personal, and most engagingly spiritual statement of our time by just looking outside his window and inside himself. Extraordinary." - Marlon James "Staccato prose and peripatetic storytelling combine the cadences of the Bible with an urgency reminisc '],
      dtype='<U381')

In [None]:
# some series magic... originally made the function for a dataframe
# Runs the vectorized tag-stripper over every training description and stores
# the resulting array under the label "no_html" on X_train itself; the next
# cell reads it back with X_train['no_html'].
# NOTE(review): assigning the cleaned array straight to a new name looks
# simpler than this round-trip through the Series — confirm and simplify.
X_train["no_html"] = no_html_v(X_train.values)

In [74]:
# Rebind X_train to whatever was stored under 'no_html' in the previous cell
# (the saved output of the next cell shows an array of cleaned strings).
# NOTE(review): not re-runnable on its own — after this line X_train is no
# longer the Series that holds the 'no_html' entry.
X_train = X_train['no_html']

In [75]:
# Display the training descriptions after tag removal.
X_train

array(['An exquisite memoir about how to live--and love--every day with "death in the room," from poet Nina Riggs, mother of two young sons and the direct descendant of Ralph Waldo Emerson, in the tradition of ',
       "The Vision wants to be human, and what's more human than family? So he heads back to the beginning, to the laboratory where Ultron created him and molded him into a weapon. The place where he first rebelled against his given destiny and imagined that he could be more -that he could be a man. There, he builds them. A wife, Virginia. Two teenage twins, Viv and Vin. They look",
       ' "An intelligent explanation of the mechanisms that produced the crisis and the response to it...One of the great strengths of Tooze\'s book is to demonstrate the deeply intertwined nature of the European and American financial systems."   --The New York Times Book Review       From a prizewinning economic historian, an eye-opening reinterpretation of the 2008 economic crisis ( ',
       ..

#### apply numpy function

In [91]:
# Wrap no_nums so it can be called on a whole array of strings at once.
no_nums_v = np.vectorize(no_nums)

In [97]:
# Strip digit runs from every training description; X_train is overwritten
# with the array returned by the vectorized wrapper.
X_train = no_nums_v(X_train)

#### method 2: pandas apply (run on the full `df`; this is not applied to X_train)

In [44]:
# Same cleaning done the pandas way: run no_html on each description and keep
# the result as a new column on the full frame (X_train is not touched here).
df['no_html_apply'] = df['description'].apply(no_html)

In [None]:
# Peek at the first rows, including the new no_html_apply column.
df.head()

## Cleaning text
- punctuation
- lowercase
- lemmas

In [98]:
# Bag-of-words vectorizer; stop_words='english' drops scikit-learn's built-in
# English stop-word list (shown in the "Stop words" section below).
cv = CountVectorizer(stop_words='english')

In [99]:
# Learn the vocabulary from the cleaned training text and build the
# document-term count matrix in one step.
v_X_train = cv.fit_transform(X_train)

In [84]:
# Vocabulary size learned by the fitted CountVectorizer.
# This is the first pass: numbers still in the text and (presumably) no
# stop-word filtering yet — the saved count below comes from that earlier run.
# len(cv.vocabulary_) is the number of features and works on every
# scikit-learn version; get_feature_names() was removed in 1.2.
len(cv.vocabulary_)

7162

In [87]:
# Second pass: same count after enabling the English stop-word filter.
# len(cv.vocabulary_) is the number of features and works on every
# scikit-learn version; get_feature_names() was removed in 1.2.
len(cv.vocabulary_)

6902

In [103]:
# Third pass: numbers stripped from the text and stop words filtered out.
# len(cv.vocabulary_) is the number of features and works on every
# scikit-learn version; get_feature_names() was removed in 1.2.
print(len(cv.vocabulary_))
# To list the terms themselves: sorted(cv.vocabulary_)

6799


In [108]:
# fourth pass will use lemmas


## Stop words

In [6]:
# scikit-learn's built-in English stop-word list. It is a frozenset, so it
# cannot be modified in place — build a new set from it to add custom words.
text.ENGLISH_STOP_WORDS


frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'