# Text preprocessing and vectorizing

### Loading dataset

In [3]:
import pandas as pd
poor = open("../dso-560-nlp-text-analytics-main/datasets/poor_amazon_toy_reviews.txt", encoding="mbcs").readlines()
good = open("../dso-560-nlp-text-analytics-main/datasets/good_amazon_toy_reviews.txt", encoding="mbcs").readlines()

In [5]:
good_reviews = list(map(lambda review: (review, 1), good))
poor_reviews = list(map(lambda review: (review, 0), poor))

In [3]:
all_reviews = good_reviews + poor_reviews
all_reviews_df = pd.DataFrame(all_reviews, columns=["review", "positive"])
all_reviews_df.head()

Unnamed: 0,review,positive
0,Excellent!!!\n,1
1,"""Great quality wooden track (better than some ...",1
2,my daughter loved it and i liked the price and...,1
3,Great item. Pictures pop thru and add detail a...,1
4,I was pleased with the product.\n,1


### Clean up incorrect incorrect decodings.

In [4]:
for i in range(len(all_reviews_df)):
    all_reviews_df.loc[i,'review'] = all_reviews_df.loc[i,'review'].lower()

In [5]:
all_reviews_df['review'].iloc[3]

'great item. pictures pop thru and add detail as &#34;painted.&#34;  pictures dry and it can be repainted.\n'

In [6]:
all_reviews_df["review"] = all_reviews_df["review"].str.findall(r'\b[^\s\d\W]+\b')

In [7]:
for i in range(len(all_reviews_df['review'])):
    all_reviews_df.loc[i,'review']=' '.join(all_reviews_df.loc[i,'review'])

In [8]:
all_reviews_df.head(10)

Unnamed: 0,review,positive
0,excellent,1
1,great quality wooden track better than some ot...,1
2,my daughter loved it and i liked the price and...,1
3,great item pictures pop thru and add detail as...,1
4,i was pleased with the product,1
5,children like it,1
6,really liked these they were a little larger t...,1
7,nice huge balloon had my local grocery store f...,1
8,great deal,1
9,awesome thanks,1


In [9]:
# replace \n and br
all_reviews_df["review"] =all_reviews_df["review"].str.replace('\n', '', case=False)
all_reviews_df["review"] =all_reviews_df["review"].str.replace('br', '', case=False)

In [10]:
all_reviews_df['review'].iloc[3]

'great item pictures pop thru and add detail as painted pictures dry and it can be repainted'

###   Normalize all references to recipients using regex

In [11]:
recipients = all_reviews_df["review"].str.findall(r'\bgift for (?:a|my) (\w+ \w+)\b') 

In [12]:
recipients = list(recipients)

In [13]:
flat_recipients = [item for sublist in recipients for item in sublist]

In [14]:
#potential list of words
from collections import Counter
Counter(flat_recipients).most_common(30)

[('year old', 176),
 ('friend s', 25),
 ('granddaughter she', 24),
 ('son he', 20),
 ('yr old', 19),
 ('little girl', 18),
 ('friend and', 18),
 ('friend who', 16),
 ('grandson he', 14),
 ('daughter s', 14),
 ('nephew and', 14),
 ('daughter she', 12),
 ('daughter who', 11),
 ('niece she', 11),
 ('grandson who', 11),
 ('son and', 10),
 ('nephew he', 10),
 ('daughter and', 10),
 ('grand daughter', 10),
 ('two year', 10),
 ('friend he', 9),
 ('one year', 9),
 ('granddaughter s', 9),
 ('little one', 8),
 ('nephew who', 8),
 ('grandson and', 8),
 ('little boy', 8),
 ('child or', 7),
 ('granddaughter and', 7),
 ('four year', 7)]

In [15]:
# replacing recipients
all_reviews_df["review"] = all_reviews_df["review"].str.replace(r'\b(sons?|kids?|year olds?|friends?|grandson|daughters?\
    |granddaughter|grandson|child|someone|niece|nephew|little girl|little boy|two year old|two year old|three year old|four years old)\b', '_Recipient_', case=False, regex=True)

In [16]:
# replace gift occasions
all_reviews_df["review"] = all_reviews_df["review"].str.replace(r'\bchristmas|xmas|birthdays|bday|anniversary|anniv\b', '_occasions_', case=False, regex=True)

### Vectorization

In [17]:
shapes = pd.DataFrame(columns = ['Vectoring Method', 'Shape'])

#create function to vertorize and store shapes
def vectorize(data, name, vectorizer):
    shape = vectorizer.fit_transform(data).shape
    shapes.loc[str(len(shapes)+1)] = [name, shape]
    return shapes

#### Count vectorizing with and without stopwords

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# count vectorizing review without removing stopwords

vectorizer = CountVectorizer()
vectorize(all_reviews_df.review, "count vect-ing without removing stopwords", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"


In [20]:
# vectorize review with removing stopwords

vectorizer = CountVectorizer(stop_words="english")
vectorize(all_reviews_df.review, "count vect-ing with removing stopwords", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"


####  Count vectorizing with 1) no stemming or lemmatization, 2) stemming, 3) lemmatization

In [21]:
import nltk
all_reviews_df['tokenized_review'] = all_reviews_df.apply(lambda row: nltk.word_tokenize(row['review']), axis=1)

In [22]:
all_reviews_df

Unnamed: 0,review,positive,tokenized_review
0,excellent,1,[excellent]
1,great quality wooden track better than some ot...,1,"[great, quality, wooden, track, better, than, ..."
2,my daughter loved it and i liked the price and...,1,"[my, daughter, loved, it, and, i, liked, the, ..."
3,great item pictures pop thru and add detail as...,1,"[great, item, pictures, pop, thru, and, add, d..."
4,i was pleased with the product,1,"[i, was, pleased, with, the, product]"
...,...,...,...
114912,it s a piece of junk doesn t charge multiple b...,0,"[it, s, a, piece, of, junk, doesn, t, charge, ..."
114913,really small,0,"[really, small]"
114914,it is contained in glass which is dangerous if...,0,"[it, is, contained, in, glass, which, is, dang..."
114915,fake not original every time my yr old _Recipi...,0,"[fake, not, original, every, time, my, yr, old..."


In [23]:
import numpy as np
all_reviews_df['tokenized_review1'] = np.NAN

In [28]:
all_reviews_df

Unnamed: 0,review,positive,tokenized_review,tokenized_review1
0,excellent,1,[excellent],excellent
1,great quality wooden track better than some ot...,1,"[great, quality, wooden, track, better, than, ...",great quality wooden track better than some ot...
2,my daughter loved it and i liked the price and...,1,"[my, daughter, loved, it, and, i, liked, the, ...",my daughter loved it and i liked the price and...
3,great item pictures pop thru and add detail as...,1,"[great, item, pictures, pop, thru, and, add, d...",great item pictures pop thru and add detail as...
4,i was pleased with the product,1,"[i, was, pleased, with, the, product]",i was pleased with the product
...,...,...,...,...
114912,it s a piece of junk doesn t charge multiple b...,0,"[it, s, a, piece, of, junk, doesn, t, charge, ...",it s a piece of junk doesn t charge multiple b...
114913,really small,0,"[really, small]",really small
114914,it is contained in glass which is dangerous if...,0,"[it, is, contained, in, glass, which, is, dang...",it is contained in glass which is dangerous if...
114915,fake not original every time my yr old _Recipi...,0,"[fake, not, original, every, time, my, yr, old...",fake not original every time my yr old _Recipi...


In [24]:
for i in range(len(all_reviews_df['tokenized_review'])):
    all_reviews_df.loc[i,'tokenized_review1']=' '.join(all_reviews_df.loc[i,'tokenized_review'])

In [25]:
# vectorize tokenized but without stemming and lemmatization
vectorizer = CountVectorizer()
vectorize(all_reviews_df.tokenized_review1, "count vect-ing tokenized without stemming and lemmatization", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"
3,count vect-ing tokenized without stemming and ...,"(114917, 37127)"


In [26]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [29]:
all_reviews_df['stemmed'] = all_reviews_df['tokenized_review'].apply(lambda x: [stemmer.stem(y) for y in x])

In [30]:
for i in range(len(all_reviews_df['stemmed'])):
    all_reviews_df.loc[i,'stemmed']=' '.join(all_reviews_df.loc[i,'stemmed'])

In [31]:
# vectorize with stemming
vectorizer = CountVectorizer()
vectorize(all_reviews_df.stemmed, "count vectorize with stemmingg", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"
3,count vect-ing tokenized without stemming and ...,"(114917, 37127)"
4,count vectorize with stemmingg,"(114917, 25350)"


In [32]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
all_reviews_df['Lemmatized'] = all_reviews_df['tokenized_review'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

In [33]:
for i in range(len(all_reviews_df['Lemmatized'])):
    all_reviews_df.loc[i,'Lemmatized']=' '.join(all_reviews_df.loc[i,'Lemmatized'])

In [34]:
# vectorize with lemmatization
vectorizer = CountVectorizer()
vectorize(all_reviews_df.Lemmatized, "count vectorize with lemmatization", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"
3,count vect-ing tokenized without stemming and ...,"(114917, 37127)"
4,count vectorize with stemmingg,"(114917, 25350)"
5,count vectorize with lemmatization,"(114917, 32898)"


#### TfIdfVectorizer versus CountVectorizer

In [35]:
# count vectorize review without removing stopwords

vectorizer = CountVectorizer()
vectorize(all_reviews_df.review, "count vect-ing", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"
3,count vect-ing tokenized without stemming and ...,"(114917, 37127)"
4,count vectorize with stemmingg,"(114917, 25350)"
5,count vectorize with lemmatization,"(114917, 32898)"
6,count vect-ing,"(114917, 37131)"


In [36]:
#  TfIdfVectorizer vectorize review without removing stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,1))
vectorize(all_reviews_df.review, "TfIdfVectorizer vectorize vect-ing", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"
3,count vect-ing tokenized without stemming and ...,"(114917, 37127)"
4,count vectorize with stemmingg,"(114917, 25350)"
5,count vectorize with lemmatization,"(114917, 32898)"
6,count vect-ing,"(114917, 37131)"
7,TfIdfVectorizer vectorize vect-ing,"(114917, 37131)"


#### ngram sizes of 1, 2, and 3

In [37]:
#  TfIdfVectorizer (1,1) vectorize review without removing stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,1))
vectorize(all_reviews_df.review, "TfIdfVectorizer vectorize vect-ing (1,1)", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"
3,count vect-ing tokenized without stemming and ...,"(114917, 37127)"
4,count vectorize with stemmingg,"(114917, 25350)"
5,count vectorize with lemmatization,"(114917, 32898)"
6,count vect-ing,"(114917, 37131)"
7,TfIdfVectorizer vectorize vect-ing,"(114917, 37131)"
8,"TfIdfVectorizer vectorize vect-ing (1,1)","(114917, 37131)"


In [38]:
#  TfIdfVectorizer (2,2) vectorize review without removing stopwords

vectorizer = TfidfVectorizer(ngram_range=(2,2))
vectorize(all_reviews_df.review, "TfIdfVectorizer vectorize vect-ing (2,2)", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"
3,count vect-ing tokenized without stemming and ...,"(114917, 37127)"
4,count vectorize with stemmingg,"(114917, 25350)"
5,count vectorize with lemmatization,"(114917, 32898)"
6,count vect-ing,"(114917, 37131)"
7,TfIdfVectorizer vectorize vect-ing,"(114917, 37131)"
8,"TfIdfVectorizer vectorize vect-ing (1,1)","(114917, 37131)"
9,"TfIdfVectorizer vectorize vect-ing (2,2)","(114917, 599590)"


In [39]:
#  TfIdfVectorizer (3,3) vectorize review without removing stopwords

vectorizer = TfidfVectorizer(ngram_range=(3,3))
vectorize(all_reviews_df.review, "TfIdfVectorizer vectorize vect-ing (3,3)", vectorizer)

Unnamed: 0,Vectoring Method,Shape
1,count vect-ing without removing stopwords,"(114917, 37131)"
2,count vect-ing with removing stopwords,"(114917, 36828)"
3,count vect-ing tokenized without stemming and ...,"(114917, 37127)"
4,count vectorize with stemmingg,"(114917, 25350)"
5,count vectorize with lemmatization,"(114917, 32898)"
6,count vect-ing,"(114917, 37131)"
7,TfIdfVectorizer vectorize vect-ing,"(114917, 37131)"
8,"TfIdfVectorizer vectorize vect-ing (1,1)","(114917, 37131)"
9,"TfIdfVectorizer vectorize vect-ing (2,2)","(114917, 599590)"
10,"TfIdfVectorizer vectorize vect-ing (3,3)","(114917, 1585098)"


## Extra

### Model with and without stopword

In [None]:
# create df to store models results
summary = pd.DataFrame(columns = ['Name', 'Accuracy', 'AUROC'])

In [None]:
# create function to run a model
def LogisticReg(name, variables, response):
    # import packages
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import roc_auc_score
    import numpy as np
    # Vectorize
    X = vectorizer.fit_transform(variables)
    y = response.values
    # run logistic regression
    lr = LogisticRegression()
    lr.fit(X, y)
    # calculate accuracy and aucroc
    y_pred = lr.predict(X)
    accuracy = np.mean(y_pred == y)
    from sklearn.metrics import confusion_matrix
    confusion_matrix(y, y_pred)
    auc = roc_auc_score(y, y_pred)
    summary.loc['model '+ str(len(summary)+1)] = [name, accuracy, auc]
    print(summary)

In [None]:
# vectorize review without removing stopwords
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1000)
# run a model
LogisticReg('with stopwords', all_reviews_df.review, all_reviews_df.positive)

In [None]:
# vectorize review without removing stopwords
vectorizer = CountVectorizer(stop_words="english", binary=True, min_df=2, max_features=1000)
# run a model
LogisticReg('without stopwords', all_reviews_df.review, all_reviews_df.positive)

### Models with  1) stemming 2) lemmatization

In [None]:
import nltk
all_reviews_df['tokenized_review'] = all_reviews_df.apply(lambda row: nltk.word_tokenize(row['review']), axis=1)

In [None]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [None]:
all_reviews_df['stemmed'] = all_reviews_df['tokenized_review'].apply(lambda x: [stemmer.stem(y) for y in x])

In [None]:
for i in range(len(all_reviews_df['stemmed'])):
    all_reviews_df.loc[i,'stemmed']=' '.join(all_reviews_df.loc[i,'stemmed'])

In [None]:
# Model with stemmed reviews
vectorizer = CountVectorizer(max_features=1000)
LogisticReg("Stemmed",all_reviews_df.stemmed, all_reviews_df.positive)

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
all_reviews_df['Lemmatized'] = all_reviews_df['tokenized_review'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

In [None]:
for i in range(len(all_reviews_df['Lemmatized'])):
    all_reviews_df.loc[i,'Lemmatized']=' '.join(all_reviews_df.loc[i,'Lemmatized'])

In [None]:
LogisticReg('Lemmatized', all_reviews_df.Lemmatized, all_reviews_df.positive)

### TfIdfVectorizer versus CountVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_df=0.4, max_features=1000)

In [None]:
LogisticReg('TF-IDF (1,1)', all_reviews_df.review, all_reviews_df.positive)

In [None]:
# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(2,2),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_df=0.4, max_features=1000)
# Run a model
LogisticReg('TF-IDF (2,2)', all_reviews_df.review, all_reviews_df.positive)

In [None]:
# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(3,3),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_df=0.4, max_features=1000)
# Run a model
LogisticReg('TF-IDF (3,3)', all_reviews_df.review, all_reviews_df.positive)