# Building a Preprocessing Pipe for NLP

In [3]:
%run ../src/import_libraries.py
%matplotlib inline

%run ../src/initial_preprocessing.py


-------------------------------------------------------------------------

Defining Target (dropping neutral reviews):
Original_number of rows: 568454 
Updated number of rows:  525814

-------------------------------------------------------------------------

Original columns:  Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'Updated_Score'],
      dtype='object')

Leaving 'Text','Summary' and 'Score' columns:

Selected columns:  Index(['Text', 'Summary', 'Score', 'Updated_Score'], dtype='object')

-------------------------------------------------------------------------

Total number or reviews: 525814
Total number of na (nulls): 25

Dropping 25 out of 525814 reviews


***
## Train/Test Split with Optional Undersampling


In [15]:
# Sample size. len(df) uses every review; substitute a smaller number
# (e.g. 20000) for quicker experiments.
n = len(df)

# Draw the rows with a fixed seed, then prepend each Summary to its Text
# (separated by one space) so that summary words also appear in the Text column.
df_sample = (
    df.sample(n=n, random_state=12)
      .assign(Text=lambda frame: frame['Summary'] + ' ' + frame['Text'])
)

# Features: the combined Text plus the original Summary. Target: Updated_Score.
X = df_sample.loc[:, ['Text', 'Summary']].copy()
y = df_sample.loc[:, 'Updated_Score']

# Hold out 10% of the sampled rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=1,
)

## Pre-processing functions 
***
To be used by vectorizers as a part of prep_pipe:

In [4]:
def preprocess_review(review, return_list=False):
    """Lower-case, tokenize, stem and filter a single review.

    Steps, in order: lower-case the text, split it with ``word_tokenize``,
    stem every token with the English Snowball stemmer, then keep only the
    stemmed tokens that are purely alphabetic and not in ``stop_words``.

    Parameters
    ----------
    review : str
        Raw review text.
    return_list : bool, default False
        If True return the kept tokens as a list; otherwise return them
        joined by single spaces.

    Returns
    -------
    list of str or str
        The kept, stemmed tokens (list) or their space-joined string.
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = SnowballStemmer('english')
    stemmed = [stemmer.stem(token) for token in word_tokenize(review.lower())]

    # NOTE(review): the stop-word check runs on the *stemmed* token, so a stop
    # word whose stem differs from its surface form would not be removed here.
    # Order kept as in the original -- confirm this is intended.
    kept = [tok for tok in stemmed if tok.isalpha() and tok not in stop_words]

    if return_list:
        return kept
    return " ".join(kept)

def preprocess_ngram_review(review):
    """Lower-case a review and delete every ASCII digit and punctuation character.

    Whitespace is left untouched, so removing a character can leave
    consecutive or trailing spaces in the result.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The lower-cased text without ``string.digits`` / ``string.punctuation``
        characters.
    """
    # Third argument of maketrans = characters to delete.
    strip_table = str.maketrans('', '', string.digits + string.punctuation)
    return review.lower().translate(strip_table)

## Custom sklearn classes to build New Features



In [5]:
## WordCounter class - creates a column counting number of words for each review

class WordCounter(BaseEstimator, TransformerMixin):
    """Transformer producing one column: the number of whitespace-separated
    tokens in each review.

    Nothing is learned in ``fit``. ``transform`` expects an object offering
    ``.apply`` and ``.values`` (e.g. a pandas Series of strings).
    """

    def __init__(self):
        """No parameters to configure."""

    def fit(self, data, y=0):
        """Do nothing and return ``self``; ``data`` and ``y`` are ignored."""
        return self

    def transform(self, data, y=0):
        """Return the per-review token counts as an array of shape (n_reviews, 1)."""
        token_counts = data.apply(self._n_tokens)
        return token_counts.values.reshape(-1, 1)

    @staticmethod
    def _n_tokens(review):
        """Number of pieces produced by ``review.split()``."""
        return len(review.split())
    
## StringCounter class - creates a column with the number of occurrences of a given string in each review, divided by the review's count of alphabetic characters

class StringCounter(BaseEstimator, TransformerMixin):
    """Transformer producing one column: occurrences of ``str_to_count`` in each
    review, divided by the review's number of alphabetic characters.

    Parameters
    ----------
    str_to_count : str
        Substring whose non-overlapping occurrences are counted via ``str.count``.
    """

    def __init__(self, str_to_count):
        self.str_to_count = str_to_count

    def fit(self, data, y=0):
        """Do nothing and return ``self``; ``data`` and ``y`` are ignored."""
        return self

    def transform(self, data, y=0):
        """Apply ``count_string`` to every review; result has shape (n_reviews, 1)."""
        ratios = data.apply(self.count_string)
        return ratios.values.reshape(-1, 1)

    def count_string(self, data):
        """Return occurrences of ``str_to_count`` per alphabetic character of ``data``.

        When ``data`` has no alphabetic characters the divisor falls back to 1,
        so the raw occurrence count is returned.
        """
        occurrences = data.count(self.str_to_count)
        n_alpha = np.sum([1 for char in data if char.isalpha()])
        # Guard against division by zero for reviews without any letters.
        divisor = 1 if n_alpha == 0 else n_alpha
        return occurrences / divisor
    
## CapitalCounter class - creates a column with the number of uppercase characters in each review, divided by the review's count of alphabetic characters

class CapitalCounter(BaseEstimator, TransformerMixin):
    """Transformer producing one column: uppercase characters in each review,
    divided by the review's number of alphabetic characters.

    Nothing is learned in ``fit``.
    """

    def __init__(self):
        """No parameters to configure."""

    def fit(self, data, y=0):
        """Do nothing and return ``self``; ``data`` and ``y`` are ignored."""
        return self

    def transform(self, data, y=0):
        """Apply ``count_capital`` to every review; result has shape (n_reviews, 1)."""
        ratios = data.apply(self.count_capital)
        return ratios.values.reshape(-1, 1)

    def count_capital(self, data):
        """Return the uppercase-character count of ``data`` per alphabetic character.

        When ``data`` has no alphabetic characters the divisor falls back to 1.
        """
        n_upper = np.sum([1 for char in data if char.isupper()])
        n_alpha = np.sum([1 for char in data if char.isalpha()])
        # Guard against division by zero for reviews without any letters.
        divisor = 1 if n_alpha == 0 else n_alpha
        return n_upper / divisor

## MisspellCounter class - creates a column with the share of purely alphabetic words in each review that are missing from the webster dictionary
    
class MisspellCounter(BaseEstimator, TransformerMixin):
    """Transformer producing one column: the fraction of purely alphabetic,
    whitespace-separated words in each review for which ``webster.get(word, 0)``
    equals 0 (i.e. no usable dictionary entry).

    Relies on the module-level ``webster`` mapping being defined before
    ``transform`` is called. Nothing is learned in ``fit``.
    """

    def __init__(self):
        """No parameters to configure."""

    def fit(self, data, y=0):
        """Do nothing and return ``self``; ``data`` and ``y`` are ignored."""
        return self

    def transform(self, data, y=0):
        """Apply ``count_misspell`` to every review; result has shape (n_reviews, 1)."""
        ratios = data.apply(self.count_misspell)
        return ratios.values.reshape(-1, 1)

    def count_misspell(self, data):
        """Return the share of alphabetic words of ``data`` not found in ``webster``.

        Words containing any non-alphabetic character (e.g. attached
        punctuation) are excluded from both numerator and denominator.
        When no alphabetic word exists the divisor falls back to 1.
        """
        # NOTE(review): the lookup uses each word exactly as written (case
        # included) -- confirm this matches how the webster keys are stored.
        alpha_words = [word for word in data.split() if word.isalpha()]
        n_unknown = np.sum([1 for word in alpha_words if webster.get(word, 0) == 0])
        n_words = np.sum([1 for _ in alpha_words])
        # Guard against division by zero for reviews without alphabetic words.
        divisor = 1 if n_words == 0 else n_words
        return n_unknown / divisor

## Loading webster dictionary and stop_words

In [11]:
# Load the webster dictionary (a JSON file) that MisspellCounter uses for
# word lookups. The encoding is given explicitly so the result does not depend
# on the platform's default text encoding.
with open('../data/supplementary/webster.json', encoding='utf-8') as data:
    webster = json.load(data)

# Words removed from NLTK's English stop-word list, i.e. words that stay
# available as features.
exceptions = ["against", "again", "should've", "should", 'because', 'few']

# Filter instead of pop(index(...)): a word that is absent from the installed
# NLTK list is then simply skipped rather than raising ValueError.
stop_words = [word for word in stopwords.words('english') if word not in exceptions]

## Building a Pre-processing Pipe

In [12]:
# Initializing sklearn classes.
# The preprocessors are the functions defined in this notebook
# (preprocess_review / preprocess_ngram_review); the previous names
# process_review / process_ngram_review are not defined anywhere in the
# notebook and fail with NameError on a fresh kernel.

text_vectorizer = CountVectorizer(preprocessor=preprocess_review, min_df=0.0004)
bi_vectorizer = CountVectorizer(preprocessor=preprocess_ngram_review, ngram_range=(2, 3), min_df=0.0006)
bi_summ_vectorizer = CountVectorizer(preprocessor=preprocess_ngram_review, ngram_range=(2, 4), min_df=0.0006)

word_counter = WordCounter()
quest_counter = StringCounter('?')
excl_counter = StringCounter('!')
misspell_counter = MisspellCounter()
capital_counter = CapitalCounter()

scaler = MaxAbsScaler()

# NOTE(review): quest_counter, excl_counter, capital_counter and text_vectorizer
# appear in both unions below. ColumnTransformer is documented to fit clones of
# its transformers, so they should not share fitted state -- confirm before
# using either FeatureUnion on its own.

# FeatureUnion to be used on Text column:

text_fu = FeatureUnion([
    ('word_counter', word_counter),
    ('capital_counter', capital_counter),
    ('quest_counter', quest_counter),
    ('excl_counter', excl_counter),
    ('text_vect', text_vectorizer),
    ('bi_text_vect', bi_vectorizer),
])

# FeatureUnion to be used on Summary column:

summ_fu = FeatureUnion([
    ('misspell_counter', misspell_counter),
    ('quest_counter', quest_counter),
    ('excl_counter', excl_counter),
    ('capital_counter', capital_counter),
    ('sum_vect', text_vectorizer),
    ('bi_summ_vect', bi_summ_vectorizer),
])

# ColumnTransformer to combine both FeatureUnions.
# A scalar column name ('Text', 'Summary') selects a single column for each union.

preprocessor = ColumnTransformer(transformers=[
    ('text_fu', text_fu, 'Text'),
    ('summ_fu', summ_fu, 'Summary'),
], remainder='passthrough')

# Prep_pipe to combine preprocessor and scaler:

prep_pipe = Pipeline([
    ('prep', preprocessor),
    ('scaler', scaler),
])

## Fitting and Transforming Data

In [13]:
# Time the whole preprocessing run (wall clock).
started = datetime.now()

# Fit the pipeline on the training rows only, then apply the fitted pipeline
# to the held-out rows.
prep_train_X = prep_pipe.fit_transform(X_train)
prep_test_X = prep_pipe.transform(X_test)

elapsed = datetime.now() - started
print(elapsed)

0:00:59.205139


In [14]:
# Shape of the transformed test matrix, displayed as the cell's output.
prep_test_X.shape

(2000, 28462)

## Summary:



* We have our pre-processing pipeline
* New features were carefully tested and selected
* Let's try different models (see Modeling.ipynb)