# This notebook contains user-defined transformers that prepare the reviews for NLP modelling. The following data-transformation classes are defined:
 - class for removing duplicated reviews
 - class for removing non-English reviews
 - class for text cleaning (user-defined symbols such as commas, periods, colons etc. are removed)
 - class for removing stop words (common English words which do not contribute to review sentiment, e.g. the, a, I, we, them etc.)
 - class for stemming (the process of reducing inflected (or sometimes derived) words to their word stem, e.g. the words "works", "worked" and "working" are all represented by the stem "work")
 - class for changing ratings (reviews can have five different ratings, whereas sentiment can be of three kinds)
 - class for connecting the text columns into a single column

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

import os
#lazily build the full path of every file found below the Kaggle input directory
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
#print one path per line (nothing is printed when the directory is missing or empty)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Loading the dataset

In [None]:
#read the eBay reviews CSV into a DataFrame (the path is relative to the notebook's working directory)
data = pd.read_csv('../input/ebay-reviews/ebay_reviews.csv')

In [None]:
import warnings
#ignore every UserWarning raised from here on
#NOTE(review): presumably meant to silence pandas/regex warnings emitted by the transformers below - confirm which ones
warnings.simplefilter(action='ignore', category=UserWarning)

### Transformers

In [None]:
class DuplicatesRemover(BaseEstimator, TransformerMixin):
    """
    Transformer that drops repeated rows and rows containing missing values.

    The first occurrence of every duplicated row is kept. The returned frame
    is re-indexed with consecutive integers starting at 0.
    """
    def fit(self, X, y=None):
        #stateless transformer - there is nothing to learn from the data
        return self

    def transform(self, X):
        #keep the first occurrence of each row, then discard rows with any missing value
        deduplicated = X.drop_duplicates().dropna()
        #replace the (now gappy) index with 0..n-1
        fresh_index = np.arange(len(deduplicated))
        return deduplicated.set_index(fresh_index)

In [None]:
class ForeignReviewsRemover(BaseEstimator, TransformerMixin):
    """
    Transformer to remove non-english reviews.
    All non-integer columns of a row are joined into one text; if no word from
    the common-english-words list occurs in that text (as a whole word,
    case-insensitively) the row is treated as non-english and dropped.

    see_dismissed - if True the joined text of every dismissed row is printed

    Attributes set by fit:
    words   - default word list plus the user_words passed to fit
    pattern - regular expression matching any of the words as a whole word
    """

    def __init__(self, see_dismissed=False):
        #built-in stopwords
        words = stopwords.words('english')
        #frequent sentiment-related english words that are added to the detection list
        additional_words = ['good','beautiful', 'great','best','well','work','working','excellent','price','nice','handy','bad','terrible','worse','worst','broken','no','useless', 'ok','yes','fine','ok','awesome','awful', 'low', 'high', 'cool']
        #very short tokens excluded from the detection list
        #NOTE(review): presumably because they also occur in non-english text - confirm
        words_to_remove={'i','me','a','d','o','y','s','t','don','ma'}
        words.extend(additional_words)
        #sorted so that the word list (and the pattern built from it) is identical on every run
        self._default_words = sorted(set(words).difference(words_to_remove))
        self.words = list(self._default_words)
        self.see_dismissed = see_dismissed

    def fit(self, X, y=None, user_words=None):
        """
        Build the detection pattern.

        user_words - optional iterable of additional english words to look for
        Returns self.
        """
        #local import: re is only needed here to escape the words
        import re

        #rebuild the list from the defaults so that repeated fit calls do not accumulate user words
        extra_words = [word for word in (user_words or []) if word]
        self.words = self._default_words + extra_words
        #one alternation wrapped in word boundaries: every word must match as a whole word,
        #and escaping keeps special characters in user words from being read as regex syntax
        alternatives = "|".join(re.escape(word) for word in self.words)
        self.pattern = r"\b(?:" + alternatives + r")\b"
        return self

    def transform(self, X):
        """
        Return a copy of X restricted to the rows recognised as english.
        """
        X2 = X.copy()

        #integer columns (e.g. the rating) carry no text and are left out of the check
        cols_to_join = [col for col in X2.columns if not pd.api.types.is_integer_dtype(X2[col])]
        connected = X2[cols_to_join].astype(str).agg(' '.join, axis=1)
        #using the pattern defined in fit
        is_english = connected.str.contains(self.pattern, case=False, regex=True).to_numpy(dtype=bool)

        if self.see_dismissed:
            for row in connected[~is_english]:
                print(row)
        return X2[is_english]

In [None]:
class TextCleaning(BaseEstimator, TransformerMixin):
    """
    Transformer to remove punctuation and multiple spaces from text and change uppercase letters to lowercase.

    pattern - regular expression describing the symbols to delete (default: ASCII punctuation)

    Integer columns are returned untouched; every other column is expected to hold strings.
    """
    def __init__(self, pattern="[!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]"):
        self.pattern = pattern
    
    def fit(self, X, y=None):
        #stateless transformer - nothing to learn
        return self

    def transform(self, X):
        X2 = X.copy()
        
        for col in X2.columns:
            if pd.api.types.is_integer_dtype(X2[col]): continue
            X2[col] = (
                X2[col]
                .str.replace(self.pattern, "", regex=True)
                #collapse whitespace AFTER deleting symbols, because e.g. "good , bad" leaves a double space behind
                .str.replace(r"\s\s+", " ", regex=True)
                .str.lower()
            )
        return X2

In [None]:
class StopWordsRemover(BaseEstimator, TransformerMixin):
    """
    Transformer to remove popular english words with some default exceptions. User can add his own words to keep.

    words_to_keep - stop words that must NOT be removed from the reviews

    Integer columns are returned untouched; every other column is converted to
    str and the stop words are dropped from it (comparison is case-insensitive,
    tokens are separated by a single space).
    """
    
    def __init__(self, words_to_keep=['few','not', 'off','all','any','not','no','very']):
        #stored unchanged so that get_params/clone/repr of the estimator work
        #(the default list is never mutated)
        self.words_to_keep = words_to_keep
        stop_words = set(stopwords.words('english'))    
        self.eng_words = stop_words.difference(set(words_to_keep))
         
    def fit(self, X, y=None):
        #stateless transformer - nothing to learn
        return self

    def _drop_stop_words(self, review):
        #rebuild the review from the tokens that are not stop words
        return " ".join(token for token in review.split(" ") if token.lower() not in self.eng_words)

    def transform(self, X):
        X2 = X.copy()
        
        for col in X2.columns:
            if pd.api.types.is_integer_dtype(X2[col]): continue
            #assign the whole column at once: element-wise chained assignment does not
            #write back under pandas Copy-on-Write and is very slow
            X2[col] = X2[col].astype(str).map(self._drop_stop_words)
        return X2

In [None]:
class Stemmer(BaseEstimator, TransformerMixin):
    """
    Transformer to stem words with the Porter stemmer.
    stem - if False the words are not stemmed and the input is returned as it is (for experimental reasons)

    Integer columns are returned untouched; every other column is converted to
    str and each space-separated token is replaced by its stem.
    """
    
    def __init__(self, stem=True):
        self.stemmer = nltk.PorterStemmer()
        self.stem = stem
    
    def fit(self, X, y=None):
        #stateless transformer - nothing to learn
        return self

    def _stem_text(self, review):
        #stem every token and glue the review back together
        return " ".join(self.stemmer.stem(token) for token in review.split(" "))

    def transform(self, X):
        if not self.stem:
            return X

        X2 = X.copy()
        for col in X2.columns:
            if pd.api.types.is_integer_dtype(X2[col]): continue
            #assign the whole column at once so the stemmed text is really written back
            X2[col] = X2[col].astype(str).map(self._stem_text)
        return X2

In [None]:
class Rating(BaseEstimator, TransformerMixin):
    """
    Transformer to change reviews ratings numbers
    scale - dictionary mapping every original rating to its new value
    labels_to_del - a list of reviews ratings to remove; rows whose 'rating' is in the list are dropped
                    before the scale is applied (the assumption is that only ratings with 3 stars will be removed)

    The input must contain a 'rating' column when labels_to_del is not empty.
    """
    def __init__(self, scale={1:1, 2:1, 3:0, 4:0, 5:0}, labels_to_del=[]):
        #the default containers are only read, never mutated
        self.scale = scale
        self.labels_to_del = labels_to_del
    
    def fit(self, X, y=None):
        if self.labels_to_del:
            #kept for inspection only; transform recomputes the mask for the frame it receives
            self.idx_to_del = X['rating'].isin(self.labels_to_del)
        return self

    def transform(self, X):    
        X2 = X.copy()
        if self.labels_to_del:
            #mask built from the frame being transformed, so it always lines up with its rows,
            #and every listed label is removed (not only the first one)
            X2 = X2[~X2['rating'].isin(self.labels_to_del)]
        #replace returns a new frame, so no in-place write on a filtered slice is needed
        return X2.replace(self.scale)

In [None]:
class Connector(BaseEstimator, TransformerMixin):
    """
    Transformer to connect columns in one column.

    The result has a 'review' column holding all non-integer columns joined
    with a single space (in column order) and, if the input has an integer
    column, a 'rating' column holding it. With several integer columns the
    last one wins.
    """
        
    def fit(self, X, y=None):
        #stateless transformer - nothing to learn
        return self

    def transform(self, X):
        X2 = X.copy()
        X3 = pd.DataFrame()
        cols_to_join = []
        for col in X2.columns:
            #is_integer_dtype recognises every integer width; comparing the dtype with the
            #builtin int depends on the platform's default integer size
            if pd.api.types.is_integer_dtype(X2[col]):
                X3['rating'] = X2[col]
            else:
                cols_to_join.append(col)
                
        X3['review'] = X2[cols_to_join].astype(str).agg(' '.join, axis=1)
        return X3

### How to use the user-defined transformers? We will combine them in a pipeline
### Note: the pipeline in the following cell is defined for the classic machine-learning approach (support vector machine, logistic regression etc.). Please keep in mind that for neural networks commas, dots and other symbols may influence model accuracy.

In [None]:
#preprocessing setup for the classic machine learning approach

#the columns handed to the pipeline
cols = ['review title', 'review content', 'rating']

#ordered (name, transformer) steps; each step receives the output of the previous one
preprocessor = Pipeline(steps=[
    #1. drop duplicated reviews
    ('DuplicateRemover', DuplicatesRemover()),
    #2. drop non-english reviews
    ('ForeignReviewsRemover', ForeignReviewsRemover()),
    #3. delete symbols; the default set lives in the transformer, a custom pattern can be passed instead
    ('TextCleaning', TextCleaning()),
    #4. delete popular english words
    ('StopWordsRemover', StopWordsRemover()),
    #5. stemming, switched off here (stem=False leaves the words as they are)
    ('Stemmer', Stemmer(stem=False)),
    #optional: join the text columns into one column (only needed for model training, not for data analysis)
    #('Connector', Connector()),
    #6. map the ratings: negative (1, 2) -> -1, neutral (3) -> 0, positive (4, 5) -> 1
    ('Rating', Rating(scale={1:-1, 2:-1, 3:0, 4:1, 5:1})),
    #7. cleaning can make different reviews identical, so duplicates are removed once more
    #   (a single run at the end would also work, but the earlier steps would then process more rows)
    ('DuplicateRemover2', DuplicatesRemover()),
])

In [None]:
#cleaned reviews are available after running this cell (note: it may take a while)
#fit_transform fits and applies every step in one pass; calling fit and then transform
#would push the data through all transformers twice
data_preprocessed = preprocessor.fit_transform(data[cols])

In [None]:
#show the first 20 rows of the preprocessed reviews
data_preprocessed.head(20)

### All comments and questions are welcome