In [1]:
import string
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.sentiment.util import mark_negation
from nltk.corpus import stopwords
from sklearn.preprocessing  import MinMaxScaler
from nltk.stem import PorterStemmer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:

def data_processing(csv_path='Womens Clothing E-Commerce Reviews.csv'):
    """
        Apply pre-processing techniques to the dataset.

        Steps, in order: drop the first two columns, merge 'Title' and
        'Review Text' into a single 'Review Info' column, blank-fill the
        categorical columns, min-max scale 'Age' and 'Positive Feedback Count',
        then lower-case, tokenize, negation-mark, stop-word filter,
        punctuation/number filter and stem the review tokens, and finally
        one-hot encode the categorical columns.

        Side effects: a CSV snapshot is written to the working directory after
        most steps (init.csv, token.csv, token_adj.csv, neg.csv, stop.csv,
        punctnum.csv, stem.csv, final.csv) and the final frame is printed.

        Params:
            csv_path (str) : path of the raw reviews CSV. Defaults to the file
                name this notebook has always read.

        Return:
            processed_data (DataFrame) : processed dataset; 'Review Info' holds
                a list of stemmed tokens per row
    """
    print("Starting Data Processing")
    categorical_columns = ['Division Name', 'Department Name', 'Class Name']

    data = pd.read_csv(csv_path)
    # remove ID columns (dropped by position: the first two columns of the file)
    data = data.drop(data.columns[[0, 1]], axis=1)

    # combine the title & review text columns to process them as one.
    # Both sides are blank-filled first, so the result has no missing cells.
    data['Review Info'] = data['Title'].fillna('') + ' ' + data['Review Text'].fillna('')

    # Fill missing categorical fields with an empty string
    for column in categorical_columns:
        data[column] = data[column].fillna('')

    # Normalization of continuous features (the scaler is re-fitted per column)
    mm = MinMaxScaler(feature_range=(0, 1))
    for column in ('Age', 'Positive Feedback Count'):
        # fit_transform returns an (n, 1) array; flatten it for a column assignment
        data[column] = mm.fit_transform(data[[column]]).ravel()

    # Make all words lower case & replace '-' with ' '
    # regex=False: '-' is a literal, independent of the pandas default
    data['Review Info'] = data['Review Info'].str.lower().str.replace('-', ' ', regex=False)
    data.to_csv('init.csv')

    # Tokenization - splits on whitespace & punctuation
    data['Review Info'] = data['Review Info'].apply(word_tokenize)
    data.to_csv('token.csv')

    # Remove tokens of 2 characters or fewer, but keep ending punctuation so
    # the negation step can see it.
    # NOTE: `in` on a str is a substring test, so '!.' and '.?' are kept too.
    ending_punctuation = '!.?'
    data['Review Info'] = data['Review Info'].apply(
        lambda tokens: [word for word in tokens if len(word) > 2 or word in ending_punctuation]
    )
    data.to_csv('token_adj.csv')

    # Negation
    data['Review Info'] = data['Review Info'].apply(mark_negation)
    data.to_csv('neg.csv')

    # Remove stop words
    # set our dictionary of stop words to be in english
    stop_words = set(stopwords.words("english"))

    def _is_stop_word(word):
        # negated tokens carry a "_NEG" marker; compare the part before the first "_"
        base = word.split("_")[0] if "_NEG" in word else word
        return base in stop_words

    data['Review Info'] = data['Review Info'].apply(
        lambda tokens: [word for word in tokens if not _is_stop_word(word)]
    )
    data.to_csv('stop.csv')

    # Remove punctuation & numbers
    # NOTE: `word not in punctuation` is a substring test against this string,
    # which is what lets multi-character tokens such as '...' or '``' be dropped.
    punctuation = string.punctuation + ".....``~~"
    # any token containing a quote, a comma or a digit is dropped
    quote_comma_or_digit = re.compile(r'[\"\',0-9]')
    data['Review Info'] = data['Review Info'].apply(
        lambda tokens: [
            word for word in tokens
            if word not in punctuation and not quote_comma_or_digit.search(word)
        ]
    )
    data.to_csv('punctnum.csv')

    # Stemming
    ps = PorterStemmer()
    data['Review Info'] = data['Review Info'].apply(lambda tokens: [ps.stem(word) for word in tokens])
    data.to_csv('stem.csv')

    # creates dummy variables for categorical features
    processed_data = pd.get_dummies(data, columns=categorical_columns)
    # remove dummy features that were blank cells; a column with no blank cells
    # produces no such dummy, so a missing label must not raise
    blank_dummies = [column + '_' for column in categorical_columns]
    processed_data = processed_data.drop(columns=blank_dummies, errors='ignore')
    print(processed_data)

    processed_data.to_csv('final.csv')

    return processed_data

In [3]:
def implement_models(x_train, y_train, x_test, y_test, ensemble=None):
    """
        Fit a voting ensemble on the training set and predict the testing set.

        When no ensemble is supplied, a soft-voting classifier is built from a
        multinomial naive bayes model and a random forest, weighted 0.6 / 0.4.

        Params:
            x_train (List) : training set descriptive features
            y_train (List) : training set target features
            x_test (List) : testing set descriptive features
            y_test (List) : testing set target features (not used here)
            ensemble : optional estimator to fit instead of the default one;
                its fit() must return the fitted estimator

        Return:
            ensemble_results : predictions made by the ensemble on x_test
    """
    print("Implementing models...")

    if ensemble is None:
        naive_bayes = MultinomialNB(alpha=1)
        forest = RandomForestClassifier(
            n_estimators=200,
            criterion='entropy',
            class_weight='balanced',
        )
        ensemble = VotingClassifier(
            estimators=[('nb', naive_bayes), ('rf', forest)],
            voting='soft',
            weights=[0.6, 0.4],
        )

    # fit on the training set, then predict the testing set
    fitted = ensemble.fit(x_train, y_train)
    return fitted.predict(x_test)

In [4]:

def evaluate_model(prediction, true):
    """
        Print the confusion matrix and the classification report of a model.

        Params:
            prediction : labels predicted by the model
            true : true labels for the same samples

        Return:
            None
    """
    print("Evaluating Model...")

    # Confusion matrix first, then the per-class report
    print(confusion_matrix(true, prediction))
    print(classification_report(true, prediction))

In [5]:

def select_extract_features(data):
    """
        Build the descriptive feature matrix and the target from processed data.

        The 5 non-text columns with the best chi-squared score against 'Rating'
        are kept, and the token lists in 'Review Info' are expanded into
        per-token count columns prefixed with 'review_'. The chi-squared scores,
        the selected values and the selected column names are printed.

        Params:
            data (DataFrame) : processed dataset; must contain 'Rating', 'Title',
                'Review Text' and 'Review Info' (a list of tokens per row)

        Return:
            new_descriptive (DataFrame) : selected columns + bag-of-words columns
            target (Series) : the 'Rating' column
    """
    # divide descriptive & target features
    descriptive = data.drop(['Rating', 'Title', 'Review Text', 'Review Info'], axis=1)
    target = data['Rating']

    # Feature selection
    selector = SelectKBest(score_func=chi2, k=5)
    features = selector.fit_transform(descriptive, target)
    feature_names = descriptive.columns[selector.get_support(indices=True)].tolist()

    # Summarize scores; the precision is scoped to these prints instead of
    # changing numpy's global print options for the rest of the session
    with np.printoptions(precision=3):
        print(selector.scores_)
        print(features)
    print(feature_names)

    # Text vectorization to get count of words - break out list of words into
    # individual features (like creating dummy variables). The rows are already
    # token lists, so the analyzer passes them through unchanged.
    vectorizer = CountVectorizer(analyzer=lambda x: x)
    counts = vectorizer.fit_transform(data['Review Info'])

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # Reuse data's index so the column-wise concat below lines rows up even
    # when data does not carry a default 0..n-1 index.
    # NOTE: toarray() materialises a dense (rows x vocabulary) matrix.
    bag_of_words = pd.DataFrame(
        counts.toarray(), columns=vocabulary, index=data.index
    ).add_prefix('review_')

    # concat the selected features with the bag-of-words features
    new_descriptive = pd.concat([descriptive[feature_names], bag_of_words], axis=1)

    return new_descriptive, target

In [8]:
def find_ideal_params(x_train, y_train, x_test, y_test):
    """
        Randomized hyperparameter search over a naive bayes + random forest
        voting ensemble. The best parameter combination is printed.

        Params:
            x_train (List) : training set descriptive features
            y_train (List) : training set target features
            x_test (List) : testing set descriptive features (not used here)
            y_test (List) : testing set target features (not used here)

        Return:
            results : the fitted RandomizedSearchCV object
    """
    print("Predicting model params...")

    # ensemble with nb & rf, all hyperparameters left for the search to set
    voters = [
        ('nb', MultinomialNB()),
        ('rf', RandomForestClassifier()),
    ]

    # candidate values, grouped by the estimator they belong to
    voting_space = {
        'voting': ['hard', 'soft'],
        'weights': [[0.5, 0.5], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3], [0.3, 0.7]],
    }
    naive_bayes_space = {
        'nb__alpha': [1, 0.1, 0.01, 0.001],
    }
    random_forest_space = {
        'rf__n_estimators': [50, 100, 150, 200],
        'rf__criterion': ['gini', 'entropy'],
        'rf__class_weight': ['balanced_subsample', 'balanced'],
    }
    search_space = {**voting_space, **naive_bayes_space, **random_forest_space}

    # finds the ensemble that gives the best cross validation score
    search = RandomizedSearchCV(
        VotingClassifier(estimators=voters),
        param_distributions=search_space,
    )
    fitted_search = search.fit(x_train, y_train)

    # print the parameters that give the best results
    print(fitted_search.best_params_)

    return fitted_search

In [9]:
# RUNS ALL FUNCTIONS INITIALIZED ABOVE
# to make things cleaner

# One seed for every seeded step in this cell, so the train/test partition and
# the oversampled training set are the same on every run.
# The hyperparameter search and the random forest built inside the functions
# above are not seeded, so scores can still vary between runs.
SEED = 42

# process data
data = data_processing()
# select optimal features
new_descriptive, target = select_extract_features(data)

# split data into training & testing sets ... 30% testing
x_train, x_test, y_train, y_test = train_test_split(
    new_descriptive, target, test_size=0.3, random_state=SEED
)

# imblearn package to oversample training set
# To avoid bias for specific ratings
print('Over sampling the data')
oversampler = RandomOverSampler(random_state=SEED)
x_oversample, y_oversample = oversampler.fit_resample(x_train, y_train)

# find hyperparameters for ensemble
search_results = find_ideal_params(x_oversample, y_oversample, x_test, y_test)
model = search_results.best_estimator_

# optional - feed model from hyperparameter search as an argument
ensemble_results = implement_models(x_oversample, y_oversample, x_test, y_test, model)

# pass model results to evaluate the performance of the ensemble
print("Evaluate the Model:")
evaluate_model(ensemble_results, y_test)

Starting Data Processing
            Age                                              Title  \
0      0.185185                                                NaN   
1      0.197531                                                NaN   
2      0.518519                            Some major design flaws   
3      0.395062                                   My favorite buy!   
4      0.358025                                   Flattering shirt   
...         ...                                                ...   
23481  0.197531                     Great dress for many occasions   
23482  0.370370                         Wish it was made of cotton   
23483  0.160494                              Cute, but see through   
23484  0.123457  Very cute dress, perfect for summer parties an...   
23485  0.419753                    Please make more like this one!   

                                             Review Text  Rating  \
0      Absolutely wonderful - silky and sexy and comf...       4  

KeyboardInterrupt: 