# Stop words training

In [26]:
import seaborn as sns
import pandas as pd
import autograd.numpy as np
from autograd import grad
import matplotlib.pyplot as plt
%matplotlib inline


In [28]:
# import data
train_data = pd.read_csv('Training.csv')

train_data.describe(include='all')

# creating y labels less than cutoff to be 0 and 1 otherwise
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)


In [29]:
import re

# Preprocessing data

GENERIC_STOP_WORDS = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while',
                          'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than', 'as']
STOP_WORDS_SUMMARY = []
STOP_WORDS_REVIEW_TEXT = []



def edit_summary(content):

    # step 1 - convert the text to only lower case
    content = content.lower()

    # step 2 - remove unwanted and unicode characters
    content = re.sub(
        r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", content)

    # step 3 - remove stop words
    stopwords = GENERIC_STOP_WORDS + STOP_WORDS_SUMMARY

    filtered_words = [
        word for word in content.split() if word not in stopwords]
    text = " ".join(filtered_words)

    return text


def edit_review_text(content):

    # step 1 - convert the text to only lower case
    content = content.lower()

    # step 2 - remove unwanted and unicode characters
    content = re.sub(
        r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", content)

    # step 3 - remove stop words
    stopwords = GENERIC_STOP_WORDS + STOP_WORDS_REVIEW_TEXT
    
    filtered_words = [
        word for word in content.split() if word not in stopwords]
    text = " ".join(filtered_words)

    return text

# create categories for vote label


def assign_vote_label(i):
    if i <= 2.0:
        return 'low'
    if i > 2.0 and i <= 10.0:
        return 'medium'
    if i > 10.0 and i <= 50.0:
        return 'good'
    if i > 50.0:
        return 'high'


# Stop words search

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

cutoff = 1

# summary
data = train_data["summary"].astype(str).apply(edit_summary)
vectorizer1 = TfidfVectorizer()
output1 = vectorizer1.fit_transform(data)
dict1 = vectorizer1.vocabulary_

# top 3000 words by frequency
top_terms1 = sorted(dict1.keys(), key=lambda x: x[1], reverse=True)[:3000]
print(top_terms1)
feature_names1 = vectorizer1.get_feature_names_out()
stop_words1 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms1:
    index = np.where(feature_names1 == word)[0]
    star_arr = np.where(output1.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words1.append(word)

print(stop_words1)


# review Text
data = train_data["reviewText"].astype(str).apply(edit_review_text)
vectorizer2 = TfidfVectorizer()
output2 = vectorizer2.fit_transform(data)
dict2 = vectorizer2.vocabulary_
# top 3000 words by frequency
top_terms2 = sorted(dict2.keys(), key=lambda x: x[1], reverse=True)[:3000]
feature_names2 = vectorizer2.get_feature_names_out()
stop_words2 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms2:
    index = np.where(feature_names2 == word)[0]
    star_arr = np.where(output2.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words2.append(word)

print(stop_words2)

# Results

# STOP_WORDS_SUMMARY = ['ozzy', 'fyi', 'hydrated', '2x', 'exposure', 'extremely', 'execution', 'expert', '3x', '2wd', 'swivel', 'dwarf', 'sweat', 'owner', 'twins', 'rv', 'everybody', 'uv', 'evil', 'overseas', 'overlooked', 'overwhelmed', 'evenly', 'evo', 'quits', 'bux', 'rust', 'quit', 'burnout', 'sunshade', 'suction', 'ruined', 'bubbled', 'suck', 'purchasing', 'suspect', 'functionally', 'kurt', 'sucked', 'funky', 'suggestion', 'vu', 'rub', 'turner', 'bubblegum', 'puke', 'public', 'funniest', 'buble', 'julian', 'hunter', 'outshines', 'submerged', 'bullet', 'punish', 'puts', 'butterfly', 'guaranteed', 'rum', 'mushy', 'purposely', 'rubbed', 'nutrition', 'august', 'bucks', 'cumbersome', 'suffer', 'pumps', 'funko', 'cutie', 'pups', 'stopped', 'atv', 'strut', 'steer', 'wth', 'starters', 'stravinsky', 'atleast', 'stinks', 'stage', 'stripped', 'styrofoam', 'starch', 'att', 'stylus', 'struggled', 'asian', 'isuzu', 'essence', 'es', 'asking', 'ashlee', '5star', 'os', '3s', 'broken', 'broke', 'trd', 'arctic', 'trucks', 'brightness', 'crappy', 'cracked', 'triple', 'transfer', 'prevents', 'fraction', 'trance', 'breath', 'written', 'orbital', 'proud', 'producing', 'freaking', 'dreadful', 'tripe', 'fred', 'grown', 'frustration', 'gross', 'fruits', 'grocery', 'artisan', 'fruitcake', 'preservatives', 'grainy', 'trump', 'crumbled', 'grains', 'pricing', 'mrs', 'draft', 'crash', 'privacy', 'arms', 'cracks', 'ar', 'erase', 'articulation', 'breyer', 'crayons', 'sponge', 'applications', 'appeared', 'tpms', 'spout', 'upon', 'specifications', 'spry', 'apricots', 'spots', 'spiced', '6p', 'springs', 'ipad3']
# STOP_WORDS_REVIEW_TEXT = ['ez', 'oza', 'azalea', 'ozawa', 'ozzmosis', 'typo', 'mysteriously', 'gy', 'tyler', 'oye', 'ryche', 'bytes', 'myung', 'wyclef', 'lydon', 'wynonna', 'nyquil', 'hypocrisy', 'dysfunctional', 'hydrated', 'xylichew', 'syrupi', 'symbols', '7yr', '9yr', '7yo', 'examine', 'ox', '3xl', 'axel', '5xl', 'expelled', 'execs', 'experiencewhat', 'experiece', 'excitment', 'exploited', 'experimentalism', 'existed', 'axemen', 'exacerbate', 'exiles', 'extremly', 'expendable', 'exhibit', 'exceptionthe', 'exits', 'exercised', 'extol', '7x', 'expenses', 'expectancy', 'experienceupdate', '9x9', 'expanders', 'lxl', 'excavator', 'awg', '5wt', 'pwc', 'tweaked', 'awareness', 'swifts', 'swearing', 'twinkie', 'swooping', 'awfull', 'awash', 'swells', '3way', 'sweetie', 'sweaters', 'awakening', 'awfulthe', 'awfuli', 'twinings', 'swallows', 'swollen', 'twining', 'twopart', 'ownership', 'swivels', 'sweatwicking', 'awayit', 'overtightened', 'avenues', 'evacuation', 'evacuate', 'overrated', 'pvc', 'evidenced', 'dvds', 'oversexed', 'evangelical', 'evaporate', 'overuse', 'overprocessed', 'averages', 'evolutionary', 'overdo', 'everclear', 'dvdif', 'evens', 'overpowered', 'avocados', 'everhard', 'aviator', 'evey', 'sucks', 'sucked', 'buick', 'buyer', 'bux', 'junkyard', 'purolator', 'duralast', 'surge', 'autos', 'turbine', 'runaround', 'purchasesthe', 'buggy', 'fuseholder', 'ruben', 'outupdatethe', 'puddles', 'suck', 'sucksthe', 'bulks', 'purchasethis']


['ez', 'ezgo', 'ozzy', 'ozawa', 'zzzzzzzzzzz', 'fzt', 'tzigane', 'aztec', 'zz', 'iz', 'ozzys', 'oz', 'azoom', 'synthetic', 'fyi', 'sylvania', 'type', 'eye', 'syrup', 'hype', 'cylinders', 'typical', 'system', 'eyes', 'ly', 'tyc', 'hybrid', 'sync', 'hyundai', 'hyunda', 'sycho', 'bye', 'syndrome', 'lyrics', 'dynamic', 'cynical', 'byrds', 'dying', 'symphony', 'lynch', 'dylan', 'kyriacos', 'mysterians', 'py', 'hyped', 'dylans', 'byron', 'synths', 'hypegood', 'ryan', 'mystery', 'symphonica', 'nyro', 'pyromania', 'hysteria', 'cycle', 'eyed', 'synthesizer', 'hymns', 'byootay', 'iyer', 'myrath', 'syncopated', 'dyes', 'hydrated', 'tylenol', 'syrups', 'dysrhythmia', 'nyquil', 'ay', 'byullo', 'xylitol', 'rye', 'myth', 'byer', 'typically', 'lysol', 'gym', 'nylon', 'mytouch', 'mycharge', 'symmetry', 'python', 'pyle', 'cylinder', 'eyepiece', '4yo', 'kydex', 'kylin', 'bypass', 'lycrastretchneoprene', 'tying', 'cyclists', 'gymboss', 'cycling', 'cycleops', 'bygone', 'hydration', '4yr', 'eyewear', '2year

In [31]:
cutoff = 2

# summary
data = train_data["summary"].astype(str).apply(edit_summary)
vectorizer1 = TfidfVectorizer()
output1 = vectorizer1.fit_transform(data)
dict1 = vectorizer1.vocabulary_

# top 3000 words by frequency
top_terms1 = sorted(dict1.keys(), key=lambda x: x[1], reverse=True)[:3000]
print(top_terms1)
feature_names1 = vectorizer1.get_feature_names_out()
stop_words1 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms1:
    index = np.where(feature_names1 == word)[0]
    star_arr = np.where(output1.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words1.append(word)

print(stop_words1)


# review Text
data = train_data["reviewText"].astype(str).apply(edit_review_text)
vectorizer2 = TfidfVectorizer()
output2 = vectorizer2.fit_transform(data)
dict2 = vectorizer2.vocabulary_
# top 3000 words by frequency
top_terms2 = sorted(dict2.keys(), key=lambda x: x[1], reverse=True)[:3000]
feature_names2 = vectorizer2.get_feature_names_out()
stop_words2 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms2:
    index = np.where(feature_names2 == word)[0]
    star_arr = np.where(output2.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words2.append(word)

print(stop_words2)

# Results


# STOP_WORDS_SUMMARY = ['ozzy', 'oz', 'azoom', 'fyi', 'hybrid', 'syndrome', 'hydrated', '4yo', 'expected', 'excited', '2x', 'exposure', 'except', 'extreme', 'example', 'away', '2wd', 'swivel', 'awd', 'twisted', 'sweater', 'tweezers', 'swimming', 'twins', 'ever', 'uv', 'evil', 'overhyped', 'overseas', 'overlooked', 'evenly', 'evo', 'avid', 'overpaid', 'custom', 'much', 'quits', 'hurts', 'cushion', 'quality', 'customer', 'bux', 'multiple', 'bulbs', 'turned', 'supposed', 'guess', 'sunshade', 'cups', 'cut', 'sub', 'purchases', 'dunno', 'bunch', 'suv', 'quickly', 'dumb', 'functionally', 'kurt', 'substantial', 'sucked', 'pulled', 'gutter', 'outside', 'suggestion', 'rub', 'juice', 'turner', 'fuss', 'public', 'funniest', 'business', 'burn', 'hunter', 'suffers', 'outshines', 'puppy', 'qualities', 'autotune', 'guitars', 'outs', 'sudden', 'suits', 'subpar', 'putty', 'punish', 'puts', 'butterfly', 'judge', 'bubbles', 'authentic', 'guaranteed', 'jug', 'rum', 'gummies', 'rubbed', 'sugarfree', 'nutrition', 'nutritious', 'quinoa', 'sufferers', 'purposes', 'justify', 'lumpy', 'putting', 'burr', 'ounces', 'button', 'turbo', 'suffer', 'supplies', 'hunting', 'surplus', 'funko', 'numbers', 'cutie', 'pups', 'duplicates', 'puppets', 'item', 'attachment', 'starting', 'stick', 'stiff', 'step', 'strands', 'stretches', 'storm', 'gt', 'stretched', 'itll', 'street', 'wth', 'items', 'stainless', 'starters', 'atleast', 'stage', 'standards', 'stp', 'strange', 'stripped', 'strawberries', 'att', 'bt', 'steal', 'stroller', 'statue', 'use', 'asian', 'uses', 'usually', 'isuzu', 'issues', 'tsx', 'essence', 'ashlee', '5star', 'os', '3s', 'order', 'original', 'tried', 'cruiser', 'products', 'brake', 'pressure', 'trouble', 'try', 'trd', 'writing', 'arctic', 'driver', 'trailer', 'brand', 'probably', 'wrench', 'bristles', 'wrap', 'grounding', 'drive', 'process', 'error', 'area', 'protect', 'pro', 'transparent', 'prevent', 'transfer', 'traction', 'prevents', 'orignal', 'fraction', 'trance', 'breath', 'written', 'orbital', 'proud', 'previous', 'freaking', 'tripe', 'grow', 'travels', 'problematic', 'brothers', 'grace', 'brilliance', 'projects', 'tribute', 'priest', 'drinking', 'brandy', 'pretzel', 'grown', 'orleans', 'irons', 'dropping', 'programs', 'frustration', 'yr', 'fruits', 'artisan', 'fruitcake', 'trump', 'crumbled', 'grains', 'grapefruit', 'pretzels', 'oregano', 'brownie', 'truffles', 'trade', 'pricing', 'traditionally', 'mrs', 'produce', 'draft', 'crash', 'droid', 'arms', 'armorsuit', 'tricky', 'grippy', 'prints', 'ar', 'army', 'armour', 'drawstring', 'props', 'erase', 'articulation', 'breyer', 'drone', 'grandsons', 'artwork', 'grandaughter', 'dresses', 'equipped', 'squares', 'updated', 'sponge', 'spray', 'applications', 'update', 'operate', 'open', 'spark', 'tpms', 'rpms', 'specifications', 'appropriate', 'sports', 'split', 'options', 'spiced', 'appealing', '6p', 'ipad3', 'speaker', 'operates', 'appears']
# STOP_WORDS_REVIEW_TEXT = ['ez', 'oza', 'ozzys', 'azalea', 'ozawa', 'ozzmosis', 'czech', 'izumi', 'typo', 'dye', 'fyi', 'eyourlife', 'sync', 'sympathetic', 'tyvek', 'tyc', 'eyeglasses', 'hyper', 'gy', 'synthetics', 'tying', 'synth', 'synths', 'cymbals', 'ryche', 'bytes', 'synthesizers', 'myung', 'wyclef', 'wynonna', 'lynyrd', 'hypocrisy', 'xy', 'lynne', 'mythical', 'cyndi', 'fye', 'cycling', 'dynasty', 'syrupy', 'ryders', 'systemic', 'byron', 'lynn', 'py', 'mystic', 'hynde', 'synthy', 'synchronization', 'dysfunctional', 'sydney', 'xylichew', 'synchronize', 'tylt', 'lysol', 'myselfi', 'symbols', '4yo', 'lycra', '7yr', '9yr', '7yo', 'tykes', 'ex', 'extremely', 'extreme', 'exact', 'explain', 'external', 'excursion', 'explained', 'exterior', 'ox', 'experiencing', 'exist', '2xl', 'existence', 'excessive', 'expert', 'expose', 'extendable', 'expectedwhich', 'expires', 'extending', 'expelled', 'expression', 'execs', 'experiencewhat', 'extinction', 'axe', 'excitment', 'exploited', 'executed', 'axemen', 'excerpt', 'mxpx', 'exiles', 'expendable', 'exhibit', 'exceptionthe', 'xx', 'exemplified', 'experiments', 'excite', 'exploits', 'expend', 'execrable', 'expansive', 'axes', 'exercised', 'explode', 'extol', '7x', 'example2', 'expectancy', 'extruded', 'expectedi', 'expat', 'expiry', 'experienceupdate', '9x9', 'expanders', '1x', '3x9', 'expectthe', 'expansions', 'two', 'away', 'twice', 'awg', 'twisting', 'tweezers', '4wd', 'switching', 'swivel', 'swear', '5w', 'twitchy', 'awkward', 'swirl', 'awning', 'twenty', 'gw', 'sweaty', 'awing', 'awe', 'iwe', 'pwc', 'swizz', 'bw', 'owes', 'tweaked', 'swings', 'swifts', 'swearing', 'twodisc', 'twinkie', 'swooping', 'awash', 'swells', 'awake', '3way', 'swine', 'sweetie', 'dwarfed', 'swoon', 'fw', 'tweedy', 'twothe', 'swords', 'awkwardly', 'twain', 'tweens', 'fword', 'swashbuckling', 'twominute', 'sweeps', 'awaken', 'sweats', 'twister', 'sweetener', 'sweeteners', 'swallows', 'twopart', 'awayit', 'twoyearold', 'even', 'dvd', 'ever', 'overtightened', 'everywhere', 'overheating', 'overkill', 'tv', 'evacuate', 'tvs', 'avoided', 'event', 'pvc', 'evidenced', 'oversexed', 'overdubbed', 'evangelical', 'evaporate', 'overuse', 'overprocessed', 'averages', 'evolutionary', 'overdo', 'everchanging', 'everclear', 'dvdif', 'avantgarde', 'overcompensates', 'avante', 'overbearing', 'oversell', 'everly', 'overproducing', 'oversees', 'evita', 'iverson', 'overated', 'evens', 'overstuffed', 'overcooked', 'overwhelms', 'lv', 'avocados', 'overpay', 'availablei', 'everhard', 'aviator', 'sview', 'evey', 'overfill', 'overpaid', 'guess', 'buy', 'buying', 'purchased', 'bump', 'dust', 'suddenly', 'suction', 'purchase', 'curved', 'auto', 'support', 'supposedly', 'turned', 'output', 'audio', 'pump', 'hurts', 'purchasing', 'nuts', 'bulbs', 'pulled', 'pushes', 'push', 'guys', 'buick', 'turn', 'fuses', 'huge', 'bux', 'rust', 'sufficiently', 'tubes', 'outlet', 'bubbles', 'bubble', 'suggests', 'outta', 'duralast', 'current', 'questions', 'turning', 'automotive', 'bunch', 'autos', 'suppose', 'purchasesthe', 'cure', 'rusty', 'multiple', 'bummer', 'luckily', 'pushed', 'fuseholder', 'rubicon', 'jumped', 'suzuki', 'quad', 'outupdatethe', 'guard', 'puddles', 'june', 'question', 'outright', 'bundled', 'sucksthe', 'bulks', 'purchasethis']



['ozzy', 'oz', 'azoom', 'fyi', 'hybrid', 'syndrome', 'hydrated', '4yo', 'expected', 'excited', '2x', 'exposure', 'except', 'extreme', 'example', 'away', '2wd', 'swivel', 'awd', 'twisted', 'sweater', 'tweezers', 'swimming', 'twins', 'ever', 'uv', 'evil', 'overhyped', 'overseas', 'overlooked', 'evenly', 'evo', 'avid', 'overpaid', 'custom', 'much', 'quits', 'hurts', 'cushion', 'quality', 'customer', 'bux', 'multiple', 'bulbs', 'turned', 'supposed', 'guess', 'sunshade', 'cups', 'cut', 'sub', 'purchases', 'dunno', 'bunch', 'suv', 'quickly', 'dumb', 'functionally', 'kurt', 'substantial', 'sucked', 'pulled', 'gutter', 'outside', 'suggestion', 'rub', 'juice', 'turner', 'fuss', 'public', 'funniest', 'business', 'burn', 'hunter', 'suffers', 'outshines', 'puppy', 'qualities', 'autotune', 'guitars', 'outs', 'sudden', 'suits', 'subpar', 'putty', 'punish', 'puts', 'butterfly', 'judge', 'bubbles', 'authentic', 'guaranteed', 'jug', 'rum', 'gummies', 'rubbed', 'sugarfree', 'nutrition', 'nutritious', 'q

In [32]:
cutoff = 3

# summary
data = train_data["summary"].astype(str).apply(edit_summary)
vectorizer1 = TfidfVectorizer()
output1 = vectorizer1.fit_transform(data)
dict1 = vectorizer1.vocabulary_

# top 3000 words by frequency
top_terms1 = sorted(dict1.keys(), key=lambda x: x[1], reverse=True)[:3000]
print(top_terms1)
feature_names1 = vectorizer1.get_feature_names_out()
stop_words1 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms1:
    index = np.where(feature_names1 == word)[0]
    star_arr = np.where(output1.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words1.append(word)

print(stop_words1)


# review Text
data = train_data["reviewText"].astype(str).apply(edit_review_text)
vectorizer2 = TfidfVectorizer()
output2 = vectorizer2.fit_transform(data)
dict2 = vectorizer2.vocabulary_
# top 3000 words by frequency
top_terms2 = sorted(dict2.keys(), key=lambda x: x[1], reverse=True)[:3000]
feature_names2 = vectorizer2.get_feature_names_out()
stop_words2 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms2:
    index = np.where(feature_names2 == word)[0]
    star_arr = np.where(output2.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words2.append(word)

print(stop_words2)

# Results

# STOP_WORDS_SUMMARY = ['ozzy', 'oz', 'azoom', 'fyi', 'eyes', 'hybrid', 'lyrics', 'hydrated', '4yo', 'explorer', '2x', 'exposure', 'xxl', 'extreme', 'example', 'excellence', '2wd', 'owners', 'aware', 'awd', 'tweaking', 'twist', 'awaited', 'owe', 'twisted', 'sweater', 'tweezers', 'swimming', 'uv', 'rvs', 'everything', 'every', 'dvd', 'overlooked', 'avid', 'custom', 'cushion', 'super', 'durable', 'bux', 'multiple', 'duty', 'cute', 'pump', 'sunshade', 'purchases', 'suited', 'bunch', 'suv', 'fuses', 'fusion', 'buick', 'audi', 'funny', 'current', 'quads', 'quick', 'bumper', 'guide', 'gutter', 'public', 'business', 'audio', 'punk', 'guitars', 'outs', 'turkey', 'humor', 'buffett', 'bueno', 'sunday', 'buzz', 'judge', 'cube', 'buddy', 'submarine', 'guaranteed', 'jug', 'rum', 'gummies', 'rubbed', 'sugarfree', 'nutritious', 'quinoa', 'sufferers', 'gummy', 'justify', 'quest', 'cupcakes', 'quickie', 'punching', 'supposedly', 'ounces', 'turbo', 'substantially', 'suffer', 'supplies', 'building', 'numbers', 'cutie', 'puppets', 'hunt', 'outfits', 'still', 'strands', 'gt', 'stretched', 'xtreme', 'stores', 'utilitarian', 'stainless', 'starters', 'atleast', 'stage', 'standards', 'strength', 'stp', 'steve', 'styles', 'stages', 'stratovarius', 'steep', 'stevia', 'strawberries', 'strawberry', 'htc', 'bt', 'steal', 'study', 'stroller', 'stove', 'statue', 'isuzu', 'user', 'tsx', 'usa', 'essence', 'ashlee', 'aspect', 'bs', '5star', 'usb', 'aside', 'product', 'around', 'crew', 'grand', 'trd', 'armor', 'arctic', 'driver', 'wrench', 'gray', 'pricey', 'protection', 'trim', 'fresh', 'drilling', 'true', 'ground', 'crossbar', 'wrx', 'practicality', 'transfer', 'traction', 'prevents', 'orignal', 'fraction', 'prius', 'groove', 'breath', 'breakfast', 'orbital', 'proud', 'freaking', 'art', 'travels', 'grace', 'brilliance', 'group', 'iron', 'drinking', 'brigade', 'organized', 'primus', 'groundbreaking', 'critical', 'orchestration', 'dressed', 'travelling', 'kryptonite', 'orleans', 'irons', 'dropping', 'programs', 'frustration', 'yr', 'crackers', 'artisan', 'fruitcake', 'crumbled', 'grains', 'grapefruit', 'creamy', 'pretzels', 'brownie', 'freshness', 'crispy', 'breakable', 'drinkers', 'crisp', 'produce', 'grill', 'pricy', 'draft', 'crash', 'branded', 'wrapped', 'droid', 'protectors', 'armorsuit', 'orientation', 'preference', 'frills', 'tray', 'grippy', 'prints', 'vr', 'practice', 'armour', 'drawstring', 'trek', 'trains', 'dragonfly', 'tracker', 'trike', 'dryer', 'breyer', 'princess', 'grandsons', 'dresses', 'crayon', 'equipped', 'sponge', 'applications', 'operate', 'iphone', 'spot', 'specifications', 'appropriate', 'sports', 'approach', 'appetite', 'spell', 'spins', 'spiced', 'spices', 'spanish', '6p', 'ipad3', 'operates', 'speeds', 'iphone4', '3pack']
# STOP_WORDS_REVIEW_TEXT = ['oza', 'azalea', 'ozawa', 'czech', 'gza', 'izumi', 'typical', 'types', 'eye', 'nylon', 'eyourlife', 'sympathetic', 'tyvek', 'tyc', 'gy', 'mynbspa', 'synthetics', 'tying', 'cycles', 'lyrical', 'symphonies', 'bytes', 'myth', 'myung', 'gypsys', 'wyclef', 'synthesizer', 'gym', 'lynyrd', 'mysterious', 'xy', 'lynne', 'cyndi', 'fye', 'dynasty', 'lynch', 'systemic', 'byron', 'syncing', 'lynn', 'mystic', 'nyc', 'wynton', 'eyez', 'synthpop', 'kylies', 'ky', 'kylie', 'myrath', 'lyles', 'lyle', 'cy', 'synergy', 'rye', 'synched', 'xylichew', 'cyanide', 'myselfoverall', 'synchronize', 'tylt', 'lysol', '4yo', 'kydex', 'lycra', '3yearold', '7yr', '9yr', '7yearold', 'tykes', '5yr', '5yo', '9yo', 'existing', 'extended', 'exists', 'exactly', '3x', 'extension', 'ox', 'exchange', 'expedition', 'excess', 'extendable', 'oxidation', 'extenders', 'gx470', 'expectedwhich', 'expires', 'exposure', 'extending', 'lx', 'exercises', 'exemplifies', 'experiencewhat', 'extinction', 'experimental', 'exercise', 'excitment', 'exploited', 'expendable', 'exhibit', 'exceptionthe', 'explicit', 'exemplified', 'exaggeration', 'execrable', 'expansive', 'expectationsi', 'examplethe', 'expressiveness', 'excessively', '7x', 'example2', 'expectedi', 'expat', 'expiry', 'experienceupdate', '1x', '3x9', 'axei', 'xxs', 'expander', 'expectthe', 'owning', 'aware', 'tweezers', 'twin', 'ww', '5w', 'awning', 'awing', 'swipe', 'iwe', 'swallow', 'switchi', 'pwc', 'tweaked', 'twinkie', 'swooping', 'awash', 'twang', 'sweeping', '3way', 'swine', 'sweetie', 'dwarfed', 'await', 'tweedy', 'twothe', 'twain', 'fword', 'swashbuckling', 'swamp', 'twominute', 'twists', 'sweeps', 'twentysomething', 'ewan', 'awaya', 'swims', 'swung', 'sweetit', 'sw', 'swallowed', 'swarovski', 'owls', 'awayit', 'swimmer', 'twoyearold', 'twoplayer', 'wwwthroatpunchgamescom', 'overall', 'rv', 'everything', 'overheating', 'oval', 'evacuate', 'tvs', 'everyday', 'eventhough', 'evap', 'wv', 'oversexed', 'everybodys', 'evaporate', 'overtones', 'averages', 'evolutionary', 'overdo', 'everchanging', 'overlooked', 'dvdif', 'overthetop', 'avante', 'oversell', 'everly', 'overproducing', 'evoke', 'evita', 'overjoyed', 'evolving', 'overcoming', 'av', 'evaluating', 'oversimplification', 'ivy', 'overthe', 'lv', 'overpay', 'overs', 'dv', 'ivory', 'avaiable', 'everhard', 'cvs', 'evo', 'aviator', 'sview', 'overfill', 'evaporation', 'everydayproduct', 'durable', 'custom', 'quart', 'pure', 'built', 'aux', 'tundra', 'quick', 'fuel', 'jumps', 'dual', 'super', 'hunting', 'survive', 'bulk', 'bux', 'outta', 'duralast', 'suv', 'suspension', 'autos', 'quarter', 'rundown', 'fun', 'mustang', 'fuseholder', 'rubicon', 'bug', 'cute', 'suzuki', 'bumps', 'suave', 'cushions', 'puddles', 'lunch', 'functionality', 'surrounding', 'bulks', 'purchasethis']


['ozzy', 'oz', 'azoom', 'fyi', 'eyes', 'hybrid', 'lyrics', 'hydrated', '4yo', 'explorer', '2x', 'exposure', 'xxl', 'extreme', 'example', 'excellence', '2wd', 'owners', 'aware', 'awd', 'tweaking', 'twist', 'awaited', 'owe', 'twisted', 'sweater', 'tweezers', 'swimming', 'uv', 'rvs', 'everything', 'every', 'dvd', 'overlooked', 'avid', 'custom', 'cushion', 'super', 'durable', 'bux', 'multiple', 'duty', 'cute', 'pump', 'sunshade', 'purchases', 'suited', 'bunch', 'suv', 'fuses', 'fusion', 'buick', 'audi', 'funny', 'current', 'quads', 'quick', 'bumper', 'guide', 'gutter', 'public', 'business', 'audio', 'punk', 'guitars', 'outs', 'turkey', 'humor', 'buffett', 'bueno', 'sunday', 'buzz', 'judge', 'cube', 'buddy', 'submarine', 'guaranteed', 'jug', 'rum', 'gummies', 'rubbed', 'sugarfree', 'nutritious', 'quinoa', 'sufferers', 'gummy', 'justify', 'quest', 'cupcakes', 'quickie', 'punching', 'supposedly', 'ounces', 'turbo', 'substantially', 'suffer', 'supplies', 'building', 'numbers', 'cutie', 'puppet

In [33]:
cutoff = 4

# summary
data = train_data["summary"].astype(str).apply(edit_summary)
vectorizer1 = TfidfVectorizer()
output1 = vectorizer1.fit_transform(data)
dict1 = vectorizer1.vocabulary_

# top 3000 words by frequency
top_terms1 = sorted(dict1.keys(), key=lambda x: x[1], reverse=True)[:3000]
print(top_terms1)
feature_names1 = vectorizer1.get_feature_names_out()
stop_words1 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms1:
    index = np.where(feature_names1 == word)[0]
    star_arr = np.where(output1.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words1.append(word)

print(stop_words1)


# review Text
data = train_data["reviewText"].astype(str).apply(edit_review_text)
vectorizer2 = TfidfVectorizer()
output2 = vectorizer2.fit_transform(data)
dict2 = vectorizer2.vocabulary_
# top 3000 words by frequency
top_terms2 = sorted(dict2.keys(), key=lambda x: x[1], reverse=True)[:3000]
feature_names2 = vectorizer2.get_feature_names_out()
stop_words2 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms2:
    index = np.where(feature_names2 == word)[0]
    star_arr = np.where(output2.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words2.append(word)

print(stop_words2)

# Results


# STOP_WORDS_SUMMARY = ['hyundai', 'eyed', '4yo', 'cycling', '2x', 'excellence', 'excelent', 'exquisite', '2wd', 'awsome', 'awhile', 'awaited', 'owe', 'twisted', 'swimming', 'rvs', 'overlooked', 'sunshade', 'purchases', 'subaru', 'buick', 'audi', 'quads', 'tuner', 'tune', 'gutter', 'junior', 'musical', 'guitars', 'funk', 'humor', 'bueno', 'sunday', 'buzz', 'dude', 'hurrah', 'guessing', 'junkie', 'jug', 'rum', 'rubbed', 'nutritious', 'muffins', 'turmeric', 'hunters', 'supposedly', 'ounces', 'turbo', 'suffer', 'supplies', 'cutie', 'puppets', 'cuddly', 'gt', 'utilitarian', 'attaches', 'stroke', 'starters', 'standards', '6th', 'stretching', 'stream', 'bt', 'stout', 'strategy', 'isuzu', 'tsx', 'usefull', 'usa', 'ass', 'bs', 'crew', 'trd', 'drilling', 'transmission', 'draw', 'brother', 'traction', 'prevents', 'orignal', 'fraction', 'prius', 'groove', 'proud', 'grace', 'brilliance', 'brigade', 'orchestration', 'treasure', 'previously', 'irons', 'franklin', 'programs', 'frustration', 'protein', 'artisan', 'crumbled', 'grains', 'grapefruit', 'drinkers', 'crisp', 'breads', 'brewing', 'creme', 'branded', 'wrapped', 'armorsuit', 'orientation', 'preference', 'tray', 'prints', 'vr', 'armour', 'dragonfly', 'dryer', 'breyer', 'dresses', 'promo', 'transformers', 'spread', 'appetite', 'ipod', 'spices', 'spanish', 'operates', 'speeds', 'iphone4']
# STOP_WORDS_REVIEW_TEXT = ['oza', 'rzr', 'azalea', 'czech', 'gza', 'eyourlife', 'tyc', 'ty', 'synthetics', 'dynasty', 'systemic', 'lynn', 'gymnastics', 'wynton', 'eyez', 'kylies', 'ky', 'myrath', 'lyles', 'synergy', 'ayreon', 'nyro', 'hypocritical', 'hybrids', 'sycamore', 'rythm', 'lyon', 'symmetry', 'hydrate', 'myselfoverall', 'synchronize', 'lysol', 'cycled', 'eyesight', '4yo', '5yo', '9yo', '6year', 'ox', 'expedition', 'gx470', 'fx4', 'lx', 'exemplifies', 'exemplified', 'exquisite', 'expectationsi', 'examplethe', 'explodes', 'exhort', '7x', 'example2', 'expectedi', 'expat', 'expiry', 'experiencei', '9x', '3x9', '5w', 'awning', 'sweetie', 'twain', 'fword', 'swashbuckling', 'twentysomething', 'ewan', 'awaya', 'twochannel', 'twitching', 'dwells', 'sweetit', 'swallowed', 'sweetens', 'owls', 'twoyearold', 'twoplayer', 'wwwthroatpunchgamescom', 'overcharging', 'tvs', 'eventhough', 'evap', 'aviation', 'overdrive', 'evaporate', 'everchanging', 'dvdif', 'evoking', 'events', 'overcoming', 'av', 'evergrowing', 'evergreen', 'overcomes', 'dv', 'cvs', 'aviator', 'overtheear', 'averagesized', 'everydayproduct', 'bux', 'duralast', 'lubrication', 'puddles', 'purchasethis']

['hyundai', 'eyed', '4yo', 'cycling', '2x', 'excellence', 'excelent', 'exquisite', '2wd', 'awsome', 'awhile', 'awaited', 'owe', 'twisted', 'swimming', 'rvs', 'overlooked', 'sunshade', 'purchases', 'subaru', 'buick', 'audi', 'quads', 'tuner', 'tune', 'gutter', 'junior', 'musical', 'guitars', 'funk', 'humor', 'bueno', 'sunday', 'buzz', 'dude', 'hurrah', 'guessing', 'junkie', 'jug', 'rum', 'rubbed', 'nutritious', 'muffins', 'turmeric', 'hunters', 'supposedly', 'ounces', 'turbo', 'suffer', 'supplies', 'cutie', 'puppets', 'cuddly', 'gt', 'utilitarian', 'attaches', 'stroke', 'starters', 'standards', '6th', 'stretching', 'stream', 'bt', 'stout', 'strategy', 'isuzu', 'tsx', 'usefull', 'usa', 'ass', 'bs', 'crew', 'trd', 'drilling', 'transmission', 'draw', 'brother', 'traction', 'prevents', 'orignal', 'fraction', 'prius', 'groove', 'proud', 'grace', 'brilliance', 'brigade', 'orchestration', 'treasure', 'previously', 'irons', 'franklin', 'programs', 'frustration', 'protein', 'artisan', 'crumbled'

In [None]:
# For multi classification


# summary
data = train_data["summary"].astype(str).apply(edit_summary)
vectorizer1 = TfidfVectorizer()
output1 = vectorizer1.fit_transform(data)
dict1 = vectorizer1.vocabulary_

# top 3000 words by frequency
top_terms1 = sorted(dict1.keys(), key=lambda x: x[1], reverse=True)[:3000]
print(top_terms1)
feature_names1 = vectorizer1.get_feature_names_out()
stop_words1 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms1:
    index = np.where(feature_names1 == word)[0]
    star_arr = np.where(output1.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words1.append(word)

print(stop_words1)


# review Text
data = train_data["reviewText"].astype(str).apply(edit_review_text)
vectorizer2 = TfidfVectorizer()
output2 = vectorizer2.fit_transform(data)
dict2 = vectorizer2.vocabulary_
# top 3000 words by frequency
top_terms2 = sorted(dict2.keys(), key=lambda x: x[1], reverse=True)[:3000]
feature_names2 = vectorizer2.get_feature_names_out()
stop_words2 = []
y = train_data['overall'].apply(lambda x: 0 if x <= cutoff else 1)

# checking each word if it is relevant to the cutoff
for word in top_terms2:
    index = np.where(feature_names2 == word)[0]
    star_arr = np.where(output2.todense()[:, index] != 0)[0]
    num_zeros = np.count_nonzero(np.array(y[star_arr] == 0))
    if (np.array(y[star_arr]).size == 0):
        continue
    ratio = num_zeros/np.array(y[star_arr]).size
    # if y=zeros/ total lies in the region of [0.45,0.55] we include that in special stop words
    if ratio >= 0.45 and ratio <= 0.55:
        stop_words2.append(word)

print(stop_words2)

# Results


# STOP_WORDS_SUMMARY = ['hyundai', 'eyed', '4yo', 'cycling', '2x', 'excellence', 'excelent', 'exquisite', '2wd', 'awsome', 'awhile', 'awaited', 'owe', 'twisted', 'swimming', 'rvs', 'overlooked', 'sunshade', 'purchases', 'subaru', 'buick', 'audi', 'quads', 'tuner', 'tune', 'gutter', 'junior', 'musical', 'guitars', 'funk', 'humor', 'bueno', 'sunday', 'buzz', 'dude', 'hurrah', 'guessing', 'junkie', 'jug', 'rum', 'rubbed', 'nutritious', 'muffins', 'turmeric', 'hunters', 'supposedly', 'ounces', 'turbo', 'suffer', 'supplies', 'cutie', 'puppets', 'cuddly', 'gt', 'utilitarian', 'attaches', 'stroke', 'starters', 'standards', '6th', 'stretching', 'stream', 'bt', 'stout', 'strategy', 'isuzu', 'tsx', 'usefull', 'usa', 'ass', 'bs', 'crew', 'trd', 'drilling', 'transmission', 'draw', 'brother', 'traction', 'prevents', 'orignal', 'fraction', 'prius', 'groove', 'proud', 'grace', 'brilliance', 'brigade', 'orchestration', 'treasure', 'previously', 'irons', 'franklin', 'programs', 'frustration', 'protein', 'artisan', 'crumbled', 'grains', 'grapefruit', 'drinkers', 'crisp', 'breads', 'brewing', 'creme', 'branded', 'wrapped', 'armorsuit', 'orientation', 'preference', 'tray', 'prints', 'vr', 'armour', 'dragonfly', 'dryer', 'breyer', 'dresses', 'promo', 'transformers', 'spread', 'appetite', 'ipod', 'spices', 'spanish', 'operates', 'speeds', 'iphone4']
# STOP_WORDS_REVIEW_TEXT = ['oza', 'rzr', 'azalea', 'czech', 'gza', 'eyourlife', 'tyc', 'ty', 'synthetics', 'dynasty', 'systemic', 'lynn', 'gymnastics', 'wynton', 'eyez', 'kylies', 'ky', 'myrath', 'lyles', 'synergy', 'ayreon', 'nyro', 'hypocritical', 'hybrids', 'sycamore', 'rythm', 'lyon', 'symmetry', 'hydrate', 'myselfoverall', 'synchronize', 'lysol', 'cycled', 'eyesight', '4yo', '5yo', '9yo', '6year', 'ox', 'expedition', 'gx470', 'fx4', 'lx', 'exemplifies', 'exemplified', 'exquisite', 'expectationsi', 'examplethe', 'explodes', 'exhort', '7x', 'example2', 'expectedi', 'expat', 'expiry', 'experiencei', '9x', '3x9', '5w', 'awning', 'sweetie', 'twain', 'fword', 'swashbuckling', 'twentysomething', 'ewan', 'awaya', 'twochannel', 'twitching', 'dwells', 'sweetit', 'swallowed', 'sweetens', 'owls', 'twoyearold', 'twoplayer', 'wwwthroatpunchgamescom', 'overcharging', 'tvs', 'eventhough', 'evap', 'aviation', 'overdrive', 'evaporate', 'everchanging', 'dvdif', 'evoking', 'events', 'overcoming', 'av', 'evergrowing', 'evergreen', 'overcomes', 'dv', 'cvs', 'aviator', 'overtheear', 'averagesized', 'everydayproduct', 'bux', 'duralast', 'lubrication', 'puddles', 'purchasethis']

['hyundai', 'eyed', '4yo', 'cycling', '2x', 'excellence', 'excelent', 'exquisite', '2wd', 'awsome', 'awhile', 'awaited', 'owe', 'twisted', 'swimming', 'rvs', 'overlooked', 'sunshade', 'purchases', 'subaru', 'buick', 'audi', 'quads', 'tuner', 'tune', 'gutter', 'junior', 'musical', 'guitars', 'funk', 'humor', 'bueno', 'sunday', 'buzz', 'dude', 'hurrah', 'guessing', 'junkie', 'jug', 'rum', 'rubbed', 'nutritious', 'muffins', 'turmeric', 'hunters', 'supposedly', 'ounces', 'turbo', 'suffer', 'supplies', 'cutie', 'puppets', 'cuddly', 'gt', 'utilitarian', 'attaches', 'stroke', 'starters', 'standards', '6th', 'stretching', 'stream', 'bt', 'stout', 'strategy', 'isuzu', 'tsx', 'usefull', 'usa', 'ass', 'bs', 'crew', 'trd', 'drilling', 'transmission', 'draw', 'brother', 'traction', 'prevents', 'orignal', 'fraction', 'prius', 'groove', 'proud', 'grace', 'brilliance', 'brigade', 'orchestration', 'treasure', 'previously', 'irons', 'franklin', 'programs', 'frustration', 'protein', 'artisan', 'crumbled'