In [1]:
import pandas as pd
import numpy as np

# word embedding
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 
from collections import defaultdict
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
stemmer = SnowballStemmer('english')

import nltk
nltk.download('wordnet')

# classify
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# evaluate
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import nltk
nltk.download('punkt')
import itertools

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chenstar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/chenstar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Word Embedding

In [2]:
df = pd.read_csv('sweatshirt_100words.csv') # 7000+ reviews
category = ['color', 'size', 'qualiti', 'comfi', 'price', 'materi']

In [3]:
def lemmatize_stemming(text):
    '''
    Words are lemmatized — words in third person are changed to first person 
    and verbs in past and future tenses are changed into present.
    Words are stemmed — words are reduced to their root form.
    '''
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [4]:
# tokenize
data = []
for text in range(df.shape[0]):
    s = df['reviewText'][text] 

    # Replaces escape character with space 
    f = s.replace("\n", " ") 
 

    # iterate through each sentence in the file 
    for i in sent_tokenize(f): 
        temp = [] 

        # tokenize the sentence into words 
        for j in word_tokenize(i):
            j = lemmatize_stemming(j)
            temp.append(j.lower()) 

        data.append(temp)

In [5]:
words = list(itertools.chain.from_iterable(data))
model1 = gensim.models.Word2Vec(data,min_count = 1,size = 100, window = 5) 

category = ['color', 'size', 'qualiti', 'comfi', 'price', 'materi']
similarity = defaultdict(list)
for i in category:
    for j in words:
        similarity[i].append(model1.similarity(i,j))
        
df_sim = pd.DataFrame(similarity)
df_sim.index = words

list_words = [] # including related words in each criteria
for i in category:
    words = df_sim[i].loc[df_sim[i]>0.6].index
    words = np.unique(words)
    list_words.append(words)

In [6]:
list_words = [['array', 'bright', 'color', 'colour', 'depict', 'desrib', 'od',
        'pictur', 'picture-', 'pricepoint', 'red', 'satur', 'select',
        'shade', 'show', 'vibrant'], 
        ['2x', '2xl', '3x', '3xl', '4x', '4xl', 'bigger', 'bite', 'decid',
        'did..', 'exchang', 'extra', 'l', 'larg', 'larger', 'loose-fit',
        'm', 'may', 'medium', 'might', 'normal', 'order',
        'price..although', 'run', 'should', 'sip', 'size', 'sizw',
        'smaller', 'somewher', 'suggest', 'surer', 'swap', 'them.i',
        'then', 'upmi', 'useal', 'usual', 'waaayyyyy', 'wil', 'x', 'xl',
        'xtra', 'xxl', 'xxxxl', 'youth', 'zip'], ['.size', '105lbs', '11-buck', '23.00', '4-star', 'afford', 'amaz',
        'asid', 'bargain', 'basic', 'beat', 'best', 'bethat', 'big/wid',
        'boot', 'bunchi', 'cheap', 'choic', 'clip', 'colour', 'companion',
        'contribut', 'damn', 'dat', 'deal', 'decent', 'diliveri',
        'disappoint', 'disgust', 'downgrad', 'durabl', 'econo', 'elig',
        'excel', 'exepcion', 'fabix', 'fabric', 'fabric..', 'fast', 'feel',
        'find..', 'fit-wis', 'fitment', 'fleec', 'gidan', 'glue', 'good',
        'good..', 'goof', 'headphon', 'heavyweight', 'hefti', 'high',
        'high-end', 'impres', 'kind', 'lighter', 'lightweight', 'low',
        'make', 'mater', 'materi', 'materiel', 'medium-weight', 'mic',
        'midweight', 'money', 'none', 'ok', 'ok.', 'otherwis', 'outstand',
        'overal', 'pace', 'pictur', 'pleasant', 'point', 'poor', 'price',
        'pricepoint', 'product', 'qualiti', 'qualliti', 'realiti',
        'reallti', 'reason', 'repent', 'reput', 'select', 'servic',
        'shade', 'shirtshirt', 'simpl', 'siz3', 'sound', 'space/a',
        'strength', 'sturdi', 'such', 'surviv', 'terrif', 'textur',
        'theyr', 'tprice', 'trash', 'triangl', 'unbeat', 'valu',
        'versatil', 'well-mad', 'worth'], ['.color', '.nice', '10-16', '105', '125lbs', '2996', '4-star',
        '511', '80/20', '88-90lbs', '8oz', 'abd', 'above~', 'acuratt',
        'afford', 'alright', 'amaz', 'anyway', 'asid', 'awesom', 'baggi',
        'basic', 'beefi', 'besid', 'boxi', 'breath', 'breathabl', 'bright',
        'broad-should', 'bulki', 'cardigan', 'cheap', 'chines', 'chocker',
        'class', 'collect', 'colorfast', 'comfi', 'comfort', 'comfti',
        'constuct', 'cotton/poli', 'cotton/polyest', 'cottoni', 'cozi',
        'crop', 'cudd', 'curvi', 'cus', 'cute', 'darn', 'diamet',
        'disproport', 'dumb', 'duper', 'durabl', 'duribl', 'easi', 'econo',
        'extrem', 'fabric', 'fair', 'feel', 'fitment', 'fleec', 'flimsi',
        'fluffi', 'fuzzi', 'gaudi', 'generous', 'gestur', 'gigant',
        'glove', 'good', 'great', 'greatest', 'heavi', 'heavy-blend',
        'heavyweight', 'heft', 'hell', 'hella', 'hoodie~it', 'hott',
        'howev', 'huge', 'ideal', 'impress', 'incred', 'inexpens', 'insid',
        'interior', 'kept', 'knit', 'light', 'light-weight',
        'light/heavyweight', 'lightweight', 'litt', 'lol', 'loos',
        'low-coast', 'materi', 'middleweight', 'midweight', 'mute', 'n',
        'neither', 'nice', 'nonetheless', 'notabl', 'off-whit', 'ok',
        'ok.', 'okay', 'otherwis', 'overal', 'overs', 'perfect', 'piti',
        'pleanti', 'pleasant', 'pleased-it', 'plush', 'pretti', 'provid',
        'puchas', 'quiet', 'quit', 'real', 'reali', 'realli', 'relat',
        'relax', 'remind', 'resili', 'robberi', 'roomi', 's+', 'sane',
        'shape', 'signatur', 'silki', 'simpl', 'simple/standard',
        'simplic', 'siz3', 'sloppi', 'small/petit', 'smart', 'smooth',
        'snugg', 'soft', 'solid', 'soo', 'sooo', 'soooooo', 'ssooo',
        'stark', 'stiff', 'stiffer', 'strong', 'stun', 'sturdi', 'stylish',
        'suffoc', 'sum', 'super', 'supper', 'surpric', 'surpris',
        'sweatshirt/hoodi', 'taylor', 'theyr', 'thick', 'thin', 'toasti',
        'tough', 'transpar', 'triangl', 'trim', 'ultra', 'unbeliev',
        'uncomfort', 'uneven', 'unwear', 'usabl', 'utter', 'veri', 'vivid',
        'warm', 'warm-but', 'warmth', 'washabl', 'wayyyi', 'weak',
        'wearabl', 'weight', 'weird', 'wel', 'well-mad', 'wellmad', 'whoa',
        'woild', 'wonder', 'worm', 'wow', 'yellow/gold'], 
        ['105lbs', '23.00', 'afford', 'asid', 'bargain', 'basic', 'beat',
        'best', 'bethat', 'boot', 'cheap', 'choic', 'contribut', 'cost',
        'dat', 'deal', 'decent', 'excel', 'fabric..', 'goof', 'headphon',
        'look', 'low', 'mic', 'money', 'outstand', 'overal', 'pay',
        'price', 'princ', 'product', 'qualiti', 'reason', 'repent',
        'reput', 'servic', 'surviv', 'thank', 'unbeat', 'valu', 'worth'], ['125lb', '1990s..', '4-star', '8oz', 'also', 'although', 'amaz',
        'asid', 'basic', 'besid', 'bethat', 'big/wid', 'breathabl',
        'brilliant', 'broad-should', 'buhh', 'bulgi', 'bulki', 'bunchi',
        'cheap', 'cheep', 'clip', 'comfi', 'comfort', 'cumbersom', 'darn',
        'decent', 'deeper', 'deriv', 'descript', 'diamet', 'disproport',
        'drawstr', 'duper', 'durabl', 'econo', 'either', 'elig', 'extrem',
        'fabric', 'fair', 'farmstead', 'featur', 'feel', 'fitment',
        'fleec', 'flimsi', 'fluffi', 'fond', 'freedom', 'fricken',
        'garment', 'generous', 'gether', 'gigant', 'glue', 'good',
        'greatest', 'headphon', 'heavier', 'heavy-blend', 'heavy/thick',
        'heavyweight', 'heft', 'hefti', 'hood', 'howev', 'indulg',
        'inferior', 'insid', 'inside..', 'irrit', 'is', 'itself', 'kind',
        'light-weight', 'lighter', 'lightweight', 'line', 'low', 'make',
        'materi', 'materiel', 'mic', 'middleweight', 'nice', 'none',
        'notabl', 'nowher', 'ok', 'ok.', 'okay', 'otherwis', 'outstand',
        'overal', 'pictur', 'pleasant', 'pleased-it', 'plush', 'point',
        'poor', 'pretti', 'proplem', 'protrud', 'puchas', 'puff',
        'qualiti', 'quit', 'roooooomi', 'scratchi', 'seem', 'shade',
        'shrinkabl', 'side', 'silki', 'simple/standard', 'siz3', 'soft',
        'space/a', 'staff', 'stark', 'stiff', 'strong', 'sturdi',
        'substanti', 'super', 'surpris', 'sweater.thin', 'terrif',
        'textur', 'theyr', 'thick', 'thicker', 'thin', 'thinner', 'though',
        'tough', 'triangl', 'ultra', 'usabl', 'valu', 'veri', 'w/same',
        'warm-but', 'warmer', 'warmth', 'weak', 'weight', 'well-mad',
        'which', 'wide', 'wild', 'writ']]

In [7]:
'''
result:
[array(['array', 'bright', 'color', 'colour', 'depict', 'desrib', 'od',
        'pictur', 'picture-', 'pricepoint', 'red', 'satur', 'select',
        'shade', 'show', 'vibrant'], dtype=object),
 array(['2x', '2xl', '3x', '3xl', '4x', '4xl', 'bigger', 'bite', 'decid',
        'did..', 'exchang', 'extra', 'l', 'larg', 'larger', 'loose-fit',
        'm', 'may', 'medium', 'might', 'normal', 'order',
        'price..although', 'run', 'should', 'sip', 'size', 'sizw',
        'smaller', 'somewher', 'suggest', 'surer', 'swap', 'them.i',
        'then', 'upmi', 'useal', 'usual', 'waaayyyyy', 'wil', 'x', 'xl',
        'xtra', 'xxl', 'xxxxl', 'youth', 'zip'], dtype=object),
 array(['.size', '105lbs', '11-buck', '23.00', '4-star', 'afford', 'amaz',
        'asid', 'bargain', 'basic', 'beat', 'best', 'bethat', 'big/wid',
        'boot', 'bunchi', 'cheap', 'choic', 'clip', 'colour', 'companion',
        'contribut', 'damn', 'dat', 'deal', 'decent', 'diliveri',
        'disappoint', 'disgust', 'downgrad', 'durabl', 'econo', 'elig',
        'excel', 'exepcion', 'fabix', 'fabric', 'fabric..', 'fast', 'feel',
        'find..', 'fit-wis', 'fitment', 'fleec', 'gidan', 'glue', 'good',
        'good..', 'goof', 'headphon', 'heavyweight', 'hefti', 'high',
        'high-end', 'impres', 'kind', 'lighter', 'lightweight', 'low',
        'make', 'mater', 'materi', 'materiel', 'medium-weight', 'mic',
        'midweight', 'money', 'none', 'ok', 'ok.', 'otherwis', 'outstand',
        'overal', 'pace', 'pictur', 'pleasant', 'point', 'poor', 'price',
        'pricepoint', 'product', 'qualiti', 'qualliti', 'realiti',
        'reallti', 'reason', 'repent', 'reput', 'select', 'servic',
        'shade', 'shirtshirt', 'simpl', 'siz3', 'sound', 'space/a',
        'strength', 'sturdi', 'such', 'surviv', 'terrif', 'textur',
        'theyr', 'tprice', 'trash', 'triangl', 'unbeat', 'valu',
        'versatil', 'well-mad', 'worth'], dtype=object),
 array(['.color', '.nice', '10-16', '105', '125lbs', '2996', '4-star',
        '511', '80/20', '88-90lbs', '8oz', 'abd', 'above~', 'acuratt',
        'afford', 'alright', 'amaz', 'anyway', 'asid', 'awesom', 'baggi',
        'basic', 'beefi', 'besid', 'boxi', 'breath', 'breathabl', 'bright',
        'broad-should', 'bulki', 'cardigan', 'cheap', 'chines', 'chocker',
        'class', 'collect', 'colorfast', 'comfi', 'comfort', 'comfti',
        'constuct', 'cotton/poli', 'cotton/polyest', 'cottoni', 'cozi',
        'crop', 'cudd', 'curvi', 'cus', 'cute', 'darn', 'diamet',
        'disproport', 'dumb', 'duper', 'durabl', 'duribl', 'easi', 'econo',
        'extrem', 'fabric', 'fair', 'feel', 'fitment', 'fleec', 'flimsi',
        'fluffi', 'fuzzi', 'gaudi', 'generous', 'gestur', 'gigant',
        'glove', 'good', 'great', 'greatest', 'heavi', 'heavy-blend',
        'heavyweight', 'heft', 'hell', 'hella', 'hoodie~it', 'hott',
        'howev', 'huge', 'ideal', 'impress', 'incred', 'inexpens', 'insid',
        'interior', 'kept', 'knit', 'light', 'light-weight',
        'light/heavyweight', 'lightweight', 'litt', 'lol', 'loos',
        'low-coast', 'materi', 'middleweight', 'midweight', 'mute', 'n',
        'neither', 'nice', 'nonetheless', 'notabl', 'off-whit', 'ok',
        'ok.', 'okay', 'otherwis', 'overal', 'overs', 'perfect', 'piti',
        'pleanti', 'pleasant', 'pleased-it', 'plush', 'pretti', 'provid',
        'puchas', 'quiet', 'quit', 'real', 'reali', 'realli', 'relat',
        'relax', 'remind', 'resili', 'robberi', 'roomi', 's+', 'sane',
        'shape', 'signatur', 'silki', 'simpl', 'simple/standard',
        'simplic', 'siz3', 'sloppi', 'small/petit', 'smart', 'smooth',
        'snugg', 'soft', 'solid', 'soo', 'sooo', 'soooooo', 'ssooo',
        'stark', 'stiff', 'stiffer', 'strong', 'stun', 'sturdi', 'stylish',
        'suffoc', 'sum', 'super', 'supper', 'surpric', 'surpris',
        'sweatshirt/hoodi', 'taylor', 'theyr', 'thick', 'thin', 'toasti',
        'tough', 'transpar', 'triangl', 'trim', 'ultra', 'unbeliev',
        'uncomfort', 'uneven', 'unwear', 'usabl', 'utter', 'veri', 'vivid',
        'warm', 'warm-but', 'warmth', 'washabl', 'wayyyi', 'weak',
        'wearabl', 'weight', 'weird', 'wel', 'well-mad', 'wellmad', 'whoa',
        'woild', 'wonder', 'worm', 'wow', 'yellow/gold'], dtype=object),
 array(['105lbs', '23.00', 'afford', 'asid', 'bargain', 'basic', 'beat',
        'best', 'bethat', 'boot', 'cheap', 'choic', 'contribut', 'cost',
        'dat', 'deal', 'decent', 'excel', 'fabric..', 'goof', 'headphon',
        'look', 'low', 'mic', 'money', 'outstand', 'overal', 'pay',
        'price', 'princ', 'product', 'qualiti', 'reason', 'repent',
        'reput', 'servic', 'surviv', 'thank', 'unbeat', 'valu', 'worth'],
       dtype=object),
 array(['125lb', '1990s..', '4-star', '8oz', 'also', 'although', 'amaz',
        'asid', 'basic', 'besid', 'bethat', 'big/wid', 'breathabl',
        'brilliant', 'broad-should', 'buhh', 'bulgi', 'bulki', 'bunchi',
        'cheap', 'cheep', 'clip', 'comfi', 'comfort', 'cumbersom', 'darn',
        'decent', 'deeper', 'deriv', 'descript', 'diamet', 'disproport',
        'drawstr', 'duper', 'durabl', 'econo', 'either', 'elig', 'extrem',
        'fabric', 'fair', 'farmstead', 'featur', 'feel', 'fitment',
        'fleec', 'flimsi', 'fluffi', 'fond', 'freedom', 'fricken',
        'garment', 'generous', 'gether', 'gigant', 'glue', 'good',
        'greatest', 'headphon', 'heavier', 'heavy-blend', 'heavy/thick',
        'heavyweight', 'heft', 'hefti', 'hood', 'howev', 'indulg',
        'inferior', 'insid', 'inside..', 'irrit', 'is', 'itself', 'kind',
        'light-weight', 'lighter', 'lightweight', 'line', 'low', 'make',
        'materi', 'materiel', 'mic', 'middleweight', 'nice', 'none',
        'notabl', 'nowher', 'ok', 'ok.', 'okay', 'otherwis', 'outstand',
        'overal', 'pictur', 'pleasant', 'pleased-it', 'plush', 'point',
        'poor', 'pretti', 'proplem', 'protrud', 'puchas', 'puff',
        'qualiti', 'quit', 'roooooomi', 'scratchi', 'seem', 'shade',
        'shrinkabl', 'side', 'silki', 'simple/standard', 'siz3', 'soft',
        'space/a', 'staff', 'stark', 'stiff', 'strong', 'sturdi',
        'substanti', 'super', 'surpris', 'sweater.thin', 'terrif',
        'textur', 'theyr', 'thick', 'thicker', 'thin', 'thinner', 'though',
        'tough', 'triangl', 'ultra', 'usabl', 'valu', 'veri', 'w/same',
        'warm-but', 'warmer', 'warmth', 'weak', 'weight', 'well-mad',
        'which', 'wide', 'wild', 'writ'], dtype=object)]
'''

"\nresult:\n[array(['array', 'bright', 'color', 'colour', 'depict', 'desrib', 'od',\n        'pictur', 'picture-', 'pricepoint', 'red', 'satur', 'select',\n        'shade', 'show', 'vibrant'], dtype=object),\n array(['2x', '2xl', '3x', '3xl', '4x', '4xl', 'bigger', 'bite', 'decid',\n        'did..', 'exchang', 'extra', 'l', 'larg', 'larger', 'loose-fit',\n        'm', 'may', 'medium', 'might', 'normal', 'order',\n        'price..although', 'run', 'should', 'sip', 'size', 'sizw',\n        'smaller', 'somewher', 'suggest', 'surer', 'swap', 'them.i',\n        'then', 'upmi', 'useal', 'usual', 'waaayyyyy', 'wil', 'x', 'xl',\n        'xtra', 'xxl', 'xxxxl', 'youth', 'zip'], dtype=object),\n array(['.size', '105lbs', '11-buck', '23.00', '4-star', 'afford', 'amaz',\n        'asid', 'bargain', 'basic', 'beat', 'best', 'bethat', 'big/wid',\n        'boot', 'bunchi', 'cheap', 'choic', 'clip', 'colour', 'companion',\n        'contribut', 'damn', 'dat', 'deal', 'decent', 'diliveri',\n        'disa

## Classify

In [8]:
def train_clfs(X_train,y_train,model='decision_tree'):
    """
    In the code below, I have trained a model specifically for decision tree. You must expand the code to accommodate
    the other two models. To learn more about sklearn's decision trees, see https://scikit-learn.org/stable/modules/tree.html
    :param X_train: self-explanatory
    :param y_train:
    :param model: we will allow three values for model namely 'decision_tree', 'naive_bayes' and 'linear_SGD_classifier'
    (Hint: you must change the loss function in https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
    to squared_loss to correctly implement the linear classifier. For the naive bayes, the appropriate model to use
    is the Bernoulli naive Bayes.)


    :return:
    """
    if model == 'decision_tree':
        clf = tree.DecisionTreeClassifier(random_state = 42)
        clf = clf.fit(X_train, y_train)
        return clf
    if model == 'naive_bayes':
        clf = BernoulliNB()
        clf = clf.fit(X_train, y_train)
        return clf
    if model == 'linear_SGD_classifier':
        clf = SGDClassifier(loss = 'squared_loss', random_state = 42)
        clf = clf.fit(X_train, y_train)
        return clf
    if model == 'SVC':
        clf = SVC(random_state = 42)
        clf = clf.fit(X_train, y_train)
        return clf
    if model == 'random_forest':
        clf = RandomForestClassifier(random_state = 42)
        clf = clf.fit(X_train, y_train)
        return clf
    else:
        return print('no such model')

In [9]:
model_list = ['decision_tree', 'naive_bayes', 'linear_SGD_classifier', 'SVC', 'random_forest']

In [10]:
df_1200 = pd.read_csv('1200.csv') # ground truth set
X = df_1200['reviewText']
y = df_1200.iloc[:,1:8]
y.columns = ['color', 'size', 'qualiti', 'comfi', 'price', 'materi', 'overall']

In [11]:
def classify(X, y, list_words):
    y_machine = [] # contain y_predict with all classifier
    for clf in model_list:
        print(clf)
        i = 0
        y_machine_temp = []
        for lst in list_words: # each criterion
            vectorizer = TfidfVectorizer(vocabulary = lst)
#             vectorizer = TfidfVectorizer()
            vectorizer.fit(X)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 400, random_state = 42)
            X_train = vectorizer.transform(X_train).toarray()
            X_test = vectorizer.transform(X_test).toarray()
            model = train_clfs(X_train, y_train.iloc[:,i], clf)
            y_predict = model.predict(X_test)
            y_machine_temp.append(y_predict)
            print(model.score(X_test, y_test.iloc[:,i]))
            i += 1
        y_machine.append(y_machine_temp)
    return y_machine

In [12]:
y_machine = classify(X, y, list_words)

decision_tree
0.885
0.5225
0.75
0.7525
0.9075
0.6825
naive_bayes
0.88
0.5525
0.755
0.7525
0.92
0.745
linear_SGD_classifier
0.8875
0.5425
0.745
0.505
0.9075
0.145
SVC
0.89
0.5025
0.7725
0.775
0.9125
0.74
random_forest
0.8875
0.51
0.7575
0.77
0.905
0.7175


In [13]:
predict_dct = {} # prediction with differen classifier
i = 0
for clf in model_list:
    predict_dct[clf] = pd.DataFrame(np.array(y_machine[i]).T, columns = category)
    i += 1

In [14]:
def eva_metrics(y_predict, y_test):
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for i in range(y_predict.shape[1]):
        accuracy.append(accuracy_score(y_test.iloc[:,i], y_predict.iloc[:,i]))
        precision.append(precision_score(y_test.iloc[:,i], y_predict.iloc[:,i], average = 'weighted')) # order: -1, 0, 1
        recall.append(recall_score(y_test.iloc[:,i], y_predict.iloc[:,i], average = 'weighted'))
        f1.append(f1_score(y_test.iloc[:,i], y_predict.iloc[:,i], average = 'weighted'))
    return accuracy, precision, recall, f1

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 400, random_state = 42)
accuracy, precision, recall, f1 = eva_metrics(predict_dct['SVC'], y_test)

In [16]:
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
for clf in model_list:
    accuracy, precision, recall, f1 = eva_metrics(predict_dct[clf], y_test)
    accuracy_lst.append(accuracy)
    precision_lst.append(precision)
    recall_lst.append(recall)
    f1_lst.append(f1)

In [17]:
pd.DataFrame(accuracy_lst, columns = category, index = model_list).to_csv('accuracy.csv')
pd.DataFrame(precision_lst, columns = category, index = model_list).to_csv('precision.csv')
pd.DataFrame(recall_lst, columns = category, index = model_list).to_csv('recall.csv')
pd.DataFrame(f1_lst, columns = category, index = model_list).to_csv('f1.csv')

In [18]:
import random

In [19]:
random_dict = {}
for i in category:
    weight = y_test[i].value_counts(normalize = True)
    random_score = random.choices([0,1,-1],weights = [weight.iloc[0],weight.iloc[1],weight.iloc[2]],k=400)
    random_dict[i] = random_score

In [20]:
rndm = pd.DataFrame(random_dict)

In [21]:
accuracy_rndm, precision_rndm, recall_rndm, f1_rndm = eva_metrics(rndm, y_test)

In [22]:
accuracy_rndm, precision_rndm, recall_rndm, f1_rndm

([0.7525, 0.3925, 0.63, 0.59, 0.6725, 0.6125],
 [0.7384128148691612,
  0.38933918767857606,
  0.6158750000000001,
  0.5882926829268292,
  0.675349939338793,
  0.616158836689038],
 [0.7525, 0.3925, 0.63, 0.59, 0.6725, 0.6125],
 [0.7453684557030209,
  0.39083011095158127,
  0.62285725589492,
  0.5891393442622951,
  0.6739118378275005,
  0.6143201476184984])

## significance

In [23]:
# random 10 times
accuracy_rndm_list = []
precision_rndm_list = []
recall_rndm_list = []
f1_rndm_list = []
for _ in range(10):
    random_ditc = {}
    for i in category:
        weight = y_test[i].value_counts(normalize = True)
        random_score = random.choices([0,1,-1],weights = [weight.iloc[0],weight.iloc[1],weight.iloc[2]],k=400)
        random_dict[i] = random_score
    rndm = pd.DataFrame(random_dict)
    accuracy_rndm, precision_rndm, recall_rndm, f1_rndm = eva_metrics(rndm, y_test)
    accuracy_rndm_list.append(accuracy_rndm)
    precision_rndm_list.append(precision_rndm)
    recall_rndm_list.append(recall_rndm)
    f1_rndm_list.append(f1_rndm)

In [74]:
# SVC 10 times
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
for rs in range(10):
    y_machine_temp = []
    i = 0
    for lst in list_words: # each criterion
        vectorizer = TfidfVectorizer(vocabulary = lst)
#         vectorizer = TfidfVectorizer()
        vectorizer.fit(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 400, random_state = rs)
        X_train = vectorizer.transform(X_train).toarray()
        X_test = vectorizer.transform(X_test).toarray()
#         clf = RandomForestClassifier()
        clf = SVC()
        clf.fit(X_train, y_train.iloc[:,i])
        y_predict = clf.predict(X_test)
        y_machine_temp.append(y_predict)
        i += 1
    y_predict = pd.DataFrame(np.array(y_machine_temp).T)
    accuracy, precision, recall, f1 = eva_metrics(y_predict, y_test)
    print(accuracy)
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

[0.91, 0.545, 0.7975, 0.76, 0.9125, 0.7075]
[0.9025, 0.525, 0.7725, 0.7775, 0.905, 0.695]
[0.8875, 0.5375, 0.7775, 0.7525, 0.9025, 0.7475]
[0.885, 0.53, 0.7975, 0.7525, 0.8975, 0.7275]
[0.905, 0.5575, 0.7675, 0.7725, 0.8925, 0.73]
[0.88, 0.57, 0.7775, 0.7625, 0.865, 0.7275]
[0.905, 0.555, 0.7475, 0.7475, 0.9, 0.745]
[0.8725, 0.575, 0.795, 0.7675, 0.9225, 0.7225]
[0.8775, 0.53, 0.7825, 0.755, 0.9075, 0.7325]
[0.9125, 0.59, 0.75, 0.7575, 0.8975, 0.695]


In [75]:
# to csv
pd.DataFrame(accuracy_rndm_list, columns = category).to_csv('accuracy_rndm.csv')
pd.DataFrame(accuracy_list, columns = category).to_csv('accuracy.csv')
pd.DataFrame(precision_rndm_list, columns = category).to_csv('precision_rndm.csv')
pd.DataFrame(precision_list, columns = category).to_csv('precision.csv')
pd.DataFrame(recall_rndm_list, columns = category).to_csv('recall_rndm.csv')
pd.DataFrame(recall_list, columns = category).to_csv('recall.csv')
pd.DataFrame(f1_rndm_list, columns = category).to_csv('f1_rndm.csv')
pd.DataFrame(f1_list, columns = category).to_csv('f1.csv')