In [29]:
from collections import Counter
from itertools import compress

import re
import pandas as pd

In [62]:
def isfloat(s):
    """Report whether ``s`` can be parsed by the built-in ``float``.

    Returns ``True`` when ``float(s)`` succeeds and ``False`` when it raises
    ``ValueError``. Any other exception (for example ``TypeError`` for
    ``None``) is not caught and propagates to the caller.

    Note that ``float`` also accepts spellings such as ``'1e3'``, ``'nan'``
    and ``'inf'``, so those strings count as floats here.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

def regularize(word):
    """Normalise a raw token and collapse numeric-looking tokens into tags.

    The token is first cleaned: the sequences ``'s`` and ``’s``, brackets,
    parentheses, quotes, back-ticks, commas, colons, semicolons, ``|``, ``?``,
    ``!`` and any period that is followed by whitespace, a closing quote or
    the end of the token are removed. The result is lower-cased and stripped
    of surrounding whitespace.

    The cleaned token is then replaced by a placeholder when it matches one
    of the following checks (evaluated in this order):

    * only digits                    -> ``'<DIG>'``
    * parsable by ``isfloat``        -> ``'<FLO>'``
    * contains ``$``, ``€`` or ``£`` -> ``'<MON>'``
    * contains ``°c``                -> ``'<TEM>'``
    * contains ``°``                 -> ``'<DEG>'``
    * contains ``%``                 -> ``'<PER>'``

    Args:
        word: The raw token (a string).

    Returns:
        The placeholder tag, or the cleaned token when no check matches.
        The cleaned token may be the empty string (e.g. for ``"("``).
    """
    word = re.sub(r"[\']s|’s|[(|)|,|\"|\'|‘|’|`|“|”|:|;|\[|\]|?|!]|(\.(?=[\s\n\r”\"]|$))", '', word).lower().strip()
    if word.isdigit():
        word = '<DIG>'
    elif isfloat(word):
        word = '<FLO>'
    # Each currency symbol is tested on its own: `('$' or '€' or '£') in word`
    # evaluates to `'$' in word`, so euro and pound amounts were never tagged.
    elif any(symbol in word for symbol in ('$', '€', '£')):
        word = '<MON>'
    # '°c' must be tested before the bare degree sign, which it contains.
    elif '°c' in word:
        word = '<TEM>'
    elif '°' in word:
        word = '<DEG>'
    elif '%' in word:
        word = '<PER>'
    return word

In [63]:
# Load the training set; later cells read its 'label' and 'text' columns.
data = pd.read_csv('./Data/train.csv')

In [64]:
# Split the training rows into one frame per category label.
ict_group, edu_group, mec_group, eco_group = (
    data[data['label'] == label]
    for label in ('ICT', 'education', 'mechanics', 'economy')
)

In [65]:
# Read the stop-word list: one entry per line, trailing whitespace removed.
with open('./backup/stopwords.txt', mode='r') as stopword_file:
    stop_words = [entry.rstrip() for entry in stopword_file]

In [66]:
# Rank the ICT vocabulary by frequency, leaving out stop words.
ict_sentences = ict_group['text']
words = []

for sentence in ict_sentences:
    for raw_token in sentence.split(' '):
        # Normalisation can reduce a token to '' (e.g. bare punctuation), so
        # emptiness is checked after regularize() rather than before it.
        token = regularize(raw_token)
        if token:
            words.append(token)

# Set membership keeps the stop-word filter linear in the vocabulary size.
stop_word_set = set(stop_words)

# most_common() orders by descending count; ties keep first-seen order.
ict_counts = Counter(words)
ict_counter = [word for word, _ in ict_counts.most_common() if word not in stop_word_set]
ict_counter

['<DIG>',
 'device',
 'information',
 'data',
 'first',
 'unit',
 'user',
 'fig',
 'second',
 'image',
 'include',
 'display',
 'embodiment',
 'signal',
 'input',
 'electronic',
 'one',
 'control',
 'memory',
 'present',
 'invention',
 'operation',
 'system',
 'communication',
 'terminal',
 'various',
 'using',
 'method',
 'area',
 'based',
 'value',
 'time',
 'case',
 'processor',
 'used',
 'output',
 'touch',
 'plurality',
 'circuit',
 'sensor',
 'screen',
 'content',
 'apparatus',
 'block',
 'controller',
 'object',
 'module',
 'number',
 'server',
 'embodiments',
 'step',
 'application',
 'voltage',
 'service',
 'function',
 'set',
 'corresponding',
 'layer',
 'diagram',
 'process',
 'illustrating',
 'least',
 'provided',
 'network',
 'reference',
 'state',
 'use',
 'mode',
 'power',
 'processing',
 'described',
 'technology',
 'another',
 'different',
 'receive',
 'performed',
 'displayed',
 'includes',
 'connected',
 'perform',
 'mobile',
 'configuration',
 'storage',
 'received'

In [67]:
# Rank the education vocabulary by frequency, leaving out stop words.
edu_sentences = edu_group['text']
words = []

for sentence in edu_sentences:
    for raw_token in sentence.split(' '):
        # Normalisation can reduce a token to '' (e.g. bare punctuation), so
        # emptiness is checked after regularize() rather than before it.
        token = regularize(raw_token)
        if token:
            words.append(token)

# Set membership keeps the stop-word filter linear in the vocabulary size.
stop_word_set = set(stop_words)

# most_common() orders by descending count; ties keep first-seen order.
edu_counts = Counter(words)
edu_counter = [word for word, _ in edu_counts.most_common() if word not in stop_word_set]
edu_counter

['education',
 '<DIG>',
 'students',
 'school',
 'learning',
 'study',
 'teachers',
 'one',
 'research',
 'process',
 'class',
 'curriculum',
 'time',
 'high',
 'teacher',
 'system',
 'used',
 'educational',
 'based',
 'aaa',
 'two',
 'level',
 'various',
 'related',
 'content',
 'activities',
 'first',
 'number',
 'life',
 'results',
 'necessary',
 'social',
 'development',
 'studies',
 'method',
 'mathematics',
 'problem',
 'teaching',
 'analysis',
 'subject',
 'knowledge',
 'evaluation',
 'using',
 'important',
 'use',
 'experience',
 'human',
 'data',
 'schools',
 'group',
 'career',
 'conducted',
 'elementary',
 'people',
 'field',
 'result',
 'ability',
 'new',
 'classes',
 'found',
 'training',
 'effect',
 'different',
 'case',
 'learners',
 'student',
 'many',
 'order',
 'information',
 'korean',
 'relationship',
 'way',
 'children',
 'characteristics',
 'model',
 'theory',
 'understanding',
 'support',
 'general',
 'thinking',
 'subjects',
 'value',
 'language',
 'work',
 'poi

In [68]:
# Rank the mechanics vocabulary by frequency, leaving out stop words.
mec_sentences = mec_group['text']
words = []

for sentence in mec_sentences:
    for raw_token in sentence.split(' '):
        # Normalisation can reduce a token to '' (e.g. bare punctuation), so
        # emptiness is checked after regularize() rather than before it.
        token = regularize(raw_token)
        if token:
            words.append(token)

# Set membership keeps the stop-word filter linear in the vocabulary size.
stop_word_set = set(stop_words)

# most_common() orders by descending count; ties keep first-seen order.
mec_counts = Counter(words)
mec_counter = [word for word, _ in mec_counts.most_common() if word not in stop_word_set]
mec_counter

['<DIG>',
 'present',
 'fig',
 'invention',
 'unit',
 'first',
 'embodiment',
 'device',
 'second',
 'control',
 'include',
 'method',
 'used',
 'formed',
 'surface',
 'one',
 'information',
 'using',
 'power',
 'vehicle',
 'air',
 'heat',
 'part',
 'portion',
 'temperature',
 'case',
 'sensor',
 'material',
 'signal',
 'time',
 'layer',
 'image',
 'direction',
 'system',
 'apparatus',
 'user',
 'provided',
 'body',
 'illustrating',
 'light',
 'water',
 'process',
 'value',
 'various',
 'view',
 'data',
 'operation',
 'member',
 'structure',
 'side',
 'plurality',
 'shown',
 'described',
 'diagram',
 'step',
 'high',
 'gas',
 'state',
 'based',
 'voltage',
 'display',
 'electronic',
 'flow',
 'another',
 'module',
 'shape',
 'driving',
 'lower',
 '<FLO>',
 '<PER>',
 'film',
 'applied',
 'area',
 'performed',
 'connected',
 'end',
 'group',
 'pressure',
 'input',
 'reference',
 'disposed',
 'current',
 'output',
 'amount',
 'change',
 'least',
 'composition',
 'upper',
 'treatment',
 'e

In [69]:
# Rank the economy vocabulary by frequency, leaving out stop words.
eco_sentences = eco_group['text']
words = []

for sentence in eco_sentences:
    for raw_token in sentence.split(' '):
        # Normalisation can reduce a token to '' (e.g. bare punctuation), so
        # emptiness is checked after regularize() rather than before it.
        token = regularize(raw_token)
        if token:
            words.append(token)

# Set membership keeps the stop-word filter linear in the vocabulary size.
stop_word_set = set(stop_words)

# most_common() orders by descending count; ties keep first-seen order.
eco_counts = Counter(words)
eco_counter = [word for word, _ in eco_counts.most_common() if word not in stop_word_set]
eco_counter

['<DIG>',
 'rate',
 '<PER>',
 'financial',
 'economic',
 'korea',
 'companies',
 'market',
 'income',
 'tax',
 'analysis',
 'effect',
 'study',
 'increase',
 'system',
 'case',
 'investment',
 'growth',
 'countries',
 'level',
 'government',
 'aaa',
 'time',
 'results',
 'policy',
 'insurance',
 'used',
 'industry',
 'model',
 'data',
 'value',
 'first',
 'number',
 'company',
 'high',
 'year',
 'china',
 'local',
 'price',
 'result',
 'social',
 'economy',
 'period',
 'management',
 'business',
 '<FLO>',
 'public',
 'based',
 'capital',
 'total',
 'since',
 'interest',
 'information',
 'new',
 'using',
 'korean',
 'variables',
 'higher',
 'research',
 'development',
 'labor',
 'studies',
 'two',
 'found',
 'one',
 'increased',
 'necessary',
 'expected',
 'domestic',
 'trade',
 'factors',
 'bbb',
 'table',
 'significant',
 'change',
 'related',
 'relationship',
 'corporate',
 'national',
 'ratio',
 'amount',
 'average',
 'foreign',
 'support',
 'years',
 'people',
 'method',
 'various'

In [70]:
# Length of the shortest of the four ranked vocabularies.
min_len = min(map(len, (ict_counter, edu_counter, mec_counter, eco_counter)))

In [71]:
# One column per category, each cut to the shortest ranking so that row i
# holds the i-th most frequent word of every category.
compare_df = pd.DataFrame({
    'ICT': ict_counter[:min_len],
    'EDU': edu_counter[:min_len],
    'MEC': mec_counter[:min_len],
    'ECO': eco_counter[:min_len],
})

In [72]:
# Show the side-by-side frequency ranking of the four categories.
compare_df

Unnamed: 0,ICT,EDU,MEC,ECO
0,<DIG>,education,<DIG>,<DIG>
1,device,<DIG>,present,rate
2,information,students,fig,<PER>
3,data,school,invention,financial
4,first,learning,unit,economic
...,...,...,...,...
18725,cept,calculation-based,non-differential,embassies
18726,cyber-physical,orientedness,llc,deposit-taking
18727,personalize,beside,anti-vibration,78.521.5
18728,128f,gall-peters,buckle,sem
