In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## LOAD DATASET

In [3]:
train = pd.read_csv("./data/train.tsv", sep = "\t", index_col = "PhraseId")
train["Phrase(original)"] = train["Phrase"]
print(train.shape)
train.head(10)

(156060, 4)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(original)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,A series of escapades demonstrating the adage ...,1,A series of escapades demonstrating the adage ...
2,1,A series of escapades demonstrating the adage ...,2,A series of escapades demonstrating the adage ...
3,1,A series,2,A series
4,1,A,2,A
5,1,series,2,series
6,1,of escapades demonstrating the adage that what...,2,of escapades demonstrating the adage that what...
7,1,of,2,of
8,1,escapades demonstrating the adage that what is...,2,escapades demonstrating the adage that what is...
9,1,escapades,2,escapades
10,1,demonstrating the adage that what is good for ...,2,demonstrating the adage that what is good for ...


In [4]:
test = pd.read_csv("./data/test.tsv", sep = "\t", index_col = "PhraseId")
test["Phrase(original)"] = train["Phrase"]
print(test.shape)
test.head(10)


(66292, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Phrase(original)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,An intermittently pleasing but mostly routine ...,
156062,8545,An intermittently pleasing but mostly routine ...,
156063,8545,An,
156064,8545,intermittently pleasing but mostly routine effort,
156065,8545,intermittently pleasing but mostly routine,
156066,8545,intermittently pleasing but,
156067,8545,intermittently pleasing,
156068,8545,intermittently,
156069,8545,pleasing,
156070,8545,but,


## PREPROCESSING

## Cleantext -- 1

In [5]:
#영어 단수 복수 같은거 전부 원본으로 교체
import nltk
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

stemmer.stem("disappointments") #disappointment


'disappoint'

In [6]:
def stem_phrase(phrase):
    words = phrase.split(" ")
    stemmed_words = []

    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)

    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

# Train
tqdm.pandas(desc="Stemming... (train)")

train["Phrase"] = train["Phrase"].progress_apply(stem_phrase)

print(train.shape)
train.head()

# Test
tqdm.pandas(desc = "Stemming... (test)")

test["Phrase"] = test["Phrase"].progress_apply(stem_phrase)

print(test.shape)
test.head()


Stemming... (train):   0%|          | 0/156060 [00:00<?, ?it/s]

Stemming... (train):   1%|          | 1248/156060 [00:00<00:12, 12473.11it/s]

Stemming... (train):   2%|▏         | 2441/156060 [00:00<00:12, 12301.81it/s]

Stemming... (train):   2%|▏         | 3482/156060 [00:00<00:13, 11660.87it/s]

Stemming... (train):   3%|▎         | 4228/156060 [00:00<00:16, 9258.36it/s] 

Stemming... (train):   3%|▎         | 4938/156060 [00:00<00:19, 7643.97it/s]

Stemming... (train):   4%|▍         | 5878/156060 [00:00<00:18, 8094.30it/s]

Stemming... (train):   4%|▍         | 6782/156060 [00:00<00:17, 8356.08it/s]

Stemming... (train):   5%|▍         | 7759/156060 [00:00<00:16, 8733.92it/s]

Stemming... (train):   6%|▌         | 8799/156060 [00:00<00:16, 9170.45it/s]

Stemming... (train):   6%|▋         | 9764/156060 [00:01<00:15, 9308.86it/s]

Stemming... (train):   7%|▋         | 10923/156060 [00:01<00:14, 9890.33it/s]

Stemming... (train):   8%|▊         | 11917/156060 [00:01<00:14, 9825.32it/s]

Stemming... (train):   8%|▊         | 12962/156060 [00:01<00:14, 10004.42it/s]

Stemming... (train):   9%|▉         | 13966/156060 [00:01<00:14, 9789.58it/s] 

Stemming... (train):  10%|▉         | 14948/156060 [00:01<00:15, 9196.11it/s]

Stemming... (train):  10%|█         | 15878/156060 [00:01<00:17, 8083.88it/s]

Stemming... (train):  11%|█         | 16718/156060 [00:01<00:20, 6943.97it/s]

Stemming... (train):  11%|█         | 17463/156060 [00:01<00:20, 6929.13it/s]

Stemming... (train):  12%|█▏        | 18191/156060 [00:02<00:20, 6775.48it/s]

Stemming... (train):  12%|█▏        | 18950/156060 [00:02<00:19, 7000.15it/s]

Stemming... (train):  13%|█▎        | 19670/156060 [00:02<00:19, 7053.23it/s]

Stemming... (train):  13%|█▎        | 20414/156060 [00:02<00:18, 7163.46it/s]

Stemming... (train):  14%|█▎        | 21141/156060 [00:02<00:19, 6860.11it/s]

Stemming... (train):  14%|█▍        | 22027/156060 [00:02<00:18, 7348.95it/s]

Stemming... (train):  15%|█▍        | 22779/156060 [00:02<00:18, 7199.97it/s]

Stemming... (train):  15%|█▌        | 23626/156060 [00:02<00:17, 7537.40it/s]

Stemming... (train):  16%|█▌        | 24569/156060 [00:02<00:16, 8020.07it/s]

Stemming... (train):  16%|█▋        | 25460/156060 [00:03<00:15, 8266.59it/s]

Stemming... (train):  17%|█▋        | 26321/156060 [00:03<00:15, 8366.66it/s]

Stemming... (train):  17%|█▋        | 27267/156060 [00:03<00:14, 8663.55it/s]

Stemming... (train):  18%|█▊        | 28289/156060 [00:03<00:14, 9077.53it/s]

Stemming... (train):  19%|█▊        | 29214/156060 [00:03<00:13, 9126.40it/s]

Stemming... (train):  19%|█▉        | 30136/156060 [00:03<00:13, 9129.97it/s]

Stemming... (train):  20%|█▉        | 31081/156060 [00:03<00:13, 9218.84it/s]

Stemming... (train):  21%|██        | 32008/156060 [00:03<00:13, 9217.64it/s]

Stemming... (train):  21%|██        | 32933/156060 [00:03<00:13, 8922.26it/s]

Stemming... (train):  22%|██▏       | 33835/156060 [00:03<00:13, 8948.56it/s]

Stemming... (train):  22%|██▏       | 34800/156060 [00:04<00:13, 9147.31it/s]

Stemming... (train):  23%|██▎       | 35858/156060 [00:04<00:12, 9530.66it/s]

Stemming... (train):  24%|██▎       | 36818/156060 [00:04<00:12, 9381.23it/s]

Stemming... (train):  24%|██▍       | 37766/156060 [00:04<00:12, 9401.54it/s]

Stemming... (train):  25%|██▍       | 38743/156060 [00:04<00:12, 9508.91it/s]

Stemming... (train):  25%|██▌       | 39697/156060 [00:04<00:12, 9483.86it/s]

Stemming... (train):  26%|██▌       | 40648/156060 [00:04<00:12, 9283.44it/s]

Stemming... (train):  27%|██▋       | 41579/156060 [00:04<00:12, 8903.22it/s]

Stemming... (train):  27%|██▋       | 42475/156060 [00:04<00:13, 8537.15it/s]

Stemming... (train):  28%|██▊       | 43360/156060 [00:04<00:13, 8625.02it/s]

Stemming... (train):  28%|██▊       | 44358/156060 [00:05<00:12, 8985.80it/s]

Stemming... (train):  29%|██▉       | 45378/156060 [00:05<00:11, 9318.41it/s]

Stemming... (train):  30%|██▉       | 46436/156060 [00:05<00:11, 9661.16it/s]

Stemming... (train):  30%|███       | 47414/156060 [00:05<00:11, 9693.05it/s]

Stemming... (train):  31%|███       | 48390/156060 [00:05<00:11, 9595.46it/s]

Stemming... (train):  32%|███▏      | 49467/156060 [00:05<00:10, 9919.96it/s]

Stemming... (train):  32%|███▏      | 50490/156060 [00:05<00:10, 10008.19it/s]

Stemming... (train):  33%|███▎      | 51533/156060 [00:05<00:10, 10130.75it/s]

Stemming... (train):  34%|███▍      | 52762/156060 [00:05<00:09, 10693.28it/s]

Stemming... (train):  35%|███▍      | 53996/156060 [00:06<00:09, 11137.96it/s]

Stemming... (train):  35%|███▌      | 55146/156060 [00:06<00:08, 11243.13it/s]

Stemming... (train):  36%|███▌      | 56337/156060 [00:06<00:08, 11433.54it/s]

Stemming... (train):  37%|███▋      | 57542/156060 [00:06<00:08, 11610.40it/s]

Stemming... (train):  38%|███▊      | 58709/156060 [00:06<00:08, 11271.32it/s]

Stemming... (train):  38%|███▊      | 59843/156060 [00:06<00:08, 11272.70it/s]

Stemming... (train):  39%|███▉      | 61025/156060 [00:06<00:08, 11420.49it/s]

Stemming... (train):  40%|███▉      | 62171/156060 [00:06<00:08, 10699.67it/s]

Stemming... (train):  41%|████      | 63253/156060 [00:06<00:08, 10446.55it/s]

Stemming... (train):  41%|████      | 64354/156060 [00:06<00:08, 10606.52it/s]

Stemming... (train):  42%|████▏     | 65561/156060 [00:07<00:08, 11004.26it/s]

Stemming... (train):  43%|████▎     | 66768/156060 [00:07<00:07, 11298.40it/s]

Stemming... (train):  44%|████▎     | 67907/156060 [00:07<00:07, 11297.93it/s]

Stemming... (train):  44%|████▍     | 69043/156060 [00:07<00:07, 11297.56it/s]

Stemming... (train):  45%|████▍     | 70191/156060 [00:07<00:07, 11351.15it/s]

Stemming... (train):  46%|████▌     | 71336/156060 [00:07<00:07, 11379.81it/s]

Stemming... (train):  46%|████▋     | 72477/156060 [00:07<00:07, 11261.72it/s]

Stemming... (train):  47%|████▋     | 73605/156060 [00:07<00:07, 10524.90it/s]

Stemming... (train):  48%|████▊     | 74669/156060 [00:07<00:07, 10307.38it/s]

Stemming... (train):  49%|████▊     | 75709/156060 [00:07<00:07, 10102.72it/s]

Stemming... (train):  49%|████▉     | 76817/156060 [00:08<00:07, 10375.36it/s]

Stemming... (train):  50%|████▉     | 77862/156060 [00:08<00:07, 10183.64it/s]

Stemming... (train):  51%|█████     | 78886/156060 [00:08<00:07, 9757.91it/s] 

Stemming... (train):  51%|█████     | 79870/156060 [00:08<00:07, 9674.80it/s]

Stemming... (train):  52%|█████▏    | 80843/156060 [00:08<00:07, 9646.01it/s]

Stemming... (train):  53%|█████▎    | 81941/156060 [00:08<00:07, 10004.59it/s]

Stemming... (train):  53%|█████▎    | 82948/156060 [00:08<00:07, 9516.74it/s] 

Stemming... (train):  54%|█████▍    | 83960/156060 [00:08<00:07, 9688.69it/s]

Stemming... (train):  55%|█████▍    | 85058/156060 [00:08<00:07, 10042.09it/s]

Stemming... (train):  55%|█████▌    | 86089/156060 [00:09<00:06, 10120.36it/s]

Stemming... (train):  56%|█████▌    | 87143/156060 [00:09<00:06, 10233.34it/s]

Stemming... (train):  56%|█████▋    | 88171/156060 [00:09<00:06, 9983.76it/s] 

Stemming... (train):  57%|█████▋    | 89174/156060 [00:09<00:06, 9916.86it/s]

Stemming... (train):  58%|█████▊    | 90285/156060 [00:09<00:06, 10245.27it/s]

Stemming... (train):  59%|█████▊    | 91334/156060 [00:09<00:06, 10316.17it/s]

Stemming... (train):  59%|█████▉    | 92370/156060 [00:09<00:06, 9784.46it/s] 

Stemming... (train):  60%|█████▉    | 93357/156060 [00:09<00:06, 9458.36it/s]

Stemming... (train):  61%|██████    | 94442/156060 [00:09<00:06, 9836.04it/s]

Stemming... (train):  61%|██████    | 95436/156060 [00:09<00:06, 9578.37it/s]

Stemming... (train):  62%|██████▏   | 96405/156060 [00:10<00:06, 9605.81it/s]

Stemming... (train):  62%|██████▏   | 97444/156060 [00:10<00:05, 9826.42it/s]

Stemming... (train):  63%|██████▎   | 98499/156060 [00:10<00:05, 10030.16it/s]

Stemming... (train):  64%|██████▍   | 99507/156060 [00:10<00:05, 9609.65it/s] 

Stemming... (train):  64%|██████▍   | 100476/156060 [00:10<00:05, 9625.27it/s]

Stemming... (train):  65%|██████▌   | 101526/156060 [00:10<00:05, 9869.96it/s]

Stemming... (train):  66%|██████▌   | 102519/156060 [00:10<00:05, 9515.68it/s]

Stemming... (train):  66%|██████▋   | 103477/156060 [00:10<00:05, 9225.60it/s]

Stemming... (train):  67%|██████▋   | 104406/156060 [00:10<00:05, 8916.61it/s]

Stemming... (train):  67%|██████▋   | 105305/156060 [00:11<00:05, 8913.37it/s]

Stemming... (train):  68%|██████▊   | 106345/156060 [00:11<00:05, 9311.66it/s]

Stemming... (train):  69%|██████▉   | 107466/156060 [00:11<00:04, 9802.29it/s]

Stemming... (train):  70%|██████▉   | 108468/156060 [00:11<00:04, 9861.30it/s]

Stemming... (train):  70%|███████   | 109463/156060 [00:11<00:04, 9782.81it/s]

Stemming... (train):  71%|███████   | 110475/156060 [00:11<00:04, 9880.91it/s]

Stemming... (train):  71%|███████▏  | 111536/156060 [00:11<00:04, 10087.55it/s]

Stemming... (train):  72%|███████▏  | 112550/156060 [00:11<00:04, 9774.87it/s] 

Stemming... (train):  73%|███████▎  | 113589/156060 [00:11<00:04, 9947.39it/s]

Stemming... (train):  73%|███████▎  | 114589/156060 [00:11<00:04, 9441.91it/s]

Stemming... (train):  74%|███████▍  | 115663/156060 [00:12<00:04, 9793.45it/s]

Stemming... (train):  75%|███████▍  | 116662/156060 [00:12<00:04, 9843.18it/s]

Stemming... (train):  75%|███████▌  | 117722/156060 [00:12<00:03, 10057.91it/s]

Stemming... (train):  76%|███████▌  | 118737/156060 [00:12<00:03, 10075.87it/s]

Stemming... (train):  77%|███████▋  | 119749/156060 [00:12<00:03, 9736.57it/s] 

Stemming... (train):  77%|███████▋  | 120745/156060 [00:12<00:03, 9795.31it/s]

Stemming... (train):  78%|███████▊  | 121729/156060 [00:12<00:03, 9523.70it/s]

Stemming... (train):  79%|███████▊  | 122725/156060 [00:12<00:03, 9650.36it/s]

Stemming... (train):  79%|███████▉  | 123769/156060 [00:12<00:03, 9873.83it/s]

Stemming... (train):  80%|███████▉  | 124834/156060 [00:13<00:03, 10094.28it/s]

Stemming... (train):  81%|████████  | 125848/156060 [00:13<00:03, 9434.38it/s] 

Stemming... (train):  81%|████████▏ | 126842/156060 [00:13<00:03, 9576.59it/s]

Stemming... (train):  82%|████████▏ | 127809/156060 [00:13<00:03, 9209.98it/s]

Stemming... (train):  83%|████████▎ | 128814/156060 [00:13<00:02, 9446.47it/s]

Stemming... (train):  83%|████████▎ | 129767/156060 [00:13<00:02, 9255.92it/s]

Stemming... (train):  84%|████████▎ | 130699/156060 [00:13<00:03, 7312.33it/s]

Stemming... (train):  84%|████████▍ | 131499/156060 [00:13<00:03, 6768.59it/s]

Stemming... (train):  85%|████████▍ | 132232/156060 [00:13<00:03, 6723.99it/s]

Stemming... (train):  85%|████████▌ | 132982/156060 [00:14<00:03, 6937.62it/s]

Stemming... (train):  86%|████████▌ | 133829/156060 [00:14<00:03, 7333.68it/s]

Stemming... (train):  86%|████████▋ | 134702/156060 [00:14<00:02, 7702.06it/s]

Stemming... (train):  87%|████████▋ | 135507/156060 [00:14<00:02, 7802.84it/s]

Stemming... (train):  87%|████████▋ | 136304/156060 [00:14<00:02, 7714.43it/s]

Stemming... (train):  88%|████████▊ | 137088/156060 [00:14<00:02, 7500.95it/s]

Stemming... (train):  88%|████████▊ | 137848/156060 [00:14<00:02, 7371.92it/s]

Stemming... (train):  89%|████████▉ | 138593/156060 [00:14<00:02, 7061.96it/s]

Stemming... (train):  89%|████████▉ | 139367/156060 [00:14<00:02, 7246.06it/s]

Stemming... (train):  90%|████████▉ | 140126/156060 [00:15<00:02, 7345.88it/s]

Stemming... (train):  90%|█████████ | 140949/156060 [00:15<00:01, 7589.04it/s]

Stemming... (train):  91%|█████████ | 141774/156060 [00:15<00:01, 7775.41it/s]

Stemming... (train):  91%|█████████▏| 142567/156060 [00:15<00:01, 7819.25it/s]

Stemming... (train):  92%|█████████▏| 143353/156060 [00:15<00:01, 7163.57it/s]

Stemming... (train):  92%|█████████▏| 144216/156060 [00:15<00:01, 7545.41it/s]

Stemming... (train):  93%|█████████▎| 145046/156060 [00:15<00:01, 7756.82it/s]

Stemming... (train):  93%|█████████▎| 145834/156060 [00:15<00:01, 5632.39it/s]

Stemming... (train):  94%|█████████▍| 146489/156060 [00:16<00:01, 5485.33it/s]

Stemming... (train):  94%|█████████▍| 147103/156060 [00:16<00:02, 3876.57it/s]

Stemming... (train):  95%|█████████▍| 147724/156060 [00:16<00:01, 4367.92it/s]

Stemming... (train):  95%|█████████▌| 148431/156060 [00:16<00:01, 4933.39it/s]

Stemming... (train):  96%|█████████▌| 149364/156060 [00:16<00:01, 5744.74it/s]

Stemming... (train):  96%|█████████▌| 150161/156060 [00:16<00:00, 6267.58it/s]

Stemming... (train):  97%|█████████▋| 150883/156060 [00:16<00:00, 6420.66it/s]

Stemming... (train):  97%|█████████▋| 151650/156060 [00:16<00:00, 6749.38it/s]

Stemming... (train):  98%|█████████▊| 152434/156060 [00:16<00:00, 7039.96it/s]

Stemming... (train):  98%|█████████▊| 153316/156060 [00:17<00:00, 7492.71it/s]

Stemming... (train):  99%|█████████▉| 154164/156060 [00:17<00:00, 7762.80it/s]

Stemming... (train):  99%|█████████▉| 155101/156060 [00:17<00:00, 8181.54it/s]

Stemming... (train): 100%|█████████▉| 156049/156060 [00:17<00:00, 8524.30it/s]

Stemming... (train): 100%|██████████| 156060/156060 [00:17<00:00, 8971.34it/s]


Stemming... (test):   0%|          | 0/66292 [00:00<?, ?it/s]

Stemming... (test):   2%|▏         | 1282/66292 [00:00<00:05, 12819.74it/s]

(156060, 4)


Stemming... (test):   4%|▍         | 2724/66292 [00:00<00:04, 13257.58it/s]

Stemming... (test):   6%|▌         | 3733/66292 [00:00<00:05, 12115.64it/s]

Stemming... (test):   7%|▋         | 4921/66292 [00:00<00:05, 12042.65it/s]

Stemming... (test):   9%|▉         | 6219/66292 [00:00<00:04, 12306.31it/s]

Stemming... (test):  11%|█         | 7444/66292 [00:00<00:04, 12285.74it/s]

Stemming... (test):  13%|█▎        | 8684/66292 [00:00<00:04, 12316.90it/s]

Stemming... (test):  15%|█▌        | 9952/66292 [00:00<00:04, 12418.58it/s]

Stemming... (test):  17%|█▋        | 11309/66292 [00:00<00:04, 12737.29it/s]

Stemming... (test):  19%|█▉        | 12725/66292 [00:01<00:04, 13132.49it/s]

Stemming... (test):  21%|██        | 14009/66292 [00:01<00:04, 12774.35it/s]

Stemming... (test):  23%|██▎       | 15311/66292 [00:01<00:03, 12846.95it/s]

Stemming... (test):  25%|██▌       | 16583/66292 [00:01<00:04, 12068.16it/s]

Stemming... (test):  27%|██▋       | 17791/66292 [00:01<00:04, 11678.29it/s]

Stemming... (test):  29%|██▊       | 18963/66292 [00:01<00:04, 11237.14it/s]

Stemming... (test):  30%|███       | 20093/66292 [00:01<00:04, 10852.34it/s]

Stemming... (test):  32%|███▏      | 21186/66292 [00:01<00:04, 10833.25it/s]

Stemming... (test):  34%|███▎      | 22313/66292 [00:01<00:04, 10960.61it/s]

Stemming... (test):  35%|███▌      | 23513/66292 [00:01<00:03, 11250.33it/s]

Stemming... (test):  37%|███▋      | 24685/66292 [00:02<00:03, 11387.22it/s]

Stemming... (test):  39%|███▉      | 25828/66292 [00:02<00:03, 11127.01it/s]

Stemming... (test):  41%|████      | 26945/66292 [00:02<00:03, 10954.63it/s]

Stemming... (test):  42%|████▏     | 28044/66292 [00:02<00:03, 10218.20it/s]

Stemming... (test):  44%|████▍     | 29079/66292 [00:02<00:03, 10006.91it/s]

Stemming... (test):  45%|████▌     | 30090/66292 [00:02<00:03, 9794.77it/s] 

Stemming... (test):  47%|████▋     | 31127/66292 [00:02<00:03, 9960.04it/s]

Stemming... (test):  49%|████▊     | 32217/66292 [00:02<00:03, 10222.97it/s]

Stemming... (test):  50%|█████     | 33450/66292 [00:02<00:03, 10773.23it/s]

Stemming... (test):  52%|█████▏    | 34546/66292 [00:03<00:02, 10825.23it/s]

Stemming... (test):  54%|█████▍    | 35638/66292 [00:03<00:02, 10751.11it/s]

Stemming... (test):  55%|█████▌    | 36733/66292 [00:03<00:02, 10796.03it/s]

Stemming... (test):  57%|█████▋    | 37817/66292 [00:03<00:02, 10748.86it/s]

Stemming... (test):  59%|█████▉    | 38974/66292 [00:03<00:02, 10975.26it/s]

Stemming... (test):  61%|██████    | 40194/66292 [00:03<00:02, 11314.55it/s]

Stemming... (test):  62%|██████▏   | 41336/66292 [00:03<00:02, 11345.58it/s]

Stemming... (test):  64%|██████▍   | 42475/66292 [00:03<00:02, 11333.27it/s]

Stemming... (test):  66%|██████▌   | 43664/66292 [00:03<00:01, 11492.04it/s]

Stemming... (test):  68%|██████▊   | 44816/66292 [00:03<00:01, 10981.34it/s]

Stemming... (test):  69%|██████▉   | 45921/66292 [00:04<00:01, 10645.19it/s]

Stemming... (test):  71%|███████   | 47008/66292 [00:04<00:01, 10708.35it/s]

Stemming... (test):  73%|███████▎  | 48114/66292 [00:04<00:01, 10810.47it/s]

Stemming... (test):  74%|███████▍  | 49230/66292 [00:04<00:01, 10912.45it/s]

Stemming... (test):  76%|███████▌  | 50395/66292 [00:04<00:01, 11123.28it/s]

Stemming... (test):  78%|███████▊  | 51561/66292 [00:04<00:01, 11278.90it/s]

Stemming... (test):  80%|███████▉  | 52706/66292 [00:04<00:01, 11326.29it/s]

Stemming... (test):  81%|████████  | 53841/66292 [00:04<00:01, 10114.64it/s]

Stemming... (test):  83%|████████▎ | 54957/66292 [00:04<00:01, 10406.83it/s]

Stemming... (test):  85%|████████▍ | 56139/66292 [00:04<00:00, 10793.54it/s]

Stemming... (test):  86%|████████▋ | 57297/66292 [00:05<00:00, 11017.62it/s]

Stemming... (test):  88%|████████▊ | 58453/66292 [00:05<00:00, 11174.43it/s]

Stemming... (test):  90%|████████▉ | 59581/66292 [00:05<00:00, 10645.50it/s]

Stemming... (test):  92%|█████████▏| 60658/66292 [00:05<00:00, 10106.35it/s]

Stemming... (test):  93%|█████████▎| 61683/66292 [00:05<00:00, 9675.96it/s] 

Stemming... (test):  95%|█████████▍| 62743/66292 [00:05<00:00, 9932.18it/s]

Stemming... (test):  96%|█████████▋| 63875/66292 [00:05<00:00, 10306.98it/s]

Stemming... (test):  98%|█████████▊| 64918/66292 [00:05<00:00, 10288.19it/s]

Stemming... (test):  99%|█████████▉| 65955/66292 [00:05<00:00, 10141.56it/s]

Stemming... (test): 100%|██████████| 66292/66292 [00:05<00:00, 11063.15it/s]

(66292, 3)





Unnamed: 0_level_0,SentenceId,Phrase,Phrase(original)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
156061,8545,an intermitt pleas but most routin effort .,
156062,8545,an intermitt pleas but most routin effort,
156063,8545,an,
156064,8545,intermitt pleas but most routin effort,
156065,8545,intermitt pleas but most routin,


# Cleantext -- 2

In [7]:
def clean_text(phrase):
    phrase = phrase.replace("n't", "not")
    phrase = phrase.replace("hopeless", "bad")
    phrase = phrase.replace("good", "best")
    phrase = phrase.replace("excellent", "best")
    phrase = phrase.replace("funni", "fun")
    phrase = phrase.replace("funny", "fun")
    phrase = phrase.replace("littl", "little")
    phrase = phrase.replace("the movi", "movie")
    phrase = phrase.replace("veri", "very")
    phrase = phrase.replace("onli", "only")
    phrase = phrase.replace("comedi", "comedy")
    phrase = phrase.replace("veri", "very")
    phrase = phrase.replace("stori", "story")
    phrase = phrase.replace("charact", "character")
    
    #Hypothesis
    phrase = phrase.replace("not", "no")
    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)
test["Phrase"] = test["Phrase"].apply(clean_text)


## one Hot Encode Phrases

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = TfidfVectorizer(analyzer = 'char', 
                                  max_features = 10000, 
                                  ngram_range = (1,9))

char_vectorizer.fit(train["Phrase"])
%time 
#analyzer = word, character
#기타 - 피쳐로 단어수

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.82 µs


In [9]:
word_vectorizer = TfidfVectorizer(analyzer = 'word', 
                                  max_features = 30000, 
                                  ngram_range = (1,4))

word_vectorizer.fit(train["Phrase"])

#analyzer = word, character

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

### fit and transform </br></br>
Fitting finds the internal parameters of a model that will be used to transform data. </br></br>

Transforming applies the parameters to data.</br></br>

You may fit a model to one set of data, and then transform it on a completely different set. </br>

In [10]:
X_train_word = word_vectorizer.transform(train["Phrase"])
print(X_train_word.shape)
X_train_char = char_vectorizer.transform(train["Phrase"])
print(X_train_char.shape)

from scipy.sparse import hstack #vstack, hstack (word, character to merge )
X_train = hstack([X_train_char, X_train_word])
print(X_train.shape)
X_train

(156060, 30000)


(156060, 10000)


(156060, 40000)


<156060x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 19680572 stored elements in COOrdinate format>

In [11]:
X_test_word = word_vectorizer.transform(test["Phrase"])
print(X_test_word.shape)
X_test_char = char_vectorizer.transform(test["Phrase"])
print(X_test_char.shape)

from scipy.sparse import hstack #vstack, hstack (word, character to merge )
X_test = hstack([X_test_char, X_test_word])
print(X_test.shape)
X_test

(66292, 30000)


(66292, 10000)


(66292, 40000)


<66292x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 7612200 stored elements in COOrdinate format>

In [12]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

## Score 

* random forest classifier 하기에 column이 1000개나 되어서, 너무 오래걸림 </br></br>

#### Machine learning</br>
- Supervised: if label exist </br>
    - Tree =  DecisionTree, RandomForest(+Ensemble)</br>
    - Regression = 선형(y = ax + b, x = feature, y = label)</br></br></br>
    - cross validation to pick which one you prefer: 
    use "recent_reservations" and find the features that matters. Divide the data to trian and test and rerun the analysis !
- Unsupervised: ^ otherwise </br>
- Reinforcement: 강화학습은 너무 유니크하다? 거의 쓰이지않는다? </br></br>



In [13]:
from sklearn.linear_model import SGDClassifier

seed = 23

model = SGDClassifier(n_jobs=-1,
                      alpha = 0.00005,
                      random_state=seed)

model

SGDClassifier(alpha=5e-05, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=-1, penalty='l2', power_t=0.5, random_state=23, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [14]:
#cross validation

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
# http://scikit-learn.org/stable/modules/classes.html
predictions = cross_val_predict(model, X_train, y_train, cv = 5)

score = accuracy_score(y_train, predictions)

print("Score = {0:5f}".format(score))












Score = 0.592452


모델 정확도를 높히는 방법으로, 미리 맞춰놓은 값들을 더 잘 맞출 필요는 없다. 
그래서 못맞춘 데이터만 떼어서, 그걸 계속 튜닝하는식으로 진행하는게 훨씬 효과적이다.

In [15]:
result = pd.DataFrame({'actual': y_train, 'predict': predictions})

result['distance'] = np.abs(result['actual'] - result['predict'])
result['Phrase'] = train['Phrase']
result = result.sort_values(by = 'distance', ascending = False)

result.head(1000).to_csv("cv_results_test_5.csv")
print(result.shape)
result.head()
#틀린이유 / 중요키워드 > 데이터 클리닝 > 다시돌리기...
#위에 코이피션트 확인하러 올라감.

(156060, 4)


Unnamed: 0_level_0,actual,predict,distance,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
145348,0,4,4,is hard a masterpiec
23806,0,4,4,"on it ici face , the new film is a subzero ver..."
11777,0,4,4,", this is the opposit of a truli magic movi ."
11778,0,4,4,this is the opposit of a truli magic movi .
66532,0,4,4,one of the saddest action hero perform ever wit


In [16]:
from sklearn.model_selection import cross_val_score, GroupKFold
# from sklearn.cross_validation import cross_val_score, GroupKFold

kfold = GroupKFold(n_splits=5)

score = cross_val_score(model, X_train, y_train, cv=kfold, groups=train["SentenceId"]).mean()

print("Score = {0:.5f}".format(score))
#57962
#58078
#58579
#58597











Score = 0.59852


## Predictions

In [17]:
import xgboost as xgb

In [24]:
predictions = booster.predict(dtest)
print(predictions.shape)
predictions[0:10]

(66292,)


array([ 3.,  3.,  2.,  3.,  3.,  3.,  3.,  2.,  3.,  2.], dtype=float32)

In [18]:
dtrain = xgb.DMatrix(X_train, label = y_train)

In [19]:
params = {
    'booster': 'gblinear',
    'objective': 'multi:softmax',
    'lambda': 2.186753e-03,
    'alpha': 1.286904,
    'lambda_bias': 6.191707e+00,
    'num_class': 5,
    'nthread':2,
}

booster = xgb.train(params, dtrain, num_boost_round=98)
booster

<xgboost.core.Booster at 0x1a21c38c88>

In [22]:
dtest = xgb.DMatrix(X_test.toarray())

NameError: name 'predictons' is not defined

# Submit

In [25]:
submission = pd.read_csv("./data/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3.0
156062,3.0
156063,2.0
156064,3.0
156065,3.0


In [None]:
submission.to_csv("tfidf-xgboost.csv")