## tensorflow_nlp.ipynb
## NLP : 자연어 처리
## 1) Classification (분류) - 라벨링이 필요
## 2) Machine Translation (번역)
## 3) Text Generation (문장 생성)
## 4) Voice Assistants (보이스 비서)

==> 이런 자연어 처리들을 하기 위해서 필요한 것이 -> sequence problems!!! (문장을 이루는 단어를 숫자로 표현하는 문제)
==> NLP (Natural Language Processing) : 자연어에서 정보를 추출해내는 것이 목적
==> NLU (Natural Language Understanding) : 자연어를 이해하고 그에 맞게 행동하는 것이 목적

자연어에서 이야기하는 용어
1) Text 
2) Speech 

## 우리가 하려는 과정은
텍스트 -> 이 텍스트를 숫자로 변환 -> 모델 생성 -> 패턴을 찾기 위해 모델을 학습 -> 패턴 이용 (예측)

In [1]:
import tensorflow as tf
# Print the installed TensorFlow version so the environment used for this run is recorded.
print(tf.__version__)

2.6.0


In [2]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [3]:
# Unpack the dataset archive using the project-local helper imported from helper_functions.
# NOTE(review): presumably extracts into the current working directory, where the CSV files
# are read by the next cell — confirm against the helper's implementation.
unzip_data("nlp_getting_started.zip")

In [4]:
import pandas as pd

# Load the training and test data from CSV files in the working directory
# (train.csv is read first, then test.csv).
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Preview the first rows and the (rows, columns) shape of each frame.
train_df.head(), train_df.shape, test_df.head(), test_df.shape

(   id keyword location                                               text  \
 0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
 1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
 2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
 3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
 4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   
 
    target  
 0       1  
 1       1  
 2       1  
 3       1  
 4       1  ,
 (7613, 5),
    id keyword location                                               text
 0   0     NaN      NaN                 Just happened a terrible car crash
 1   2     NaN      NaN  Heard about #earthquake is different cities, s...
 2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
 3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
 4  11     NaN      NaN      Typhoon Soudelor kills 28 in China 

In [5]:
# Shuffle every row: frac=1 samples 100% of the frame, and the fixed random_state
# makes the resulting order reproducible.
train_df_shuffled = train_df.sample(frac = 1, random_state=42)
# Preview the shuffled frame; rows keep their original index labels.
train_df_shuffled.head(), train_df_shuffled.shape

(        id      keyword               location  \
 2644  3796  destruction                    NaN   
 2227  3185       deluge                    NaN   
 5448  7769       police                     UK   
 132    191   aftershock                    NaN   
 6845  9810       trauma  Montgomery County, MD   
 
                                                    text  target  
 2644  So you have a new weapon that can cause un-ima...       1  
 2227  The f$&amp;@ing things I do for #GISHWHES Just...       0  
 5448  DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...       1  
 132   Aftershock back to school kick off was great. ...       0  
 6845  in response to trauma Children of Addicts deve...       0  ,
 (7613, 5))

## 우리가 하려는 것
## 입력 (text 컬럼) -> 머신러닝 알고리즘 -> 출력 (target 컬럼)

In [6]:
# Count how many rows fall into each target class.
# Only the values 0 and 1 occur, so this is a binary classification problem.
train_df["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
# Report the number of training rows, test rows, and their combined total
# (one message per line).
print(
    f"총 학습할 데이터의 수 : {len(train_df)}",
    f"총 테스트할 데이터의 수 : {len(test_df)}",
    f"총 데이터의 수 : {len(train_df) + len(test_df)}",
    sep="\n",
)

총 학습할 데이터의 수 : 7613
총 테스트할 데이터의 수 : 3263
총 데이터의 수 : 10876


In [9]:
import random

# Pick a random start position such that a window of five consecutive rows always fits.
random_index = random.randint(0, len(train_df) - 5)

# Walk over five consecutive (text, target) pairs of the shuffled frame, selected by position.
# NOTE(review): the printed tags translate to "negative"/"positive"; the data preview suggests
# target 1 marks disaster-related tweets — confirm the intended wording of these labels.
for text, target in (
    train_df_shuffled[["text", "target"]]
    .iloc[random_index:random_index + 5]
    .itertuples(index=False)
):
    print(f"Target: {target}", "부정" if target > 0 else "긍정")
    print(f"Text: \n{text}\n")
    print("---\n")

Target: 0 긍정
Text: 
Dying with debt can be costly for survivors

---

Target: 0 긍정
Text: 
Caution: breathing may be hazardous to your health.

---

Target: 0 긍정
Text: 
@SmusX16475 Skype just crashed u host

---

Target: 1 부정
Text: 
Christian Attacked by Muslims at the Temple Mount after Waving Israeli Flag via Pamela Geller - ... http://t.co/T1aa5Ov7Eg

---

Target: 1 부정
Text: 
Man charged over fatal crash near Dubbo refused bail http://t.co/HDBMfOVUtZ via @dailyliberal

---



In [12]:
# Inspect the container type of the text column alongside its contents
# (a pandas Series at this point).
type(train_df_shuffled["text"]), train_df_shuffled["text"]

(pandas.core.series.Series,
 2644    So you have a new weapon that can cause un-ima...
 2227    The f$&amp;@ing things I do for #GISHWHES Just...
 5448    DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...
 132     Aftershock back to school kick off was great. ...
 6845    in response to trauma Children of Addicts deve...
                               ...                        
 5226    @Eganator2000 There aren't many Obliteration s...
 5390    just had a panic attack bc I don't have enough...
 860     Omron HEM-712C Automatic Blood Pressure Monito...
 7603    Officials say a quarantine is in place at an A...
 7270    I moved to England five years ago today. What ...
 Name: text, Length: 7613, dtype: object)

In [13]:
# Same inspection after .to_numpy(): the tweets are now held in a NumPy array of Python objects.
type(train_df_shuffled["text"].to_numpy()), train_df_shuffled["text"].to_numpy()

(numpy.ndarray,
 array(['So you have a new weapon that can cause un-imaginable destruction.',
        'The f$&amp;@ing things I do for #GISHWHES Just got soaked in a deluge going for pads and tampons. Thx @mishacollins @/@',
        'DT @georgegalloway: RT @Galloway4Mayor: \x89ÛÏThe CoL police can catch a pickpocket in Liverpool Stree... http://t.co/vXIn1gOq4Q',
        ...,
        'Omron HEM-712C Automatic Blood Pressure Monitor STANDARD AND LARGE BP CUFFS http://t.co/gJBAInQWN9 http://t.co/jPhgpL1c5x',
        'Officials say a quarantine is in place at an Alabama home over a possible Ebola case after developing symptoms... http://t.co/rqKK15uhEY',
        'I moved to England five years ago today. What a whirlwind of time it has been! http://t.co/eaSlGeA1B7'],
       dtype=object))

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation and keep the remaining 90% for training.
# The fixed random_state keeps the split reproducible across runs.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),  # inputs: the text column as a NumPy array
    train_df_shuffled["target"].to_numpy(),  # labels: the target column as a NumPy array
    test_size = 0.1,
    random_state= 42
)

In [15]:
# Sanity check: within each split, the number of sentences should equal the number of labels.
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [18]:
# Peek at the first ten training sentences together with their labels.
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,