In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input mount.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.sequence import pad_sequences
import keras
from keras.layers import Input,Embedding,Bidirectional,LSTM,Dense,Dropout,TimeDistributed,GlobalAveragePooling1D,BatchNormalization,GlobalMaxPool1D
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import re
import seaborn as sns

In [None]:
# Load the train / test CSV files of the COVID-19 tweet dataset.
# "L1" is a Python codec alias for Latin-1 (ISO-8859-1).
# NOTE(review): presumably chosen because the files are not valid UTF-8 — confirm.
train_df = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_train.csv",encoding="L1")
test_df = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_test.csv",encoding="L1")

# Quick sanity check: print the (rows, columns) shape of each split.
print(f"train dataset shape >> {train_df.shape}")
print(f"test dataset shape >> {test_df.shape}")

def data_label_split(dataset):
    """Return the tweet text and the sentiment label of ``dataset``.

    Parameters
    ----------
    dataset : mapping-like (e.g. ``pandas.DataFrame``)
        Must provide the keys ``'OriginalTweet'`` and ``'Sentiment'``.

    Returns
    -------
    tuple
        ``(dataset['OriginalTweet'], dataset['Sentiment'])``, in that order.
    """
    return dataset['OriginalTweet'], dataset['Sentiment']

# Extract the text / sentiment columns from each raw frame.
train_data, train_label = data_label_split(train_df)
test_data, test_label = data_label_split(test_df)

# Compact two-column working frames: 'label' first, then 'data'.
train = pd.DataFrame({'label': train_label, 'data': train_data})
test = pd.DataFrame({'label': test_label, 'data': test_data})

def reassign_label(x):
    """Collapse the five sentiment strings into three polarity codes.

    Returns
    -------
    int or None
        ``1`` for "Extremely Positive" / "Positive", ``-1`` for
        "Extremely Negative" / "Negative", ``0`` for "Neutral" and
        ``None`` for any other value.
    """
    if x in ("Extremely Positive", "Positive"):
        return 1
    if x in ("Extremely Negative", "Negative"):
        return -1
    if x == "Neutral":
        return 0
    # Unrecognised labels fall through without a mapping.
    return None

# Replace the sentiment strings with their polarity codes in both frames.
train['label'] = train['label'].apply(reassign_label)
test['label'] = test['label'].apply(reassign_label)

# Re-export the columns under the names the following cells work with.
train_data, train_label = train['data'], train['label']
test_data, test_label = test['data'], test['label']

# Show a random handful of rows to eyeball the mapping.
train.sample(15)

<1-1> 2글자 이하 단어 제거, @덩어리 제거, #제거, url 주소 제거

In [None]:
# Patterns used by the text-cleaning step (kept at module level under their original names).
shortword = re.compile(r"\b\w{1,2}\b")                   # whole words of one or two characters
hashtag = re.compile(r"@[a-zA-Z0-9_]*")                  # despite the name: "@" followed by handle characters
website = re.compile(r"(http|https):*/+[a-zA-Z0-9./]*")  # http / https links


def remove_short(data):
    """Strip short words, @-handles, links and '#' characters from each text.

    Parameters
    ----------
    data : iterable of str
        Raw texts.

    Returns
    -------
    list of str
        One cleaned, whitespace-stripped string per input, in input order.
    """
    def _scrub(text):
        # Same order as the patterns are declared: short words, handles, links.
        for pattern in (shortword, hashtag, website):
            text = pattern.sub('', text)
        return text.replace("#", "").strip()

    return [_scrub(text) for text in data]

# Clean both splits with the same routine, then confirm the row counts.
train_data, test_data = remove_short(train_data), remove_short(test_data)

print(len(train_data), len(test_data), sep="\n")

<1-2> tokenize & stopwords 제거 (nltk.corpus)

In [None]:
# Build the English stop-word set once; set membership is what the filter below relies on.
# NOTE(review): needs the NLTK "stopwords" corpus to be available in the environment — confirm.
swords = stopwords
english_stopword_list = swords.words('english')
stop_words = set(english_stopword_list)

print(len(stop_words))
print("stopwords samples >> ",english_stopword_list[:10])

def tokenize(data):
    """Tokenise every sentence in ``data`` with NLTK's ``word_tokenize``.

    Parameters
    ----------
    data : iterable of str
        Sentences to split.

    Returns
    -------
    list
        One ``word_tokenize`` result per sentence, in input order.
    """
    return [word_tokenize(sentence) for sentence in data]

def remove_stopwords(data, stopword_set=None):
    """Drop stop words from every tokenised sentence, ignoring letter case.

    The original comparison was case-sensitive, so capitalised stop words
    such as "The" or "And" were kept. The Keras ``Tokenizer`` used later
    lower-cases its input by default, which put them back into the vocabulary.
    Tokens are now compared in lower case; the tokens that are kept retain
    their original casing.

    Parameters
    ----------
    data : iterable of iterable of str
        Tokenised sentences.
    stopword_set : collection of str, optional
        Lower-case words to remove. Defaults to the module-level
        ``stop_words`` set, which is the original behaviour.

    Returns
    -------
    list of list of str
        The sentences in input order with stop words removed. A sentence
        may become an empty list.
    """
    if stopword_set is None:
        stopword_set = stop_words

    return [
        [tok for tok in sentence if tok.lower() not in stopword_set]
        for sentence in data
    ]
        
# Tokenise each split, then strip stop words from the resulting token lists.
train_data = remove_stopwords(tokenize(train_data))
test_data = remove_stopwords(tokenize(test_data))

print(len(train_data), len(test_data), sep="\n")

+) Remove empty rows

In [None]:
# Re-pair the labels with the tokenised tweets.
train = pd.DataFrame({'label': train_label, 'data': train_data})
test = pd.DataFrame({'label': test_label, 'data': test_data})


def _nan_if_empty(tokens):
    """Return ``np.nan`` for an empty token list, otherwise the list unchanged."""
    return tokens if len(tokens) != 0 else np.nan


# Mark tweets with no tokens left as missing, then drop every row holding a missing value.
train['data'] = train['data'].apply(_nan_if_empty)
test['data'] = test['data'].apply(_nan_if_empty)
train = train.dropna()
test = test.dropna()

# Re-export the surviving rows under the names used downstream.
train_data, train_label = train['data'], train['label']
test_data, test_label = test['data'], test['label']

print(len(train_data), len(test_data), sep="\n")

<1-3> Fit a tokenizer to check how many words occur only once, then re-fit it with a capped vocabulary & map text to integers

In [None]:
# Fit an uncapped tokenizer to inspect the full training vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)
print(f"Size of vocabs >> {len(tokenizer.word_index)}")

# Share of all word occurrences that comes from words seen fewer than two times.
word_counts = tokenizer.word_counts
total_freq = sum(word_counts.values())
singleton_counts = [count for count in word_counts.values() if count < 2]
k = len(singleton_counts)
freq = sum(singleton_counts)
print(f"freq/total_freq >> {(freq/total_freq)*100}")
print(f"{k} words are used only for once")

In [None]:
# Vocabulary cap passed to the tokenizer, and the size the models use as the
# Embedding input dimension (one more than the cap).
# NOTE(review): 25000 is hard-coded rather than derived from the frequency statistics — confirm the choice.
word_size = 25000
vocab_size = word_size+1

# Re-fit a tokenizer with the cap, on the training split only.
# NOTE(review): per the Keras docs `num_words` keeps only the most frequent words when
# converting texts; no `oov_token` is set, so other words are presumably dropped — confirm.
tokenizer = Tokenizer(num_words=word_size)
tokenizer.fit_on_texts(train_data)

# Lookup tables between words and integer ids.
word_to_index = tokenizer.word_index
index_to_word = tokenizer.index_word

# Convert both splits to integer sequences using the mapping fitted on the training data.
train_data = tokenizer.texts_to_sequences(train_data)
test_data = tokenizer.texts_to_sequences(test_data)

<1-4> Padding

In [None]:
# Distribution of training sequence lengths, used to choose the padded length.
lens = list(map(len, train_data))

plt.hist(lens, bins=200)
plt.show()

sequence_size = 50

# Pad / truncate at the end of each sequence so every row has exactly `sequence_size` ids.
pad_options = dict(maxlen=sequence_size, padding='post', truncating='post')
train_data = pad_sequences(train_data, **pad_options)
test_data = pad_sequences(test_data, **pad_options)

<2> one-hot encode label

In [None]:
# Class balance of the training labels.
sns.countplot(x=train_label)
plt.tight_layout()
plt.show()

# One-hot encode with a fixed column layout.
# Encoding each split independently would give a different number or order of columns
# whenever a class is missing from one split, so both frames are pinned to the same
# three polarity columns. float32 avoids the bool dummies newer pandas returns by default.
label_columns = [-1, 0, 1]
train_label = pd.get_dummies(train_label, dtype="float32").reindex(columns=label_columns, fill_value=0.0)
test_label = pd.get_dummies(test_label, dtype="float32").reindex(columns=label_columns, fill_value=0.0)

In [None]:
# Display the (rows, columns) shape of the one-hot test labels; the models below end in a 3-unit softmax.
test_label.shape

<3> Modeling

<3-1> LSTM model

In [None]:
# Hyper-parameters read by create_LSTM at call time.
word_vec_size = 32   # width of each embedding vector
hidden_size = 256    # units of the LSTM wrapped by Bidirectional


def create_LSTM():
    """Build and compile the bidirectional-LSTM classifier.

    Reads the module-level ``sequence_size``, ``vocab_size``,
    ``word_vec_size`` and ``hidden_size`` when called.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the Adam optimizer
        and the accuracy metric; the output is a 3-unit softmax.
    """
    token_ids = Input(shape=[sequence_size])

    # Embed the ids, encode the sequence in both directions, max-pool over time.
    embedded = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(token_ids)
    encoded = Bidirectional(LSTM(hidden_size, return_sequences=True))(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    pooled = Dropout(0.4)(pooled)

    # Classification head.
    hidden = Dense(64, activation='relu')(pooled)
    hidden = Dropout(0.4)(hidden)
    class_probs = Dense(3, activation='softmax')(hidden)

    model = keras.models.Model(token_ids, class_probs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# Build the LSTM model and train it for 5 epochs in batches of 128,
# holding out 20% of the training data for validation.
# NOTE(review): no random seed is set anywhere in the visible cells, so results will vary between runs.
lstm = create_LSTM()
hist_lstm = lstm.fit(train_data,train_label,epochs=5,validation_split=0.2,batch_size=128)
print("\nEvaluation on test dataset >>\n")
# Last expression of the cell: shows the loss and the accuracy metric on the test split.
lstm.evaluate(test_data,test_label)

<3-2> Multi Kernel Conv1D

In [None]:
from keras.callbacks import EarlyStopping
from keras.layers import Concatenate,Conv1D,GlobalMaxPooling1D,Flatten

# Early-stopping callback watching the validation loss (lower is better) with a patience of 3.
# It only has an effect when it is passed to `fit` through `callbacks=`.
es = EarlyStopping(monitor='val_loss',mode='min',patience=3,verbose=1)


# Hyper-parameters read by create_Conv1D at call time.
# NOTE: this rebinds the module-level `word_vec_size` (32 in the LSTM cell), so calling
# create_LSTM after this cell would also use 128.
word_vec_size = 128
num_filters=128

def create_Conv1D(kernel_sizes=(3, 4, 5)):
    """Build and compile the multi-kernel 1-D convolutional classifier.

    One Conv1D + global-max-pooling branch is created per kernel size, and
    the pooled branch outputs are concatenated before the dense head.
    Reads the module-level ``sequence_size``, ``vocab_size``,
    ``word_vec_size`` and ``num_filters`` when called.

    Parameters
    ----------
    kernel_sizes : iterable of int, optional
        Convolution window sizes, one branch each. The default is an
        immutable tuple instead of a shared mutable list.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the Adam optimizer
        and the accuracy metric; the output is a 3-unit softmax.

    Raises
    ------
    ValueError
        If ``kernel_sizes`` is empty (previously a bare ``IndexError``).
    """
    kernel_sizes = list(kernel_sizes)
    if not kernel_sizes:
        raise ValueError("kernel_sizes must contain at least one kernel size")

    X = Input(shape=[sequence_size], name="Input")

    H = Embedding(vocab_size, word_vec_size, input_length=sequence_size)(X)
    D = Dropout(0.6)(H)

    conv_blocks = []
    for k_size in kernel_sizes:
        H = Conv1D(filters=num_filters, kernel_size=k_size, padding='valid')(D)
        # Global max pooling already yields a 2-D (batch, filters) tensor,
        # so the former Flatten layer was a no-op and has been removed.
        H = GlobalMaxPooling1D()(H)
        conv_blocks.append(H)

    # Concatenate needs at least two inputs; a single branch is used as-is.
    H = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    H = Dropout(0.8)(H)
    H = Dense(128, activation='relu')(H)
    Y = Dense(3, activation='softmax')(H)

    model = keras.models.Model(X, Y)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Build the convolutional model and train it, holding out 20% of the training data for validation.
conv1D = create_Conv1D()
# `es` (the EarlyStopping callback created earlier in this cell) was never handed to fit(),
# so it had no effect; it is now wired in.
# NOTE(review): with epochs=3 and patience=3 it presumably cannot trigger yet — it starts to
# matter once the epoch count is raised.
hist = conv1D.fit(
    train_data,
    train_label,
    epochs=3,
    validation_split=0.2,
    batch_size=128,
    callbacks=[es],
)
print("\n\nEvaluation on test dataset >>\n\n")
# Last expression of the cell: shows the loss and the accuracy metric on the test split.
conv1D.evaluate(test_data, test_label)

<3-3> Naive Bayes Models

(1) These models require additional text preprocessing (token to text)

In [None]:
# Disabled experiment: rebuild plain-text tweets from the padded id sequences,
# skipping id 0 and dropping tweets that end up empty.
# def token_to_text(data):
#     ret =[]
#     for sentence in data:
#         result = []
#         for tok in sentence:
#             if tok != 0:
#                 result.append(index_to_word[tok])
#         if len(result) != 0:
#             result = " ".join(result)
#             ret.append(result)
#     return ret

# train_x = token_to_text(train_data)
# test_x  = token_to_text(test_data)

# Instead, the Naive Bayes baseline is fed the untouched tweets from the raw frames ...
train_x = train_df.OriginalTweet
test_x = test_df.OriginalTweet

# ... and the original `Sentiment` strings (not the -1/0/1 polarity codes used by the neural models).
train_y = train_df.Sentiment
test_y  =test_df.Sentiment

In [None]:
vectorizer = CountVectorizer()
transformer = TfidfTransformer()

# Fit the vocabulary and the TF-IDF weights on the training tweets only;
# the test tweets reuse the fitted vectorizer and transformer.
train_x = transformer.fit_transform(vectorizer.fit_transform(train_x))
test_x = transformer.transform(vectorizer.transform(test_x))

for feature_matrix in (train_x, test_x):
    print(feature_matrix.shape)

In [None]:
# Multinomial Naive Bayes baseline on the TF-IDF features.
multiNB = MultinomialNB()
multiNB.fit(train_x, train_y)

# Accuracy on the test split, as a percentage rounded to two decimals.
test_pred = multiNB.predict(test_x)
acc = np.round(accuracy_score(test_y, test_pred) * 100, 2)
print(f"test acc >> {acc}%")

preprocessing 안하고 그냥 넣으니까 결과 처참

In [None]:
# Reload the raw CSVs so this cell starts from untouched data.
train_df = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_train.csv",encoding="L1")
test_df = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_test.csv",encoding="L1")

# Untouched copies of the two columns of interest.
df1 = train_df[['Sentiment','OriginalTweet']].copy()
df2 = test_df[['Sentiment','OriginalTweet']].copy()

space = re.compile(r"\s{2,}")


def _clean_tweets(tweets):
    """Remove @-handles, links and '#' from a Series of tweets and tidy the whitespace.

    The same routine is applied to both splits. Previously '#' became a
    space in the training tweets but an empty string in the test tweets.
    ``regex`` is passed explicitly because ``Series.str.replace`` treats the
    pattern as a literal by default in pandas 2.x and rejects compiled patterns
    in that mode.
    """
    return (
        tweets
        .str.replace(hashtag, '', regex=True)
        .str.replace(website, '', regex=True)
        .str.replace("#", " ", regex=False)
        .str.replace(space, " ", regex=True)
        .str.strip()
    )


train_df['OriginalTweet'] = _clean_tweets(train_df['OriginalTweet'])
test_df['OriginalTweet'] = _clean_tweets(test_df['OriginalTweet'])

# Only drop rows whose text or label is missing. A whole-row dropna would also discard
# tweets that merely have a missing value in a column this model never uses.
model_columns = ['OriginalTweet', 'Sentiment']
train_df = train_df.dropna(subset=model_columns)
test_df = test_df.dropna(subset=model_columns)

train_x = train_df.OriginalTweet
train_y = train_df.Sentiment
test_x = test_df.OriginalTweet
test_y = test_df.Sentiment

In [None]:
vectorizer = CountVectorizer()
transformer = TfidfTransformer()

# Fit the vocabulary and the TF-IDF weights on the cleaned training tweets only;
# the test tweets reuse the fitted vectorizer and transformer.
train_x = transformer.fit_transform(vectorizer.fit_transform(train_x))
test_x = transformer.transform(vectorizer.transform(test_x))

for feature_matrix in (train_x, test_x):
    print(feature_matrix.shape)

In [None]:
# Multinomial Naive Bayes on the TF-IDF features of the cleaned tweets.
multiNB = MultinomialNB()
multiNB.fit(train_x, train_y)

# Training accuracy (percentage, two decimals). It was previously printed under the
# label "test acc", which made the two output lines indistinguishable.
train_pred = multiNB.predict(train_x)
train_acc = accuracy_score(train_y, train_pred)
train_acc = np.round(train_acc * 100, 2)
print(f"train acc >> {train_acc}%")

# Held-out test accuracy (percentage, two decimals).
test_pred = multiNB.predict(test_x)
test_acc = accuracy_score(test_y, test_pred)
test_acc = np.round(test_acc * 100, 2)
print(f"test acc >> {test_acc}%")

@, 웹사이트, # 제거 했는데도 결과 처참