In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

(1) Load Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import keras
from keras.utils import to_categorical

import re
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


(2) Load tweets dataset

In [None]:
# Read the airline-sentiment tweets CSV and preview the first five rows.
tweets_csv_path = "../input/twitter-airline-sentiment/Tweets.csv"
tweets = pd.read_csv(tweets_csv_path)
tweets.head()

In [None]:
print(tweets.columns)

# Keep only the label column (renamed to "sentiment") and the raw tweet text.
df = (
    tweets[["airline_sentiment", "text"]]
    .rename(columns={"airline_sentiment": "sentiment"})
    .copy()
)
print(f"Shape of our dataset >> {df.shape}")
df.head()

In [None]:
# Encode the string labels as integers: negative -> -1, neutral -> 0, positive -> 1.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `df.sentiment`: that form mutates an intermediate
# Series, emits a chained-assignment warning on recent pandas and does nothing
# at all once copy-on-write is enabled.
# replace() (unlike map()) leaves already-encoded values untouched, so the cell
# can safely be re-run.
df["sentiment"] = (
    df["sentiment"]
    .replace({'negative': -1, 'neutral': 0, 'positive': 1})
    .astype(int)
)

# A bare expression in the middle of a cell is not rendered, so display explicitly.
display(df.sample(5, random_state=42))

sns.set_style("whitegrid")
# Pin the category order so the tick labels below cannot drift out of sync
# with the bars.
sns.countplot(data=df, x='sentiment', order=[-1, 0, 1])
index = [0,1,2]
plt.xticks(index,['negative','neutral','positive'])
plt.title("Distribution of sentiment labels")
plt.show()

(3) Clean text data:
* remove URLs
* remove short words (words of length 1 or 2)
* remove @
* remove #
* remove stopwords using the nltk.corpus module

In [None]:
stop_words = set(stopwords.words('english'))

# Words made of one or two word characters.
shortword = re.compile(r"\b\w{1,2}\b")
# Match the scheme and everything up to the next whitespace so that URLs
# containing hyphens, underscores, query strings, etc. are removed in full
# instead of leaving a trailing fragment behind.
url = re.compile(r"https?://\S+")

def clean(text):
    """Normalise one tweet for tokenisation.

    Steps, in order:
      1. strip URLs,
      2. drop the '@' and '#' characters (the word that follows is kept),
      3. drop NLTK English stopwords, compared case-insensitively,
      4. drop words of one or two characters,
      5. collapse runs of whitespace.

    Stopwords are filtered before short words are stripped so that
    contractions present in the stopword list (e.g. "don't") are matched as a
    whole rather than being broken into fragments first.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text with single spaces between the remaining words.
    """
    text = url.sub('', text)
    text = text.replace('@', '').replace('#', '')

    # The stopword list is lower-case, so compare on the lower-cased word.
    words = [word for word in text.split() if word.lower() not in stop_words]

    text = shortword.sub('', " ".join(words))

    # Short-word removal can leave double spaces; normalise them away.
    return " ".join(text.split())

# Clean every tweet in place, then preview the first 15 cleaned texts.
df["text"] = df["text"].map(clean)
df["text"].head(15)

(4) Tokenize (words to integers)

In [None]:
# Fit an uncapped tokenizer to measure how much of the corpus is made of rare words.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
print(f"{len(tokenizer.word_index)} words are used\n")

counts = tokenizer.word_counts
print(len(counts))

# A word is "rare" when it occurs fewer than `thread` times.
thread=2

total_freq = sum(counts.values())
rare_values = [value for value in counts.values() if value < thread]
# Number of distinct rare words (one per word, not its frequency) ...
rare_counts = len(rare_values)
# ... and the total number of occurrences those words account for.
rare_freq = sum(rare_values)

print(f"{rare_counts} are used less than {thread} times")
print(f"And these words accounts for {np.round(rare_freq/total_freq*100,2)}% of whole texts")

In [None]:
print("Tokenize only 7000 words.\nOther words are considered OOV")
word_size=7000
# +1 because index 0 is reserved for padding.
vocab_size = word_size+1
# Keras' Tokenizer only emits indices strictly below `num_words`, i.e. it keeps
# the `num_words - 1` most frequent words. Pass vocab_size so that exactly
# `word_size` words (indices 1..word_size) are kept, as announced above.
tokenizer = Tokenizer(num_words=vocab_size)

tokenizer.fit_on_texts(df.text)
# Words outside the kept vocabulary are dropped from the sequences.
tokenized = tokenizer.texts_to_sequences(df.text)

print("\nSamples\n")
print(tokenized[0])
print(tokenized[1])
print(len(tokenized))

(5) Pad & truncate sequences (post)

In [None]:
# Inspect the distribution of sequence lengths before picking a fixed size.
lengths = list(map(len, tokenized))
print(f"Average length of each row >> {np.mean(lengths)}")
print(f"Maximum length of each row >> {np.max(lengths)}")

plt.hist(lengths, bins=50)
plt.show()

sequence_size = 20
print(f"Pad all sequences into size of {sequence_size}")

# Pad short sequences and cut long ones at the end ('post') to sequence_size.
padded = pad_sequences(
    tokenized,
    maxlen=sequence_size,
    padding='post',
    truncating='post',
)
print(padded.shape)
print("Padded samples")
for sample in padded[:2]:
    print(sample)

(6) Transform labels (sentiment data) into one-hot vectors

In [None]:
# Features are the padded index sequences; targets are one-hot sentiment vectors.
data = padded
# NOTE(review): the sentiment column holds -1/0/1. to_categorical presumably
# uses the value as a column index, in which case -1 lands in the last column
# (index 2) - confirm before decoding predictions.
label = to_categorical(df.sentiment, num_classes=3)

print("shape of data >>",data.shape)
print("shape of label >>",label.shape)

print("\nSamples of label data")
for sample in label[:2]:
    print(sample)

(7) Train/Test split

In [None]:
# Hold out 30% of the rows for testing, stratified on the one-hot labels.
train_data, test_data, train_label, test_label = train_test_split(
    data,
    label,
    test_size=0.3,
    stratify=label,
    random_state=42,
)

print("shape of train data >>",train_data.shape)
print("shape of test data >>",test_data.shape)

# Side-by-side class counts for the two partitions (argmax undoes the one-hot).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

sns.countplot(x=np.argmax(train_label, axis=1), ax=ax1)
ax1.set_title("Distribution of train label")

sns.countplot(x=np.argmax(test_label, axis=1), ax=ax2)
ax2.set_title("Distribution of test label")
plt.show()

# Maps a class index back to its sentiment name.
# The labels are encoded as -1/0/1 and then one-hot encoded with 3 classes, so
# the negative class (-1) ends up in the last column: np.argmax on a label or a
# prediction therefore yields 2 for "negative". Both the argmax index (2) and
# the raw encoded value (-1) are accepted.
index_to_sentiment = {
    0:'neutral',
    1:'positive',
    2:'negative',
    -1:'negative'
}

(8) Train and test with an LSTM model (using pre-trained embedding vectors)

(8-1) Load pre-trained embedding vectors and build an embedding matrix customized for the words we will use

In [None]:
import os

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

In [None]:
# Parse the GloVe text file into {word: vector}. Each line is the word followed
# by its vector components, separated by whitespace.
embedding_dict=dict()

# `with` guarantees the file is closed even if parsing a line fails.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        word = tokens[0]
        word_vector = np.asarray(tokens[1:],dtype='float32')
        embedding_dict[word] = word_vector

print(f"There are {len(embedding_dict)} embedding vectors in total")
print(f"Dimension of each vector >> {len(embedding_dict['read'])}")
embedding_size = len(embedding_dict['read'])


# Row i holds the pre-trained vector of the word with tokenizer index i.
# Rows stay all-zero for index 0 (padding) and for words missing from GloVe.
embedding_matrix = np.zeros((vocab_size,embedding_size))

for word,idx in tokenizer.word_index.items():
    # word_index lists every word seen during fitting; only indices that fit
    # in the matrix (idx < vocab_size) are filled in.
    if idx < vocab_size:
        vector = embedding_dict.get(word)
        if vector is not None:
            embedding_matrix[idx] = vector

(8-2) Build Stacked LSTM model (+ bidirectional, many-to-many)

In [None]:
from keras.layers import Input,Embedding,TimeDistributed,Bidirectional,LSTM,BatchNormalization,Dense,GlobalMaxPool1D,GlobalAveragePooling1D,Dropout,Masking
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.utils import plot_model

# Embedding width and base LSTM width used by create_lstm().
word_vec_size = 100
hidden_size = 128

def create_lstm():
    """Build and compile the stacked bidirectional LSTM classifier.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    Masking on all-zero vectors -> three Bidirectional LSTM layers returning
    full sequences (``hidden_size`` units, then twice half of that) ->
    global max pooling -> BatchNorm -> Dense(32, relu) -> BatchNorm ->
    Dense(3, softmax).

    Reads the module-level ``sequence_size``, ``vocab_size``,
    ``word_vec_size``, ``hidden_size`` and ``embedding_matrix`` at call time.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the Adam optimizer and
        the accuracy metric.
    """
    inputs = Input(shape=[sequence_size])
    features = Embedding(
        vocab_size,
        word_vec_size,
        input_length=sequence_size,
        weights=[embedding_matrix],
        trainable=False,
    )(inputs)
    features = Masking(mask_value=0.0)(features)

    # One full-width recurrent layer followed by two half-width ones.
    for units in (hidden_size, hidden_size // 2, hidden_size // 2):
        features = Bidirectional(LSTM(units, return_sequences=True))(features)

    features = GlobalMaxPool1D()(features)
    features = BatchNormalization()(features)
    features = Dense(32, activation='relu')(features)
    features = BatchNormalization()(features)
    outputs = Dense(3, activation='softmax')(features)

    model = keras.models.Model(inputs, outputs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# Stop once validation accuracy has not improved for 4 epochs. Accuracy is a
# higher-is-better metric, so the mode must be 'max'; with 'min' the callback
# would treat every improvement as a regression.
es = EarlyStopping(monitor='val_accuracy',mode='max',patience=4,verbose=1)
# Reduce the learning rate when validation loss has not decreased for 3 epochs.
rl = ReduceLROnPlateau(monitor='val_loss',mode='min',patience=3,verbose=1)

In [None]:
# Build, train and evaluate the LSTM that uses the pre-trained embeddings.
lstm_model = create_lstm()
plot_model(lstm_model)
lstm_history = lstm_model.fit(
    train_data,
    train_label,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[rl],
)
lstm_model.evaluate(test_data, test_label)

(8-3) LSTM model without using pre-trained embedding vectors

In [None]:
# Embedding width and base LSTM width used by create_lstm_no_emb().
word_vec_size=100
hidden_size=256

def create_lstm_no_emb():
    """Build and compile the stacked bidirectional LSTM with a learned embedding.

    Architecture: trainable Embedding (padding index 0 masked) -> three
    Bidirectional LSTM layers returning full sequences (``hidden_size`` units,
    then twice half of that) with BatchNorm after the first two -> global max
    pooling -> Dense(64, relu) -> Dense(32, relu) -> Dropout(0.2) ->
    Dense(3, softmax).

    Reads the module-level ``sequence_size``, ``vocab_size``,
    ``word_vec_size`` and ``hidden_size`` at call time.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the Adam optimizer and
        the accuracy metric.
    """
    X = Input(shape=[sequence_size])
    # mask_zero=True masks the padding index itself. A Masking(mask_value=0.0)
    # layer placed after this embedding would never fire: the embedding is
    # randomly initialised and trained, so the vector of index 0 is not
    # all-zero.
    H = Embedding(vocab_size,word_vec_size,input_length=sequence_size,mask_zero=True)(X)

    H = Bidirectional(LSTM(hidden_size,return_sequences=True))(H)
    H = BatchNormalization()(H)
    H = Bidirectional(LSTM(hidden_size // 2,return_sequences=True))(H)
    H = BatchNormalization()(H)
    H = Bidirectional(LSTM(hidden_size // 2,return_sequences=True))(H)

    H = GlobalMaxPool1D()(H)
    H = Dense(64,activation='relu')(H)
    H = Dense(32,activation='relu')(H)
    H = Dropout(0.2)(H)
    Y = Dense(3,activation='softmax')(H)

    model = keras.models.Model(X,Y)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

In [None]:
# Build, train and evaluate the LSTM that learns its own embedding.
lstm_no = create_lstm_no_emb()
plot_model(lstm_no)
# Fit the model that was just created (lstm_no) - fitting lstm_model here would
# keep training the pre-trained-embedding model and leave lstm_no untrained
# when it is evaluated below.
lstm_no_hist = lstm_no.fit(train_data,train_label,epochs=15,batch_size=64,validation_split=0.2,callbacks=[rl])
lstm_no.evaluate(test_data,test_label)

(9) Multi-kernel Conv1D model (using pre-trained embedding vectors)

In [None]:
from keras.layers import Conv1D,Concatenate,LeakyReLU,Flatten

def create_conv1d():
    """Build and compile the multi-kernel 1D-convolution classifier.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    Dropout(0.3) -> four parallel Conv1D branches (kernel sizes 3, 4, 5, 6 with
    256, 256, 128, 128 filters), each followed by global max pooling and
    Flatten -> Concatenate -> Dropout(0.2) -> Dense(128) / BatchNorm /
    LeakyReLU -> Dense(16) / BatchNorm / LeakyReLU -> Dense(3, softmax).

    Reads the module-level ``sequence_size``, ``vocab_size``,
    ``word_vec_size`` and ``embedding_matrix`` at call time.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the Adam optimizer and
        the accuracy metric.
    """
    inputs = Input(shape=[sequence_size])
    # NOTE(review): mask_zero=True attaches a padding mask, but the layers that
    # follow are Dropout and Conv1D - confirm the mask is actually consumed.
    embedded = Embedding(
        vocab_size,
        word_vec_size,
        input_length=sequence_size,
        weights=[embedding_matrix],
        trainable=False,
        mask_zero=True,
    )(inputs)
    embedded = Dropout(0.3)(embedded)

    def conv_branch(filters, width):
        # One convolution over the sequence, reduced to its per-filter maximum.
        branch = Conv1D(
            filters=filters,
            kernel_size=width,
            padding='valid',
            activation='relu',
        )(embedded)
        branch = GlobalMaxPool1D()(branch)
        return Flatten()(branch)

    num_filters = [256, 256, 128, 128]
    kernel_sizes = [3, 4, 5, 6]
    conv_blocks = [
        conv_branch(filters, width)
        for filters, width in zip(num_filters, kernel_sizes)
    ]

    merged = Concatenate()(conv_blocks)
    merged = Dropout(0.2)(merged)

    # Two Dense -> BatchNorm -> LeakyReLU stages, 128 then 16 units.
    for units in (128, 16):
        merged = Dense(units)(merged)
        merged = BatchNormalization()(merged)
        merged = LeakyReLU()(merged)

    outputs = Dense(3, activation='softmax')(merged)

    model = keras.models.Model(inputs, outputs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build, train and evaluate the multi-kernel Conv1D model.
conv1d = create_conv1d()
plot_model(conv1d)
hist = conv1d.fit(
    train_data,
    train_label,
    epochs=10,
    validation_split=0.2,
    batch_size=64,
    callbacks=[rl],
)
conv1d.evaluate(test_data, test_label)

(10) Naive Bayes models: GaussianNB, MultinomialNB, BernoulliNB

In [None]:
# Preview the cleaned text and encoded labels used by the classical models.
df.head(5)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# Bag-of-words counts followed by TF-IDF re-weighting.
# NOTE(review): both transformers are fitted on the full corpus before the
# train/test split, so test documents influence the vocabulary and IDF
# weights - consider fitting on the training rows only.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()

x = vectorizer.fit_transform(df.text)
# Read the shape from the sparse matrix and densify only the single row that
# is printed; converting the whole matrix just to inspect it is very costly.
print(f"shape >> {x.shape}")
print("samples\n")
print(x[0].toarray()[0])

x = transformer.fit_transform(x)
print(f"\n\nshape >> {x.shape}")
print("samples\n")
print(x[0].toarray()[0])

# Single dense conversion; the estimators trained on `x` receive a dense array.
x = x.toarray()
y = df.sentiment

In [None]:
# Hold out 30% of the TF-IDF rows for testing, stratified on the sentiment label.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

for part in (train_x, train_y, test_x, test_y):
    print(part.shape)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

# Fit each Naive Bayes variant on the TF-IDF features and report test accuracy.
models_NB = [GaussianNB(), MultinomialNB(), BernoulliNB()]

for model in models_NB:
    pred = model.fit(train_x, train_y).predict(test_x)
    acc = accuracy_score(test_y, pred)
    print(f"Model {model.__class__.__name__} accuracy on test dataset >> {acc}")

(11) RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

