# Loading Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import emoji

from IPython.display import Markdown as md
plt.style.use('ggplot')

In [None]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

# Loading Data

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
_DATA_DIR = "../input/tweet-sentiment-extraction"

train_path = _DATA_DIR + "/train.csv"
test_path = _DATA_DIR + "/test.csv"
sample_submission_path = _DATA_DIR + "/sample_submission.csv"

In [None]:
# Read the train, test and sample-submission CSVs into DataFrames, in that order.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

# 1. Explore Data

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
df_test.info()

In [None]:
df_test.head()

In [None]:
print('Training data shape: ', df_train.shape)
print('Testing data shape: ', df_test.shape)

# 2. Preprocessing

### 2-1) Missing Values treatment in the dataset

In [None]:
#Missing values in training set
df_train.isnull().sum()

In [None]:
#Missing values in test set
df_test.isnull().sum()

In [None]:
# Remove, in place, every row that has at least one missing value in either frame.
for frame in (df_train, df_test):
    frame.dropna(axis=0, how='any', inplace=True)

# Re-count the nulls per training column; the cell displays this Series.
df_train.isnull().sum()

In [None]:
# Write both frames to the working directory without the index column.
for frame, csv_name in ((df_train, './train_v1.csv'), (df_test, './test_v1.csv')):
    frame.to_csv(csv_name, index=False)

# Preprocessing with one function

In [None]:
!pip install text-preprocessing
!pip install nltk

In [None]:
!python -m pip install ../input/textpreprocessing/text_preprocessing-0.0.8-py2.py3-none-any.whl

In [None]:
import text_preprocessing
import string

!pip install contractions
import contractions

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import string
punc = string.punctuation

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

nltk.download('averaged_perceptron_tagger')

nltk.download('wordnet')
from nltk.corpus import wordnet

from nltk.stem import WordNetLemmatizer

In [None]:
# text preprocessing helper functions

def clean_text(text):
    '''Normalise a raw text string before tokenisation.

    Steps, in order: lowercase; remove ``[...]`` spans; remove links; remove
    ``<...>`` tags; expand contractions; strip ASCII punctuation; replace
    newlines with spaces; remove every word that contains a digit.

    Args:
        text: The raw text (``str``).

    Returns:
        The cleaned, lowercase string.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Expand contractions while the apostrophes are still there: once the
    # punctuation is stripped, "don't" has already become "dont". Lowercase
    # again because an expansion may introduce capitals.
    text = contractions.fix(text).lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words, so turn it into a space rather than gluing the
    # neighbouring words together.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def text_preprocessing(text, drop_stopwords=True):
    """
    Clean a raw text string, tokenise it and join the tokens back with spaces.

    The text is passed through ``clean_text``, split into ``\\w+`` tokens and,
    unless ``drop_stopwords`` is False, filtered against the module-level
    ``stop_words`` set.

    Args:
        text: The raw text (``str``).
        drop_stopwords: When True (default) English stop words are removed;
            pass False to keep every token.

    Returns:
        A single space-separated string of the remaining tokens.

    Note:
        This function has the same name as the ``text_preprocessing`` package
        imported earlier in the notebook, so it shadows that module once defined.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    cleaned = clean_text(text)
    tokens = tokenizer.tokenize(cleaned)
    if drop_stopwords:
        # Previously the filtered list was built and then thrown away (the
        # unfiltered tokens were joined), and the stop-word corpus was re-read
        # for every token. Use the prebuilt set and keep the result.
        tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [None]:
# Store a cleaned version of every test tweet in a new `text_clean` column.
# Only df_test is processed in this cell; df_train receives no `text_clean` column.
raw_test_text = df_test['text'].apply(str)
df_test['text_clean'] = raw_test_text.apply(text_preprocessing)
df_test.head()

In [None]:
df_test.sample(frac=0.05)

In [None]:
df_train.to_csv('./train_v1.csv', index = False)

In [None]:
df_test.to_csv('./test_v1.csv', index = False)

# Padding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

In [None]:
# Build the vocabulary from the training text only, then encode both splits
# with that one word index. A second fit_on_texts() call rebuilds the index
# from the accumulated counts, so encoding train before it and test after it
# can give the same word two different ids (and leaks test text into the
# vocabulary).
tokenizer.fit_on_texts(df_train['selected_text'])

encoded_train = tokenizer.texts_to_sequences(df_train['selected_text'])
# Test words that never occur in the training text are dropped here, because
# the tokenizer was created without an oov_token.
encoded_test = tokenizer.texts_to_sequences(df_test['text_clean'])

# Preview a few encoded rows instead of printing every sequence.
print(encoded_train[:3], encoded_test[:3])

In [None]:
len(encoded_train)

In [None]:
# Length of the longest encoded sequence in each split, printed for reference.
max_len_train = max(map(len, encoded_train))
max_len_test = max(map(len, encoded_test))
print(max_len_train, max_len_test)

# Both splits are then given the same fixed padded length of 35.
max_len_train = max_len_test = 35

In [None]:
def _pad_post(sequences, length):
    """Return a ``(len(sequences), length)`` integer array.

    Each sequence is copied into its row from the left; shorter sequences are
    zero-padded on the right and longer ones are cut off at ``length``, so the
    result is always rectangular.
    """
    padded = np.zeros((len(sequences), length), dtype=np.int64)
    for row, seq in enumerate(sequences):
        clipped = seq[:length]
        padded[row, :len(clipped)] = clipped
    return padded


# The previous loop only appended zeros, so any sequence longer than the target
# length produced a ragged array; it also modified the encoded lists in place.
padded_np_train = _pad_post(encoded_train, max_len_train)
padded_np_test = _pad_post(encoded_test, max_len_test)

#save
np.save('./padded_train.npy',padded_np_train)
np.save('./padded_test.npy',padded_np_test)

In [None]:
padded_np_train.shape

In [None]:
padded_np_test.shape

In [None]:
len(df_train['sentiment'])

In [None]:
len(df_test['sentiment'])

**From here: the modeling stage, which reloads the saved preprocessing outputs.**

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical

In [None]:
padded_train=np.load('../input/final-data/padded_train.npy')
padded_test=np.load('../input/final-data/padded_test.npy')

In [None]:
df_train = pd.read_csv('../input/final-data/train_v1.csv')
df_test = pd.read_csv('../input/final-data/test_v1.csv')

In [None]:
df_train['sentiment'].shape

In [None]:
# Count the distinct sentiment values found in the training labels.
distinct_sentiments = set(df_train['sentiment'])
num_labels = len(distinct_sentiments)
print(num_labels)

# Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the sentiment -> integer mapping from the training labels only and
# reuse it for the test labels. Refitting on the test split could map the same
# sentiment to a different integer whenever the two splits do not contain
# exactly the same set of classes. transform() raises ValueError if the test
# split holds a label that was not seen during fitting.
lb = LabelEncoder()
sent_train = lb.fit_transform(df_train['sentiment'])
sent_test = lb.transform(df_test['sentiment'])

In [None]:
print(sent_train)

In [None]:
# Import the libraries
import seaborn as sns

sns.countplot(x='sentiment',data=df_train)

In [None]:
len(sent_train)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn renamed `sparse` to `sparse_output` and later removed the old
# keyword, so try the new name first and fall back for older installs.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

#train: fit the encoder on the training labels (as a single column)
sent_train = sent_train.reshape(len(sent_train), 1)
y_final = onehot_encoder.fit_transform(sent_train)

#test: reuse the categories learned from train so both matrices share the same
# column order and width; refitting could change either.
sent_test = sent_test.reshape(len(sent_test), 1)
y_test = onehot_encoder.transform(sent_test)

# NOTE(review): `y_test` is rebound by the train_test_split cell further down.
print(y_final.shape, y_test.shape)

In [None]:
print(y_final)

In [None]:
x_final=padded_train
x_final.shape

In [None]:
from sklearn.model_selection import train_test_split

# Split the padded training rows and their one-hot labels: 33% is held out,
# shuffled with a fixed seed.
# NOTE(review): this rebinds `y_test`, replacing the one-hot test labels built
# in the OneHotEncoder cell; from here on `x_test`/`y_test` are a held-out part
# of the training data, not the competition test set.
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.33, random_state=42, shuffle=True)

# Build Model

In [None]:
np.unique(padded_train)

In [None]:
np.unique(padded_test)

In [None]:
len(np.unique(padded_test))

In [None]:
# The Embedding layer needs input_dim to be larger than every token id it will
# be fed. Counting the distinct ids in padded_train only gives that when the ids
# are gap-free, and it ignores the ids in padded_test, so take the largest id
# over both arrays and add one.
largest_token_id = int(max(padded_train.max(), padded_test.max()))
voc_size = largest_token_id + 1
print(voc_size)

In [None]:
#hyperparameter
max_features = voc_size

embedding_dims = 300 # feature
max_len = 35 # time_step

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, LSTM, Dropout, Bidirectional

# Simple RNN (vanilla RNN)

In [None]:
def create_model():
    """Build and compile the SimpleRNN classifier.

    Layers, in order: an Embedding over ``voc_size`` ids producing
    ``embedding_dims``-wide vectors for inputs of length ``max_len``, a
    SimpleRNN with 32 units, and a 3-way softmax Dense layer.

    Returns:
        The compiled Sequential model (adam, categorical cross-entropy,
        accuracy metric).
    """
    stack = [
        # token ids -> embedding vectors
        Embedding(voc_size, embedding_dims, input_length=max_len),
        # sequence of vectors -> one 32-wide vector
        SimpleRNN(32),
        # 32-wide vector -> probabilities over the 3 classes
        Dense(3, activation='softmax'),
    ]
    rnn_model = Sequential(stack)
    rnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return rnn_model

In [None]:
model = create_model()

In [None]:
model.summary()

In [None]:
model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 10, batch_size = 64)

In [None]:
model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 10, batch_size = 512)

# LSTM

In [None]:
from tensorflow.keras import regularizers

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
import os

# One checkpoint file per epoch; `{epoch:04d}` is filled in by the callback.
checkpoint_path = "./cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# `period` is a deprecated ModelCheckpoint argument (rejected by newer Keras
# releases); save_freq='epoch' is its replacement and also saves after every
# epoch.
# NOTE(review): none of the fit() calls visible in this notebook pass
# `cp_callback`, so no checkpoints are written unless it is added to `callbacks`.
cp_callback = ModelCheckpoint(checkpoint_path, verbose=1,
                              save_weights_only=True, save_freq='epoch')

In [None]:
def create_lstm_model():
    """Build and compile the LSTM classifier.

    Layers, in order: an Embedding over ``voc_size`` ids producing
    ``embedding_dims``-wide vectors for inputs of length ``max_len``, an LSTM
    with 128 units, and a 3-way softmax Dense layer.

    Returns:
        The compiled Sequential model (adam, categorical cross-entropy,
        accuracy metric).
    """
    stack = [
        Embedding(voc_size, embedding_dims, input_length=max_len),
        LSTM(128),
        Dense(3, activation='softmax'),
    ]
    lstm_net = Sequential(stack)
    lstm_net.compile(optimizer='adam', metrics=['accuracy'], loss='categorical_crossentropy')
    return lstm_net

model_lstm = create_lstm_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model_lstm.summary()

In [None]:
history=model_lstm.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 10, batch_size = 512)

In [None]:
loss, acc = model_lstm.evaluate(x_test,  y_test, verbose=2)

In [None]:
history1=model_lstm.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 20, batch_size = 512)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training vs. validation loss per epoch, taken from `history` (the first LSTM fit).
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)

for series, line_fmt, legend_label in (
    (loss, 'r', 'Training loss'),
    (val_loss, 'b', 'Validation loss'),
):
    plt.plot(epochs, series, line_fmt, label=legend_label)

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Training vs. validation accuracy per epoch, taken from `history`; the x-axis
# reuses `epochs` from the loss-plot cell.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

for series, line_fmt, legend_label in (
    (acc, 'r', 'Training accuracy'),
    (val_acc, 'b', 'Validation accuracy'),
):
    plt.plot(epochs, series, line_fmt, label=legend_label)

plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
model_lstm.save('./save_model_lstm.h5')

# CNN+RNN

In [None]:
from tensorflow.keras.layers import Conv1D

In [None]:
def create_cnn_model():
    """Build and compile the Conv1D + SimpleRNN classifier.

    Layers, in order: an Embedding over ``voc_size`` ids producing
    ``embedding_dims``-wide vectors for inputs of length ``max_len``, a Conv1D
    with 64 filters of width 2 (ReLU), a SimpleRNN with 32 units, and a 3-way
    softmax Dense layer.

    Returns:
        The compiled Sequential model (adam, categorical cross-entropy,
        accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(voc_size, embedding_dims, input_length=max_len))
    # No input_shape here: Conv1D is not the first layer, so its input is the
    # Embedding output (max_len steps of embedding_dims features). The old
    # input_shape=(35,1) did not describe that and only applies to a first layer.
    model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
    model.add(SimpleRNN(32))
    model.add(Dense(3, activation='softmax'))

    model.compile(optimizer = 'adam', metrics=['accuracy'], loss = 'categorical_crossentropy')

    return model

In [None]:
model_cnn = create_cnn_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model_cnn.summary()

In [None]:
model_cnn.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 10, batch_size = 512)

# CNN+LSTM

In [None]:
def create_cnnlstm_model():
    """Build and compile the Conv1D + LSTM classifier.

    Layers, in order: an Embedding over ``voc_size`` ids producing
    ``embedding_dims``-wide vectors for inputs of length ``max_len``, a Conv1D
    with 64 filters of width 2 (ReLU), an LSTM with 100 units, and a 3-way
    softmax Dense layer.

    Returns:
        The compiled Sequential model (adam, categorical cross-entropy,
        accuracy metric).
    """
    model=Sequential()
    model.add(Embedding(voc_size, embedding_dims, input_length=max_len))
    # No input_shape here: Conv1D is not the first layer, so its input is the
    # Embedding output (max_len steps of embedding_dims features). The old
    # input_shape=(35,1) did not describe that and only applies to a first layer.
    model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
    model.add(LSTM(100))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

model_cnnlstm = create_cnnlstm_model()
# Summarise the model that was just built: this cell used to print the summary
# of `model_lstm` instead. summary() prints on its own, so no print() is needed.
model_cnnlstm.summary()

In [None]:
model_cnnlstm.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 10, batch_size = 512)

# Load model

In [None]:
# Import from tensorflow.keras like the rest of the notebook, so the model is
# loaded by the same Keras implementation that built and saved it.
from tensorflow.keras.models import load_model

# NOTE(review): this builds a fresh, untrained LSTM and rebinds `model`; the
# load_model() call below does not use it. Confirm whether it is still needed.
model = create_lstm_model()
# load model from single file
model_lstm = load_model('../input/weight/save_model_lstm.h5')