In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import re

#  Dataset Preprocessing

In [None]:
# Load the raw Sentiment140 dump. The file is distributed without a header row
# (NOTE(review): confirm against your copy), so the column names are supplied
# explicitly; without header=None pandas would consume the first tweet as the
# header and silently drop it.
df = pd.read_csv(
    '../input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='latin',
    header=None,
    names=['sentiment', 'id', 'date', 'query', 'user_id', 'text'],
)

# Two indicator targets derived from the raw label: 4 marks a positive tweet,
# 0 marks a negative one. Vectorised comparisons replace the per-row lambdas.
df['sentiment_pos'] = (df['sentiment'] == 4).astype('int64')
df['sentiment_neg'] = (df['sentiment'] == 0).astype('int64')

# Only the text and the two indicator columns are needed downstream.
df = df.drop(columns=['id', 'date', 'query', 'user_id', 'sentiment'])
df.head()

In [None]:
# Work on a 25% random subsample; the fixed seed makes a fresh
# Restart & Run All pick the same rows every time.
df = df.sample(frac=0.25, random_state=42)

# value_counts() orders by frequency, not by label, so pin the order to
# [0, 1] (negative, positive) to keep bar heights aligned with the tick
# labels. fill_value=0 keeps both bars even if one class is absent.
val_count = df.sentiment_pos.value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(8,4))
plt.bar(['Negative','Positive'], val_count.values)
plt.ylabel("Number of tweets")
plt.title("Data Distribution");

In [None]:
# Compiled once. Raw string so that \S is a regex escape rather than an
# invalid Python string escape. The pattern itself is unchanged: @mentions,
# URLs, and any run of non-alphanumeric characters.
_NOISE_RE = re.compile(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+")

# Lazily filled cache so the stop-word corpus and the stemmer are built on
# the first call only, not once per tweet.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
  """Return (stop_words, stemmer), building and caching them on first use."""
  if not _PREPROCESS_CACHE:
    # A frozenset gives O(1) membership tests instead of scanning a list.
    _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    _PREPROCESS_CACHE['stemmer'] = SnowballStemmer('english')
  return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess(text, stem=False):
  """Normalise one tweet into a space-separated string of kept tokens.

  The input is converted with str(), lower-cased, and every @mention, URL
  and run of non-alphanumeric characters is replaced by a single space.
  The result is split on whitespace and English stop words are dropped.

  Args:
    text: The raw tweet; any object is accepted because it is passed
      through str() first.
    stem: When True, each kept token is replaced by its Snowball (English)
      stem. Defaults to False.

  Returns:
    The kept tokens joined by single spaces (an empty string if none remain).
  """
  stop_words, stemmer = _preprocess_resources()
  cleaned = _NOISE_RE.sub(' ', str(text).lower()).strip()
  kept = (token for token in cleaned.split() if token not in stop_words)
  if stem:
    kept = (stemmer.stem(token) for token in kept)
  return " ".join(kept)

In [None]:
# Clean every tweet in place of the raw text (default arguments, i.e. no stemming).
df['text'] = df['text'].apply(preprocess)

# Tokenization and TF-IDF Vectorization

In [None]:
# Model input: the tweet text column.
x = df['text']
# Targets: the two indicator columns, in (positive, negative) order.
y = df.loc[:, ['sentiment_pos', 'sentiment_neg']]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only the 2500 highest-frequency terms. float32 halves the memory of
# the dense matrix built below and matches the precision Keras trains in.
vectorizer = TfidfVectorizer(max_features=2500, dtype=np.float32)

# fit_transform learns the vocabulary/IDF weights and produces the TF-IDF
# matrix in a single pass, instead of analysing the whole corpus twice with
# fit() followed by transform(). The sparse result is densified for Keras.
processed_features = vectorizer.fit_transform(x).toarray()

# `tfidf` is the fitted vectorizer itself (fit() returns the same object);
# the name is kept because the pickling cell refers to it.
tfidf = vectorizer

In [None]:
import pickle
# Persist the fitted vectorizer so inference can reuse the same vocabulary.
# The context manager guarantees the file is flushed and closed; the bare
# open() used previously left the handle to the garbage collector.
with open("vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [None]:
# Insert a length-1 "timestep" axis so each sample is (1, n_features), the
# 3-D layout the recurrent layer expects. Using shape[-1] for the feature
# count (instead of shape[1]) makes the cell idempotent: re-running it on
# the already reshaped array is a no-op rather than a ValueError.
processed_features = processed_features.reshape(
    processed_features.shape[0], 1, processed_features.shape[-1]
)
print(processed_features.shape)

In [None]:
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Each sample is a length-1 sequence of TF-IDF features.
inp = Input(shape=(1,processed_features.shape[2]))

# Intermediate activations use `hidden` rather than `x`, so the text Series
# bound to `x` in the earlier cell is not clobbered (which would break a
# re-run of the vectorisation cell).
# Bidirectional LSTM with 64 units per direction and input/recurrent dropout.
hidden = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(inp)
# Two 512-unit ReLU layers with dropout in between.
hidden = Dense(512, activation='relu')(hidden)
hidden = Dropout(0.5)(hidden)
hidden = Dense(512, activation='relu')(hidden)
# Two-way softmax, one unit per indicator target column.
outputs = Dense(2, activation='softmax')(hidden)
model = tf.keras.Model(inp, outputs)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Adam at its default learning rate; categorical cross-entropy matches the
# two-column indicator targets and the softmax output.
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Cut the learning rate by 10x when val_loss plateaus. min_lr has to sit
# BELOW the optimizer's starting rate (Adam defaults to 1e-3): the previous
# floor of 0.01 was above it, so the callback could never lower the rate.
# NOTE: this rebinds the imported class name to the callback instance; the
# name is kept because the training cell passes `ReduceLROnPlateau` to fit().
ReduceLROnPlateau = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5, verbose=1)

In [None]:
# Train for 50 epochs in batches of 512, holding out 20% of the samples for
# validation; the plateau callback watches the validation loss.
history = model.fit(
    x=processed_features,
    y=y,
    batch_size=512,
    epochs=50,
    validation_split=0.2,
    callbacks=[ReduceLROnPlateau],
)

In [None]:
# Write the trained model to disk in HDF5 format (selected by the .h5 suffix).
model.save(filepath='model.h5')