In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import csv
import re
import random
import numpy as np
from IPython import embed

In [None]:
data = pd.read_csv('Sentiment Analysis Dataset.csv')

In [None]:
print(data.shape)

In [None]:
data.head()

In [None]:
data.isnull().any()

In [None]:
# checking out the negative comments from the train set
data[data['Sentiment'] == 0].head(10)

In [None]:
#checking out the postive comments from the train set
data[data['Sentiment'] == 1].head(10)

In [None]:
data['Sentiment'].value_counts().plot.bar(color = 'pink', figsize = (6, 4))

In [None]:
# checking the distribution of tweets in the data
length_train = data['SentimentText'].str.len().plot.hist(color = 'pink', figsize = (6, 4))

In [None]:
data.groupby('Sentiment').describe()

In [None]:
def clean_str(string):
    """Normalise a raw tweet into a lower-case, space-separated token string.

    Steps, in order: emoticons become placeholder words, @mentions are
    removed, URLs become ``linktoken``, backslash-x escape residue is dropped,
    every character outside ``A-Za-z0-9(),!?'`` and the backtick becomes a
    space, contractions and punctuation are split into their own tokens, runs
    of whitespace collapse, and the result is stripped and lower-cased.

    Args:
        string: Raw tweet text.

    Returns:
        The cleaned text as a ``str``.
    """
    # EMOJIS
    # Order matters: ">:(" must be handled before ":(", otherwise the shorter
    # pattern consumes it and the angry-face token can never be produced.
    # " XD " and " <3 " only match when surrounded by spaces, so the
    # replacement keeps those spaces to avoid gluing neighbouring words.
    emoticons = (
        (r":\)", "emojihappy1"),
        (r":P", "emojihappy2"),
        (r":p", "emojihappy3"),
        (r":>", "emojihappy4"),
        (r":3", "emojihappy5"),
        (r":D", "emojihappy6"),
        (r" XD ", " emojihappy7 "),
        (r" <3 ", " emojihappy8 "),
        (r">:\(", "emojisad12"),
        (r":\(", "emojisad9"),
        (r":<", "emojisad10"),
    )
    for pattern, token in emoticons:
        string = re.sub(pattern, token, string)

    # MENTIONS
    string = re.sub(r"(@)\w+", "", string)

    # WEBSITES
    string = re.sub(r"http(s)*:(\S)*", "linktoken", string)

    # STRANGE UNICODE
    string = re.sub(r"\\x(\S)*", "", string)

    # General Cleanup and Symbols
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Split contractions and punctuation into separate tokens. The replacement
    # strings are plain text: a backslash there would be emitted literally.
    splits = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
    )
    for pattern, replacement in splits:
        string = re.sub(pattern, replacement, string)

    string = re.sub(r"\s{2,}", " ", string)

    return string.strip().lower()


In [None]:
data['clean_text'] = data['SentimentText'].apply(clean_str)

In [None]:
data.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = 'english')
words = cv.fit_transform(data.clean_text)
sum_words = words.sum(axis=0)
words_freq = [(word, sum_words[0, i]) for word, i in cv.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])
frequency.head(30).plot(x='word', y='freq', kind='bar', figsize=(15, 7), color = 'blue')
plt.title("Most Frequently Occuring Words - Top 30")

In [None]:
pip install wordcloud

In [None]:
from wordcloud import WordCloud
wordcloud = WordCloud(background_color = 'white', width = 1000, height = 1000).generate_from_frequencies(dict(words_freq))
plt.figure(figsize=(10,8))
plt.imshow(wordcloud)
plt.title("WordCloud - Vocabulary from Dataset", fontsize = 22)

In [None]:
positive_words =' '.join([text for text in data['clean_text'][data['Sentiment'] == 1]])
wordcloud = WordCloud(width=800, height=500, random_state = 0, max_font_size = 110).generate(positive_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('The Positive Words')
plt.show()

In [None]:
negative_words =' '.join([text for text in data['clean_text'][data['Sentiment'] == 0]])
wordcloud = WordCloud(width=800, height=500, random_state = 0, max_font_size = 110).generate(negative_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('The Negative Words')
plt.show()

In [None]:
pip install nltk

In [None]:
# Each import needs its own line; two statements fused on one line are a SyntaxError.
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from textblob import TextBlob

In [None]:
def form_sentence(tweet):
  """Return the words TextBlob extracts from `tweet`, joined by single spaces."""
  words = TextBlob(tweet).words
  return ' '.join(words)
# Sanity check: the first tweet after form_sentence(), followed by the raw text.
print(form_sentence(data['SentimentText'].iloc[0]))
print(data['SentimentText'].iloc[0])

In [None]:
def no_user_alpha(tweet):
  """Drop 'user' placeholders, non-alphabetic tokens and English stop words.

  Args:
    tweet: Text whose tokens are separated by whitespace.

  Returns:
    The surviving tokens, in their original order and casing, joined by
    single spaces.
  """
  # Build the stop-word set once instead of once per token.
  stop_words = set(stopwords.words('english'))
  kept = []
  for token in tweet.split():
    # Exact, case-sensitive placeholder match.
    if token == 'user':
      continue
    # Keep only tokens made entirely of word characters that are not digits.
    if not re.match(r'[^\W\d]*$', token):
      continue
    if token.lower() in stop_words:
      continue
    kept.append(token)
  return ' '.join(kept)
# Sanity check: the first tweet after form_sentence() + no_user_alpha(), followed by the raw text.
print(no_user_alpha(form_sentence(data['SentimentText'].iloc[0])))
print(data['SentimentText'].iloc[0])

In [None]:
def normalization(tweet_list):
  """Lemmatize every word as a verb and join the results with spaces.

  Args:
    tweet_list: An iterable of word strings, or a single string, which is
      split on whitespace first. Without that split a plain string would be
      iterated character by character.

  Returns:
    The lemmatized words joined by single spaces.
  """
  if isinstance(tweet_list, str):
    tweet_list = tweet_list.split()
  lem = WordNetLemmatizer()
  # 'v' asks the lemmatizer to treat each word as a verb.
  return ' '.join(lem.lemmatize(word, 'v') for word in tweet_list)
# Sanity check: the first tweet, whitespace-split and lemmatized, followed by the raw text.
print(normalization(data['SentimentText'].iloc[0].split()))
print(data['SentimentText'].iloc[0])

In [None]:
data['clean_text'] = data['clean_text'].apply(form_sentence)

In [None]:
data['clean_text'] = data['clean_text'].apply(normalization)

In [None]:
data.head()

In [None]:
positive_words =' '.join([text for text in train['clean_text'][train['Sentiment'] == 1]])
wordcloud = WordCloud(background_color = 'grey',width=800, height=500, random_state = 0, max_font_size = 110).generate(positive_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('The Positive Words after Cleaning')
plt.show()

In [None]:
negative_words =' '.join([text for text in data['clean_text'][data['Sentiment'] == 0]])
wordcloud = WordCloud(background_color = 'grey',width=800, height=500, random_state = 0, max_font_size = 110).generate(negative_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('The Negative Words after Cleaning ')
plt.show()

In [None]:
#Separates a file with mixed positive and negative examples into two.
def separate_dataset(filename):
  """Split a labelled CSV into one-sentence-per-line positive/negative files.

  The first row of `filename` is treated as a header. The label and text
  columns are located by the header names "Sentiment" and "SentimentText";
  when a name is absent the original positional defaults (1 and 4) are used.
  Rows whose label is the string "0" go to "bad_file"; every other row goes
  to "good_file". Both files are created (or truncated) in the current
  working directory.

  Args:
    filename: Path of the UTF-8 encoded CSV file to split.

  Returns:
    None.
  """
  seen = 1
  # Context managers close all three files even if a row raises.
  with open(filename, 'r', encoding="utf8") as f, \
       open("good_file", "w+", encoding="utf8") as good_out, \
       open("bad_file", "w+", encoding="utf8") as bad_out:
    reader = csv.reader(f)
    header = next(reader, None)
    if header is None:
      # Empty input: nothing to split, both outputs stay empty.
      return
    columns = [name.strip() for name in header]
    sentiment_idx = columns.index("Sentiment") if "Sentiment" in columns else 1
    text_idx = columns.index("SentimentText") if "SentimentText" in columns else 4
    for line in reader:
      if not line:
        # csv.reader yields [] for blank lines.
        continue
      seen += 1
      sentiment = line[sentiment_idx]
      sentence = line[text_idx]
      if sentiment == "0":
        bad_out.write(sentence + "\n")
      else:
        good_out.write(sentence + "\n")
      # Progress indicator for large files.
      if seen % 10000 == 0:
        print(seen)

In [None]:
# Writes "good_file" and "bad_file" into the current working directory.
separate_dataset("Sentiment Analysis Dataset.csv");

In [None]:
#Load Datafiles
def get_dataset(goodfile,badfile,limit,randomize=True):
  """Load positive and negative sentences and build one-hot labels.

  Args:
    goodfile: Path of a UTF-8 file with one positive sentence per line.
    badfile: Path of a UTF-8 file with one negative sentence per line.
    limit: Maximum number of sentences kept per class (applied as a slice).
    randomize: When true, each class is shuffled with the `random` module
      before the limit is applied.

  Returns:
    ``[x, y]``: `x` is the list of sentences passed through `clean_str`,
    positives first; `y` is an integer array of shape (len(x), 2) holding
    ``[0, 1]`` for positive and ``[1, 0]`` for negative rows.
  """
  def _read_lines(path):
    # Close the handle deterministically instead of leaking it.
    with open(path, "r", encoding="utf8") as handle:
      return [line.strip() for line in handle]

  good_x = _read_lines(goodfile)
  bad_x = _read_lines(badfile)
  if randomize:
    random.shuffle(bad_x)
    random.shuffle(good_x)
  good_x = good_x[:limit]
  bad_x = bad_x[:limit]
  x = [clean_str(s) for s in good_x + bad_x]
  positive_labels = [[0, 1] for _ in good_x]
  negative_labels = [[1, 0] for _ in bad_x]
  # One array built from the combined list stays 2-D even when a class is
  # empty; concatenating a 2-D array with an empty 1-D one raises.
  y = np.array(positive_labels + negative_labels, dtype=int).reshape(-1, 2)
  return [x, y]

In [None]:
#Generate random batches
def gen_batch(data, batch_size, num_epochs, shuffle=True):
  """
  Generates a batch iterator for a dataset.

  Args:
    data: Sequence of examples; it is converted to a NumPy array.
    batch_size: Maximum number of examples per batch.
    num_epochs: Number of passes over `data`.
    shuffle: When true, the examples are re-permuted with `np.random` at the
      start of every epoch.

  Yields:
    NumPy array slices of at most `batch_size` examples; the last batch of an
    epoch may be shorter. Nothing is yielded for an empty dataset.
  """
  try:
    data = np.array(data)
  except ValueError:
    # Examples of unequal shape are rejected by newer NumPy releases; store
    # them as opaque objects instead so they can still be indexed and sliced.
    ragged = np.empty(len(data), dtype=object)
    for position, example in enumerate(data):
      ragged[position] = example
    data = ragged
  data_size = len(data)
  # Ceiling division; 0 batches for an empty dataset.
  num_batches_per_epoch = -(-data_size // batch_size)
  for epoch in range(num_epochs):
    # Shuffle the data at each epoch
    if shuffle:
      shuffle_indices = np.random.permutation(np.arange(data_size))
      shuffled_data = data[shuffle_indices]
    else:
      shuffled_data = data
    for batch_num in range(num_batches_per_epoch):
      start_index = batch_num * batch_size
      end_index = min((batch_num + 1) * batch_size, data_size)
      yield shuffled_data[start_index:end_index]

In [None]:
# Data Preparation
filename = "Sentiment Analysis Dataset.csv"
goodfile = "good_file"
badfile = "bad_file"

In [None]:
x_text, y = get_dataset(goodfile, badfile, 5000)

In [None]:
good_tweets = pd.read_csv('good_file',error_bad_lines=False)

In [None]:
print(good_tweets.shape)

In [None]:
good_tweets.head(10)

In [None]:
bad_tweets = pd.read_csv('bad_file',error_bad_lines=False)

In [None]:
print(bad_tweets.shape)

In [None]:
bad_tweets.head(10)

In [None]:
!pip install tensorflow==1.14 import tensorflow as tf

In [None]:
# Model Building

In [None]:
import numpy as np
from IPython import embed

class CNN_LSTM(object):
    """Text classifier: embedding -> parallel conv/max-pool -> dropout -> LSTM -> softmax.

    Built with the TensorFlow 1.x graph API (placeholders, `tf.contrib`). The
    graph is constructed in `__init__`; the tensors needed for training and
    evaluation are exposed as attributes (`input_x`, `input_y`,
    `dropout_keep_prob`, `scores`, `predictions`, `loss`, `accuracy`).
    """

    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0, num_hidden=128):
        """Build the graph.

        Args:
            sequence_length: Number of token ids per input row.
            num_classes: Width of the one-hot label vectors.
            vocab_size: Number of rows in the embedding matrix.
            embedding_size: Width of each embedding vector.
            filter_sizes: Iterable of convolution window heights (in tokens).
            num_filters: Number of filters per window height.
            l2_reg_lambda: Weight of the L2 penalty on the output layer that
                is added to the loss; 0.0 leaves the loss value unchanged.
            num_hidden: Number of LSTM units.
        """
        # PLACEHOLDERS
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") # X - The Data
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") # Y - The Labels
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") # Dropout

        l2_loss = tf.constant(0.0) # Keeping track of L2 regularization loss

        # 1. EMBEDDING LAYER ########################################
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            # Trailing channel axis so the embeddings can be fed to conv2d.
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # 2. CONVOLUTION LAYER + MAXPOOLING (per filter) ############
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(self.embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Non-linearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over every valid window position, one value per filter
                pooled = tf.nn.max_pool(h, ksize=[1, sequence_length - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)

        # Combining pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # 3. DROPOUT LAYER ###########################################
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # 4. LSTM LAYER ##############################################
        cell = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
        # Feed the pooled features to the LSTM as a sequence of
        # num_filters_total scalar steps: [batch, num_filters_total, 1].
        # Expanding at axis 0 instead would turn the (unknown) batch size into
        # the time axis, which breaks the static-shape lookup below and
        # misaligns the scores with the labels.
        self.h_drop_exp = tf.expand_dims(self.h_drop, -1)
        val, state = tf.nn.dynamic_rnn(cell, self.h_drop_exp, dtype=tf.float32)

        # Time-major view so the output of the last step can be gathered by index.
        val2 = tf.transpose(val, [1, 0, 2])
        last = tf.gather(val2, int(val2.get_shape()[0]) - 1)

        out_weight = tf.Variable(tf.random_normal([num_hidden, num_classes]))
        out_bias = tf.Variable(tf.random_normal([num_classes]))
        l2_loss += tf.nn.l2_loss(out_weight)
        l2_loss += tf.nn.l2_loss(out_bias)

        with tf.name_scope("output"):
            self.scores = tf.nn.xw_plus_b(last, out_weight, out_bias, name="scores")
            self.predictions = tf.nn.softmax(self.scores, name="predictions")

        with tf.name_scope("loss"):
            self.losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            # Mean cross-entropy plus the weighted L2 penalty.
            self.loss = tf.add(tf.reduce_mean(self.losses), l2_reg_lambda * l2_loss, name="loss")

        with tf.name_scope("accuracy"):
            self.correct_pred = tf.equal(tf.argmax(self.predictions, 1), tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, "float"), name="accuracy")

        print("(!) LOADED CNN-LSTM! #")


In [None]:
import random
import sys
import os

file_name = "Sentiment Analysis Dataset.csv"
count = 1000

subscript = 1

# Probe exactly the path that is opened below, so an existing sample file
# bumps the subscript instead of being appended to.
while os.path.isfile("good" + str(count) + '_' + str(subscript)):
    subscript += 1

with open(file_name, 'r', encoding="utf8") as source_file:
    t_file = list(source_file)

good_count = 0
bad_count = 0

# The context manager flushes and closes both outputs when sampling ends.
with open("good" + str(count) + '_' + str(subscript), 'a', encoding="utf8") as good_file, \
     open("bad" + str(count) + '_' + str(subscript), 'a', encoding="utf8") as bad_file:
    print("Opened files")

    # Visit every line once in random order: no duplicates are written, and
    # the loop ends even when a class has fewer than `count` lines.
    for line in random.sample(t_file, len(t_file)):
        if good_count >= count and bad_count >= count:
            break
        line_split = line.split(',', 2)
        try:
            label = int(line_split[1])
        except (IndexError, ValueError):
            # Header row, or a line whose second field is not an integer label.
            continue
        if label and good_count < count:
            good_file.write(line)
            good_count += 1
        elif not label and bad_count < count:
            bad_file.write(line)
            bad_count += 1


In [None]:
import numpy as np
import time
import datetime
from tensorflow.contrib import learn
from IPython import embed

# Parameters
# Fraction of the shuffled examples held out as the test split (see dev_sample_index below).
dev_size = .10

# Model Hyperparameters
# embedding_dim, filter_sizes, num_filters and l2_reg_lambda are passed to CNN_LSTM in the training cell.
embedding_dim = 32  #128
# NOTE(review): max_seq_length is not referenced anywhere else in the visible notebook.
max_seq_length = 70
filter_sizes = [3, 4, 5] #3
num_filters = 32
# Fed as dropout_keep_prob on every training step.
dropout_prob = 0.5 #0.5
l2_reg_lambda = 0.0
# NOTE(review): use_glove is not referenced anywhere else in the visible notebook.
use_glove = True #00 we use glove

# Training parameters
batch_size = 128
num_epochs = 10 #200
# Evaluate / checkpoint whenever the global step is a multiple of these values.
evaluate_every = 100 #100
checkpoint_every = 10000 #100
num_checkpoints = 1 #Checkpoints to store

# Misc Parameters
# Both flags are forwarded to tf.ConfigProto in the training cell.
allow_soft_placement = True
log_device_placement = False

# Data Preparation
filename = "Sentiment Analysis Dataset.csv"
goodfile = "good_file"
badfile = "bad_file"

# Load data
print("Loading data...")
# 5000 is the per-class limit; get_dataset shuffles with the `random` module, which is not seeded here.
x_text, y = get_dataset(goodfile, badfile, 5000)

# TODO: MAX LENGTH
# Build vocabulary
# Longest cleaned sentence, measured in space-separated tokens.
max_document_length = max([len(x.split(" ")) for x in x_text])
# NOTE(review): assumes VocabularyProcessor yields fixed-length id sequences of max_document_length -- confirm.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
# Fixed seed so the permutation below is reproducible.
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
# Negative index: the last dev_size fraction of the shuffled rows becomes the test set.
dev_sample_index = -1 * int(dev_size * float(len(y)))
x_train, x_test = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_test = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

print("Train/Test split: {:d}/{:d}".format(len(y_train), len(y_test)))


In [None]:
# Build the graph, then train and periodically evaluate the CNN-LSTM.
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=allow_soft_placement,
        log_device_placement=log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Input width and label width are taken from the prepared arrays.
        cnn = CNN_LSTM(
            x_train.shape[1],
            y_train.shape[1],
            len(vocab_processor.vocabulary_),
            embedding_dim,
            filter_sizes,
            num_filters,
            l2_reg_lambda=l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        # TRAINING STEP
        def train_step(x_batch, y_batch, save=False):
            """Run one optimisation step on a batch and print loss/accuracy.

            When `save` is true the merged train summaries are also written
            to `train_summary_writer`.
            """
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: dropout_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if save:
                train_summary_writer.add_summary(summaries, step)

        # EVALUATE MODEL
        def test_step(x_batch, y_batch, writer=None, save=False):
            """Evaluate on a batch without updating weights and print loss/accuracy.

            Summaries are written only when `save` is true and a `writer` is given.
            """
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                # Keep every unit at evaluation time: dropout is a training-only
                # regulariser and would otherwise make the metrics noisy.
                cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if save:
                if writer:
                    writer.add_summary(summaries, step)

        # CREATE THE BATCHES GENERATOR
        batches = gen_batch(list(zip(x_train, y_train)), batch_size, num_epochs)

        # TRAIN FOR EACH BATCH
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % evaluate_every == 0:
                print("\nEvaluation:")
                test_step(x_test, y_test, writer=dev_summary_writer)
                print("")
            if current_step % checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))


In [None]:
# NOTE(review): this cell repeats the training loop of the previous cell and relies on the
# names that cell leaves behind (sess, global_step, train_step, test_step, saver,
# checkpoint_prefix, dev_summary_writer). It fails on a fresh kernel unless that cell ran first.
# CREATE THE BATCHES GENERATOR
# A fresh generator: another num_epochs passes over the training data.
batches = gen_batch(list(zip(x_train, y_train)), batch_size, num_epochs)

# TRAIN FOR EACH BATCH
for batch in batches:
    # Unzip the batch of (x, y) pairs into parallel tuples.
    x_batch, y_batch = zip(*batch)
    train_step(x_batch, y_batch)
    current_step = tf.train.global_step(sess, global_step)
    # Periodic evaluation on the held-out split.
    if current_step % evaluate_every == 0:
        print("\nEvaluation:")
        test_step(x_test, y_test, writer=dev_summary_writer)
        print("")
    # Periodic checkpoint of the session variables.
    if current_step % checkpoint_every == 0:
        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
        print("Saved model checkpoint to {}\n".format(path))
