In [9]:
import re
import string
from random import shuffle

import nltk
import pandas as pd
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm.notebook import tqdm

from nltk.corpus import stopwords
from nltk import FreqDist

from nltk import classify
from nltk import NaiveBayesClassifier

import pickle

import numpy as np

import tensorflow as tf

In [10]:
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())

    return cleaned_tokens


def get_tweets_for_model(cleaned_tokens):
    for tweet_tokens in cleaned_tokens:
        yield dict([token, True] for token in tweet_tokens)

In [8]:
print("LOADING DATA...")

stop_words = stopwords.words('english')

data = pd.read_csv('training_data/data.csv', encoding='latin-1')

# Sentiments: 0 = Negative, 2 = Neutral, 4 = Positive
data.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Get rid of unnecessary columns
data = data.drop(['id', 'date', 'query', 'user'], axis=1)

# Split the data into positive and negative sets based on sentiment and isolate the text
positive_data = list(data[data['sentiment'] == 4]['text'])
negative_data = list(data[data['sentiment'] == 0]['text'])

# Save space lol
del data

# Shuffle the data
shuffle(positive_data)
shuffle(negative_data)

data_used = .001

positive_tokenized = [nltk.tokenize.word_tokenize(text) for text in
                      tqdm(positive_data[:int(data_used * len(positive_data))], desc="Tokenizing Positive Data")]
negative_tokenized = [nltk.tokenize.word_tokenize(text) for text in
                      tqdm(negative_data[:int(data_used * len(negative_data))], desc="Tokenizing Negative Data")]

positive_lemmatized = [remove_noise(tokens, stop_words) for tokens in
                       tqdm(positive_tokenized, desc="Lemmatizing Positive Data")]
negative_lemmatized = [remove_noise(tokens, stop_words) for tokens in
                       tqdm(negative_tokenized, desc="Lemmatizing Negative Data")]

training_data = []
for i in positive_lemmatized:
    training_data.append((i, 1))
for i in negative_lemmatized:
    training_data.append((i, 0))

shuffle(training_data)

amount = .7
positive_cutoff = int(amount * len(positive_lemmatized))

testing_data = training_data[positive_cutoff:]
training_data = training_data[:positive_cutoff]

shuffle(testing_data)
shuffle(training_data)

LOADING DATA...


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [11]:
data = pd.read_csv('training_data/data.csv', encoding='latin-1')

# Sentiments: 0 = Negative, 2 = Neutral, 4 = Positive
data.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Get rid of unnecessary columns
data = data.drop(['id', 'date', 'query', 'user'], axis=1)

positive_data = data[data['sentiment'] == 4]
negative_data = data[data['sentiment'] == 0]

In [12]:
positive_data['sentiment'] = [1 for x in range(len(positive_data['sentiment']))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_data['sentiment'] = [1 for x in range(len(positive_data['sentiment']))]


In [13]:
split = .7

training_data = positive_data[:int(split*len(positive_data))].append(negative_data[:int(split*len(negative_data))])
testing_data = positive_data[int(split*len(positive_data)):].append(negative_data[int(split*len(negative_data)):])

training_texts, training_labels = list(training_data['text']), list(training_data['sentiment'])
testing_texts, testing_labels = list(testing_data['text']), list(testing_data['sentiment'])

  training_data = positive_data[:int(split*len(positive_data))].append(negative_data[:int(split*len(negative_data))])
  testing_data = positive_data[int(split*len(positive_data)):].append(negative_data[int(split*len(negative_data)):])


In [14]:
vocab_size = 100000000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = '<OOV>'
padding_type = 'post'

tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(training_texts)
word_index = tokenizer.word_index

In [15]:
sequences = tokenizer.texts_to_sequences(training_texts)
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
testing_texts = tokenizer.texts_to_sequences(testing_texts)
testing_padded = tf.keras.preprocessing.sequence.pad_sequences(testing_texts, maxlen=max_length)

In [None]:
vocab_size = 40000

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [None]:
num_epochs = 20
history = model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))