<a href="https://colab.research.google.com/github/rsalmin/azmq/blob/master/NLP_Play.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

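Playing with NLP, following a Coursera course: classifying the sentiment of NLTK sample tweets with a single sigmoid unit (logistic regression) on word-frequency features.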

In [1]:
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import nltk
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Download the NLTK corpora this notebook relies on.
nltk.download('twitter_samples')
nltk.download('stopwords')
twitter_samples.ensure_loaded()
print(twitter_samples.fileids())

# Raw tweet texts, split by sentiment label.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
# English stop words; preprocess_tweets filters these out of every tweet.
swords = stopwords.words('english')

# Quick sanity check of what was loaded.
print(f'positive tweets {len(all_positive_tweets)}')
print(f'negative tweets {len(all_negative_tweets)}')
print(f'first positive tweet {all_positive_tweets[0]}')
print(f'first negative tweet {all_negative_tweets[0]}')

print(f'stop words {len(swords)}')
print(f'stop words list {swords}')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
positive tweets 5000
netative tweets 5000
first positive tweet #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
first negative tweet hopeless for tmr :(
stop words 179
stop words list ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'b

In [3]:
def preprocess_tweets(tweets, stop_words=None):
  """Tokenize tweets and drop stop words.

  Every tweet is split with NLTK's ``TweetTokenizer`` configured with
  ``strip_handles=True`` and ``preserve_case=False``; tokens that appear in
  the stop-word collection are then discarded.

  Args:
    tweets: iterable of raw tweet strings.
    stop_words: optional iterable of tokens to discard. When omitted, the
      notebook-level ``swords`` list is used.

  Returns:
    A list holding, for each input tweet (in input order), the list of
    tokens that survived the stop-word filter.
  """
  tokenizer = TweetTokenizer(strip_handles = True, preserve_case = False)
  # A set gives O(1) membership tests instead of scanning the stop-word
  # list once per token.
  excluded = frozenset(swords if stop_words is None else stop_words)

  return [
      [word for word in tokenizer.tokenize(tweet) if word not in excluded]
      for tweet in tweets
  ]


In [4]:
# Tokenize both corpora (positives first, then negatives) and show one
# example of each to eyeball the result.
positive_tweets, negative_tweets = map(
    preprocess_tweets, (all_positive_tweets, all_negative_tweets)
)
print(f'first preprocessed positive tweet {positive_tweets[0]}')
print(f'first preprocessed negative tweet {negative_tweets[0]}')

first preprocessed positive tweet ['#followfriday', 'top', 'engaged', 'members', 'community', 'week', ':)']
first preprocessed negative tweet ['hopeless', 'tmr', ':(']


In [5]:
def build_freqs(tweets):
  """Count how many times each token occurs across all tweets.

  Args:
    tweets: iterable of token sequences (one sequence per tweet).

  Returns:
    A plain dict mapping each token to its total number of occurrences.
    Keys appear in order of first occurrence.
  """
  counts = {}
  for tokens in tweets:
    for token in tokens:
      if token in counts:
        counts[token] += 1
      else:
        counts[token] = 1
  return counts

In [6]:
# The first 4000 tweets of each class are used for training; whatever
# follows is held out for testing.
positive_train_tweets, positive_test_tweets = positive_tweets[:4000], positive_tweets[4000:]
negative_train_tweets, negative_test_tweets = negative_tweets[:4000], negative_tweets[4000:]

# Token frequencies are computed from the training split only.
positive_freqs, negative_freqs = map(
    build_freqs, (positive_train_tweets, negative_train_tweets)
)

# Positives come first and are labelled 1; negatives follow and are labelled 0.
train_tweets = positive_train_tweets + negative_train_tweets
train_labels = [1] * len(positive_train_tweets) + [0] * len(negative_train_tweets)

test_tweets = positive_test_tweets + negative_test_tweets
test_labels = [1] * len(positive_test_tweets) + [0] * len(negative_test_tweets)

In [7]:
def make_features(tweets, positive_freqs, negative_freqs):
  """Turn each tokenized tweet into a two-element frequency feature vector.

  Args:
    tweets: iterable of token sequences (one sequence per tweet).
    positive_freqs: mapping token -> count; tokens not present count as 0.
    negative_freqs: mapping token -> count; tokens not present count as 0.

  Returns:
    A list with one NumPy array ``[positive_sum, negative_sum]`` per tweet,
    where each sum adds up the corresponding counts of the tweet's tokens
    (repeated tokens contribute once per occurrence).
  """

  def score(tokens):
    # Single pass over the tokens, accumulating both totals side by side.
    pos_total, neg_total = 0, 0
    for token in tokens:
      pos_total += positive_freqs.get(token, 0)
      neg_total += negative_freqs.get(token, 0)
    return np.array([pos_total, neg_total])

  return [score(tokens) for tokens in tweets]

In [8]:
# Both splits are scored against the frequency tables built from the
# training tweets.
train_features = make_features(
    train_tweets, positive_freqs=positive_freqs, negative_freqs=negative_freqs
)
test_features = make_features(
    test_tweets, positive_freqs=positive_freqs, negative_freqs=negative_freqs
)

In [9]:
# Peek at the [positive_sum, negative_sum] vector of the last training tweet
# (a negative example, because negatives are appended after positives).
train_features[-1]

array([ 164, 3958])

In [10]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# first import cell so all dependencies are visible up front.
import tensorflow as tf
# Display the installed TensorFlow version for reference.
tf.__version__

'2.4.0'

In [11]:
def build_model():
  """Create an uncompiled Keras model: 2 input features -> 1 sigmoid unit.

  Returns:
    A ``tf.keras.Model`` whose input has shape ``(2,)`` and whose output is
    a single ``Dense`` unit with sigmoid activation.
  """
  feature_input = tf.keras.Input(shape=(2,))
  sigmoid_unit = tf.keras.layers.Dense(1, activation = tf.nn.sigmoid)
  return tf.keras.Model(inputs = feature_input, outputs = sigmoid_unit(feature_input))


In [12]:
# Seed TensorFlow's global random generator so that training (e.g. the
# random weight initialisation) is repeatable across kernel restarts.
tf.random.set_seed(42)

model = build_model()
model.compile(optimizer = 'sgd', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Stack the per-tweet feature vectors / labels into arrays for Keras.
x = np.array(train_features)
y = np.array(train_labels)
# Left as the last expression so the returned History object is displayed.
model.fit(x, y, batch_size = 4000, verbose = 1, epochs = 10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4c937e10b8>

In [13]:
# Convert the held-out features / labels to arrays and score them with the
# trained model.
x_test, y_test = np.asarray(test_features), np.asarray(test_labels)
y_pred = model.predict(x_test)

In [23]:
# Threshold the sigmoid outputs at 0.5 and compare with the labels.
# Despite the name, a True entry in *_diffs marks a correct prediction.
train_diffs = (np.squeeze(model.predict(x)) >= 0.5) == y
test_diffs = (np.squeeze(y_pred) >= 0.5) == y_test
# The mean of a boolean array is the fraction of True entries, i.e. accuracy.
train_accuracy = train_diffs.mean()
test_accuracy = test_diffs.mean()
print(f'test_accuracy = {test_accuracy}; train_accuracy = {train_accuracy}')

test_accuracy = 0.9875; train_accuracy = 0.988625


In [39]:
# List every training tweet the model got wrong (False entries of train_diffs).
for i in np.flatnonzero(~train_diffs):
  print(f'MisTweet: {train_tweets[i]} Label:{train_labels[i]}')

MisTweet: ['act', 'mischievousness', ',', 'calling', 'etl', 'layer', 'in-house', 'warehousing', 'app', 'katamari', '.', 'well', '…', 'name', 'implies', ':p', '.'] Label:1
MisTweet: ['arummzz', ':', "let's", 'go', '...', "i'll", 'drive', ':p', '#traveling', '#traveler', '#yogyakarta', '#jeep', '#indonesia', '#instamood', '…', 'https://t.co/s7y4Ys5JeU'] Label:1
MisTweet: ["let's", 'go', '...', "i'll", 'drive', ':p', '#traveling', '#traveler', '#yogyakarta', '#jeep', '#indonesia', '#instamood', '…', 'https://t.co/s7y4Ys5JeU'] Label:1
MisTweet: ['...', 'friday', ':D', '(', ')', 'http://t.co/6rA4ZlpjO9'] Label:1
MisTweet: ['haha', '.', 'yes', '.', '24', 'hrs', '.', 'time', 'come', 'touch', 'kepler', '452b', '.', 'chalna', 'hai', '?', '?', ':D'] Label:1
MisTweet: ['lol', '😄', '😄', 'really', '?', "can't", 'believe', 'beautiful', 'girl', 'like', 'single', ':p'] Label:1
MisTweet: ['lol', 'tough', 'choice', 'tbh', 'really', 'like', 'every', 'song', ',', 'others', ':p'] Label:1
MisTweet: ['hahaha

In [41]:
# List every held-out tweet the model got wrong (False entries of test_diffs).
for i in np.flatnonzero(~test_diffs):
  print(f'MisTweet: {test_tweets[i]} Label:{test_labels[i]}')

MisTweet: ['#am', 'ca', '.', 'retweet', 'domg', 'nanti', 'difollow', '?', '(', ':D'] Label:1
MisTweet: ['beat', 'da', 'beat', 'sits', 'well', '(', 'includes', 'new', 'video', ')', ':-)', 'http://t.co/zJmOmpx7iv', '#mobilegame', '#ios8', '#android', 'http://t.co/LY7EnuUH8z'] Label:1
MisTweet: ['new', 'report', 'talks', 'burn', 'calories', 'cold', ',', 'work', 'harder', 'warm', '.', 'feel', 'better', 'weather', '?', ':p'] Label:1
MisTweet: ['harry', 'niall', '-', '94', '(', 'harry', 'born', ')', 'ik', 'stupid', 'wanna', 'change', ':D', 'https://t.co/gHAt8ZDAfF'] Label:1
MisTweet: ['girl', ':', 'nice', 'wallet', '.', 'boy', ':', 'prada', 'hai', '..', 'girl', ':', 'rich', '..', 'boy', ':', 'stupid', ',', 'mean', 'bhai', 'ka', 'hai', '.', ':p', '#prada', '#punjabiswillgetit'] Label:1
MisTweet: ['bad', 'would', 'remind', 'exercise', '1:12', ':-)', '.', 'miss', '.', 'needs', 'come', 'back', '.', '.'] Label:1
MisTweet: ['always', 'part', ',', 'part', 'defenitely', '...', '♬', ':p'] Label:1
Mis