In [None]:
import pandas as pd
import csv
import json
import itertools
import matplotlib.pyplot as plt
import math

In [None]:
def load_data(filename):
  """Read a JSON file and return the decoded object.

  Args:
    filename: Path of the JSON file to read.

  Returns:
    Whatever ``json.load`` decodes from the file contents.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file does not hold valid JSON.
  """
  # JSON text is UTF-8 by specification. Without an explicit encoding open()
  # uses the platform locale, which can mis-decode or reject non-ASCII text.
  with open(filename, encoding='utf-8') as file:
    return json.load(file)

In [None]:
def write_result_to_csv(result, filename):
  """Write a mapping to ``filename`` as a two-row CSV.

  The first row holds the keys of ``result`` and the second row the
  corresponding values, in the mapping's iteration order. An existing file
  is overwritten.

  Args:
    result: Mapping whose keys become the header row.
    filename: Path of the CSV file to create.
  """
  # Keys may be arbitrary words; pin UTF-8 so the output does not depend on
  # (or fail under) the platform's default encoding. newline='' lets the csv
  # module control line endings itself.
  with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows([list(result.keys()), list(result.values())])

In [None]:
def plot_data(result, filename):
  """Draw the entries of ``result`` as a line plot and save it to ``filename``.

  Keys of ``result`` are used as x values and their mapped values as y values.
  Drawing goes through the pyplot state machine, i.e. onto the current figure.
  """
  labels = list(result)
  heights = [result[label] for label in labels]

  plt.plot(labels, heights)
  plt.title('Histogram')
  plt.xlabel('Word')
  plt.ylabel('Count')

  plt.savefig(filename)

In [None]:
# Whole preprocessed corpus first, then one JSON file per emotion label.
all_tweets = load_data('all_tweets_preprocessed.json')
(
  sadness_tweets,
  happiness_tweets,
  fear_tweets,
  anger_tweets,
  disgust_tweets,
  surprise_tweets,
  neutral_tweets,
) = map(load_data, (
  'sadness_tweets.json',
  'happiness_tweets.json',
  'fear_tweets.json',
  'anger_tweets.json',
  'disgust_tweets.json',
  'surprise_tweets.json',
  'neutral_tweets.json',
))

In [None]:
# Number of tweets per emotion subset, plus the size of the whole corpus.
data_count = {
  label: len(tweets)
  for label, tweets in (
    ('Sadness', sadness_tweets),
    ('Happiness', happiness_tweets),
    ('Fear', fear_tweets),
    ('Anger', anger_tweets),
    ('Disgust', disgust_tweets),
    ('Surprise', surprise_tweets),
    ('Neutral', neutral_tweets),
    ('Total', all_tweets),
  )
}
write_result_to_csv(data_count, 'tweet_count.csv')

In [None]:
def count_sentences(tweet_set):
  """Return the total length of the 'sentence_broken' lists over all tweets.

  Args:
    tweet_set: Mapping of tweet id to a record that has a 'sentence_broken'
      entry supporting ``len()``.
  """
  return sum(len(tweet['sentence_broken']) for tweet in tweet_set.values())

# Sentence totals per emotion subset, plus the whole corpus.
sentence_count = {
  label: count_sentences(tweets)
  for label, tweets in (
    ('Sadness', sadness_tweets),
    ('Happiness', happiness_tweets),
    ('Fear', fear_tweets),
    ('Anger', anger_tweets),
    ('Disgust', disgust_tweets),
    ('Surprise', surprise_tweets),
    ('Neutral', neutral_tweets),
    ('Total', all_tweets),
  )
}

write_result_to_csv(sentence_count, 'sentence_count.csv')

In [None]:
def count_words(tweet_set):
  """Return the total length of the 'word_broken' lists over all tweets.

  Args:
    tweet_set: Mapping of tweet id to a record that has a 'word_broken'
      entry supporting ``len()``.
  """
  return sum(len(tweet['word_broken']) for tweet in tweet_set.values())

# Word totals (repeats included) per emotion subset, plus the whole corpus.
word_count = {
  label: count_words(tweets)
  for label, tweets in (
    ('Sadness', sadness_tweets),
    ('Happiness', happiness_tweets),
    ('Fear', fear_tweets),
    ('Anger', anger_tweets),
    ('Disgust', disgust_tweets),
    ('Surprise', surprise_tweets),
    ('Neutral', neutral_tweets),
    ('Total', all_tweets),
  )
}

write_result_to_csv(word_count, 'word_count.csv')

In [None]:
def count_unique_words(tweet_set):
  """Return how many distinct entries appear across all 'word_broken' lists.

  Args:
    tweet_set: Mapping of tweet id to a record with an iterable
      'word_broken' entry of hashable items.
  """
  vocabulary = set()
  for tweet in tweet_set.values():
    vocabulary.update(tweet['word_broken'])
  return len(vocabulary)

# Distinct-word totals per emotion subset, plus the whole corpus.
unique_word_count = {
  label: count_unique_words(tweets)
  for label, tweets in (
    ('Sadness', sadness_tweets),
    ('Happiness', happiness_tweets),
    ('Fear', fear_tweets),
    ('Anger', anger_tweets),
    ('Disgust', disgust_tweets),
    ('Surprise', surprise_tweets),
    ('Neutral', neutral_tweets),
    ('Total', all_tweets),
  )
}

write_result_to_csv(unique_word_count, 'unique_word_count.csv')

In [None]:
def _vocabulary(tweet_set):
  """Return the set of distinct entries across every tweet's 'word_broken' list."""
  return {word for tweet in tweet_set.values() for word in tweet['word_broken']}


def count_common_words():
  """Count corpus words shared by all seven emotion subsets versus the rest.

  Reads the module-level ``all_tweets`` and the seven per-emotion tweet
  mappings (sadness, happiness, fear, anger, disgust, surprise, neutral).

  Returns:
    A ``(common, uncommon)`` tuple. ``common`` is the number of distinct
    words of ``all_tweets`` that occur in every one of the seven emotion
    subsets; ``uncommon`` is the number of remaining distinct words of
    ``all_tweets``.
  """
  emotion_sets = (
    sadness_tweets,
    happiness_tweets,
    fear_tweets,
    anger_tweets,
    disgust_tweets,
    surprise_tweets,
    neutral_tweets,
  )
  # Set intersection replaces per-word membership scans over seven lists,
  # turning a quadratic pass into a linear one with the same result.
  shared_by_all = set.intersection(*(_vocabulary(tweets) for tweets in emotion_sets))

  all_words = _vocabulary(all_tweets)
  common_words = len(all_words & shared_by_all)

  return common_words, len(all_words) - common_words

# Split the corpus vocabulary into words present in every emotion subset
# and all remaining words, then export both totals.
common, uncommon = count_common_words()

common_and_uncommon_word_count = dict(Common=common, Uncommon=uncommon)

write_result_to_csv(common_and_uncommon_word_count, 'common_and_uncommon_word_count.csv')

In [None]:
def unique_word_count_histogram(tweet_set):
  """Count how often each word occurs across a tweet set.

  Args:
    tweet_set: Mapping of tweet id to a record with an iterable
      'word_broken' entry.

  Returns:
    A dict of word -> occurrence count, ordered from most to least frequent.
    Words with equal counts keep the order in which they were first seen.
    The single-space token ' ' is not counted.
  """
  counts = {}
  for tweet in tweet_set.values():
    for word in tweet['word_broken']:
      if word == ' ':
        continue
      counts[word] = counts.get(word, 0) + 1

  # sorted() is stable, so equally frequent words stay in first-seen order.
  return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

In [None]:
def get_docs_words():
  """Return each emotion label mapped to the list of words in its tweet subset.

  Reads the seven module-level per-emotion tweet mappings. Every list holds
  the keys of ``unique_word_count_histogram`` for that subset, in the order
  that function returns them.
  """
  labelled_subsets = (
    ('Sadness', sadness_tweets),
    ('Happiness', happiness_tweets),
    ('Anger', anger_tweets),
    ('Fear', fear_tweets),
    ('Disgust', disgust_tweets),
    ('Surprise', surprise_tweets),
    ('Neutral', neutral_tweets),
  )
  return {
    label: list(unique_word_count_histogram(tweets).keys())
    for label, tweets in labelled_subsets
  }

In [None]:
def TermFrequency(document_words_count):
    """Convert raw word counts into relative frequencies.

    Args:
        document_words_count: Mapping of word -> occurrence count.

    Returns:
        A dict of word -> count divided by the sum of all counts.
    """
    total = sum(document_words_count.values())
    return {word: count / total for word, count in document_words_count.items()}

def InverseDocumentFrequency(docs_words, target_labels_words):
    """Compute ``log2(N / d)`` for each target word.

    ``N`` is the number of documents in ``docs_words`` and ``d`` the number
    of those documents that contain the word.

    Args:
        docs_words: Mapping of document label -> iterable of the (hashable)
            words in that document.
        target_labels_words: Iterable of words to score.

    Returns:
        A dict of word -> inverse document frequency. Words that occur in no
        document are left out, because their IDF is undefined (it would
        require dividing by zero).
    """
    total_docs = len(docs_words)
    # Build one set per document up front so each membership test is a hash
    # lookup rather than a linear scan of the document's word list.
    doc_vocabularies = [set(words) for words in docs_words.values()]

    idf = {}
    for w in target_labels_words:
        doc_freq = sum(1 for vocabulary in doc_vocabularies if w in vocabulary)
        if doc_freq == 0:
            continue
        idf[w] = math.log2(total_docs / doc_freq)
    return idf

def Tf_Idf(tf, idf, top_n=8):
    """Return the highest-scoring words by TF-IDF.

    Args:
        tf: Mapping of word -> term frequency.
        idf: Mapping of word -> inverse document frequency. It must contain
            every word of ``tf``.
        top_n: Maximum number of entries to return; ``None`` returns all of
            them. Defaults to 8.

    Returns:
        A dict of word -> ``tf * idf`` rounded to 4 decimals, ordered from
        highest to lowest score and truncated to ``top_n`` entries. Words
        with equal scores keep their order from ``tf``.

    Raises:
        KeyError: If a word of ``tf`` is missing from ``idf``.
        ValueError: If ``top_n`` is negative.
    """
    scores = {w: round(tf[w] * idf[w], 4) for w in tf}
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return dict(itertools.islice(ranked, top_n))

# The IDF table depends only on the emotion documents and the corpus
# vocabulary, so compute it once instead of once per exported file.
corpus_idf = InverseDocumentFrequency(
    get_docs_words(),
    list(unique_word_count_histogram(all_tweets).keys()),
)

# One CSV of top TF-IDF words per emotion subset.
for emotion_tweets, tfidf_filename in (
    (sadness_tweets, 'sadness_tfidf.csv'),
    (happiness_tweets, 'happiness_tfidf.csv'),
    (anger_tweets, 'anger_tfidf.csv'),
    (fear_tweets, 'fear_tfidf.csv'),
    (disgust_tweets, 'disgust_tfidf.csv'),
    (surprise_tweets, 'surprise_tfidf.csv'),
    (neutral_tweets, 'neutral_tfidf.csv'),
):
    emotion_tf = TermFrequency(unique_word_count_histogram(emotion_tweets))
    write_result_to_csv(Tf_Idf(emotion_tf, corpus_idf), tfidf_filename)

In [None]:
# Plot the first ten entries of the corpus-wide word histogram, which
# unique_word_count_histogram returns ordered by descending count.
plot_data(
  dict(list(unique_word_count_histogram(all_tweets).items())[:10]),
  'top_unique_words.png',
)