# Setup Environment

In [2]:
;; Enable stack traces
;; (clojupyter.misc.stacktrace/set-print-stacktraces! true)
(require '[clojupyter.misc.helper :as helper])

;; Hand each library coordinate to helper/add-dependencies, one call per
;; coordinate, in the order listed.
(doseq [coordinate '[[clojure-opennlp "0.5.0"]
                     [kixi/stats "0.5.0"]
                     [io.forward/clojure-mail "1.0.7"]
                     [clojure2d "1.1.0"]
                     [metasoarous/oz "1.5.0"]
                     [clj-time "0.15.0"]]]
  (helper/add-dependencies coordinate))

;; Printed only after every add-dependencies call above has returned.
(print "Done!")

Done!

nil

In [3]:
;; Load VADER from a local Maven repository.
;; The vader artifact must be installed under ./maven_repository (note the
;; underscore); that directory is the only repository listed for this lookup.
(do
    (require '[cemerick.pomegranate :refer [add-dependencies]])
    (add-dependencies
        :coordinates '[[local/vader "2.0.1"]]
        ;; The repository is given as a URI string built from the directory path.
        :repositories {"local/vader" (str (.toURI (java.io.File. "./maven_repository")))}))

{[local/vader "2.0.1"] nil}

In [4]:
;; Build namespace
;; Declares the working namespace for the rest of the notebook and pulls in
;; the Java classes and Clojure libraries the later cells refer to.
(ns drafts.sentiment_analysis
    ;; Java classes: the VADER analyzer with its English lexicon/tokenizer,
    ;; plus java.io / javax.mail / java.util classes.
    (:import [net.nunoachenriques.vader SentimentAnalysis]
             [net.nunoachenriques.vader.lexicon English]
             [net.nunoachenriques.vader.text TokenizerEnglish]
             [java.io FileInputStream File]
             [javax.mail Session]
             [javax.mail.internet MimeMessage]
             [java.util Properties])
    ;; Aliased libraries: stats (kixi), mail (clojure-mail), oz (plotting),
    ;; t / c (clj-time core and coercions).
    (:require [kixi.stats.core :as stats]
              [clojure-mail.core :as mail]
              [clojure-mail.message :refer (read-message)]
              [oz.notebook.clojupyter :as oz]
              [clj-time.core :as t]
              [clj-time.coerce :as c]
              )
    ;; Unqualified helpers referred directly into this namespace.
    (:use [clojure.repl :only (doc source)]
          [clojure.pprint :only (pprint)]
          [opennlp.nlp :only (make-sentence-detector)]))

;; Evaluate the current namespace so the cell output confirms the switch.
*ns*



#namespace[drafts.sentiment_analysis]

# Analyzing Sentiment with VADER

In [5]:
;; English lexicon and English tokenizer instances; both are passed to the
;; SentimentAnalysis constructor in a later cell.
(def language (English.))
(def tokenizer (TokenizerEnglish.))

#'drafts.sentiment_analysis/tokenizer

In [6]:
;; Analyzer instance used by every sentiment call in this notebook, built from
;; the `language` and `tokenizer` values.
(def sa (SentimentAnalysis. language tokenizer))

#'drafts.sentiment_analysis/sa

In [7]:
;; Smoke test: score one obviously positive sentence with the analyzer.
(.getSentimentAnalysis sa "Yay!! You are the best!")

{"negative" 0.0, "neutral" 0.261, "positive" 0.739, "compound" 0.8582}

# Reading Emails

In [8]:
;; Relative path of the maildir tree that the later cells read messages from.
(def maildir-path "data/enron_mail/maildir")

#'drafts.sentiment_analysis/maildir-path

In [9]:
;; Parse a single known message so its structure can be inspected.
;; The file is located relative to `maildir-path` rather than through a second
;; hard-coded copy of that path, so the two cannot drift apart.
(def sample-msg
    (-> (clojure.java.io/file maildir-path "arnold-j" "_sent_mail" "36.")
        (mail/file->message)
        (read-message)))

;; Pretty-print the parsed message map.
(pprint sample-msg)

{:cc (),
 :bcc (),
 :headers
 [{"Message-ID" "<33491127.1075857594966.JavaMail.evans@thyme>"}
  {"Date" "Tue, 21 Nov 2000 13:16:00 -0800 (PST)"}
  {"From" "john.arnold@enron.com"}
  {"To" "slafontaine@globalp.com"}
  {"Subject" "re:mkts"}
  {"Mime-Version" "1.0"}
  {"Content-Type" "text/plain; charset=us-ascii"}
  {"Content-Transfer-Encoding" "7bit"}
  {"X-From" "John Arnold"}
  {"X-To" "slafontaine@globalp.com @ ENRON"}
  {"X-cc" ""}
  {"X-bcc" ""}
  {"X-Folder" "\\John_Arnold_Dec2000\\Notes Folders\\'sent mail"}
  {"X-Origin" "Arnold-J"}
  {"X-FileName" "Jarnold.nsf"}],
 :date-sent #inst "2000-11-21T21:16:00.000-00:00",
 :date-received nil,
 :from ({:address "john.arnold@enron.com", :name nil}),
 :id "<33491127.1075857594966.JavaMail.evans@thyme>",
 :sender nil,
 :content-type "text/plain; charset=us-ascii",
 :multipart? false,
 :body
 {:content-type "text/plain; charset=us-ascii",
  :body
  "Hey:\nHaven't had the best of months.  Like you had some good positions but others \nwiped o

nil

# Read in Files

In [10]:
(defn get-files
    "Returns a lazy seq of path strings for every regular file at or below
    `start-path` whose whole path matches the regex `re`.

    Directories are skipped even when their path matches `re`: file-seq walks
    directories as well as files, and callers of this function open each
    returned path as a file."
    [start-path re]
    (->> start-path
         (clojure.java.io/as-file)
         (file-seq)
         ;; Keep regular files only; file-seq also yields every directory.
         (filter #(.isFile ^java.io.File %))
         (map #(.getPath ^java.io.File %))
         ;; re-matches requires the entire path string to match.
         (filter #(re-matches re %))))

#'drafts.sentiment_analysis/get-files

In [11]:
;; Transducer applied to each message file path: mail/file->message first,
;; then read-message on its result.
(def xform-msg-files
    (map (comp read-message mail/file->message)))

#'drafts.sentiment_analysis/xform-msg-files

In [12]:
;; Matches paths of the form data/enron_mail/maildir/<anything>/_sent_mail/<anything>.
(def sent-mail-re #"data\/enron_mail\/maildir\/.*\/_sent_mail\/.*")
;; Paths found under `maildir-path` whose full path matches `sent-mail-re`.
(def sent-msg-paths (get-files maildir-path sent-mail-re))

#'drafts.sentiment_analysis/sent-msg-paths

In [13]:
(defn msg-reduce
    "Reducing function that collects a flat summary of each parsed message.

    Arities:
      []       initial accumulator, an empty vector.
      [acc]    completion step, returns `acc` unchanged.
      [acc m]  appends a map with the keys :to, :from, :date-sent,
               :date-received, :subject and :body.

    For :to and :from only the :address of the first entry is kept; :body is
    the value nested at [:body :body] in `m`. Missing values come out as nil."
    ([] [])
    ([acc] acc)
    ([acc m]
        (let [first-address (fn [field] (:address (first (field m))))
              summary {:to            (first-address :to)
                       :from          (first-address :from)
                       :date-sent     (:date-sent m)
                       :date-received (:date-received m)
                       :subject       (:subject m)
                       :body          (:body (:body m))}]
            (conj acc summary))))

#'drafts.sentiment_analysis/msg-reduce

In [14]:
;; Run every path in `sent-msg-paths` through `xform-msg-files` and collect
;; the per-message summaries produced by `msg-reduce`.
(def msgs (transduce xform-msg-files msg-reduce sent-msg-paths))

#'drafts.sentiment_analysis/msgs

In [15]:
;; Number of message summaries collected.
(count msgs)

30237

# Add Message Sentiment

In [16]:
(defn remove-line-breaks
    "Returns `text` with every line break replaced by a single space.

    A break is a newline, optionally preceded by a carriage return, together
    with any whitespace around it, so consecutive breaks collapse into one
    space. Substituting a space, rather than deleting the break, keeps the
    last word of one line from being fused with the first word of the next."
    [text]
    (clojure.string/replace text #"\s*\r?\n\s*" " "))

#'drafts.sentiment_analysis/remove-line-breaks

In [17]:
;; Sentence detector created by opennlp from the model file ./models/en-sent.bin;
;; it is called on message body text in add-sentiment.
(def get-sentences (make-sentence-detector "./models/en-sent.bin"))

#'drafts.sentiment_analysis/get-sentences

In [18]:
(defn add-sentiment
    "Reducing function that tags each message with a mean sentiment score.

    Arities:
      []         initial accumulator, an empty vector.
      [acc]      completion step, returns `acc` unchanged.
      [acc msg]  appends `msg` with an added :avg-sentiment key.

    The :body of `msg` is split with `get-sentences`; each sentence has its
    line breaks removed and is scored with `sa`, and :avg-sentiment is the
    stats/mean of the sentences' \"compound\" values."
    ([] [])
    ([acc] acc)
    ([acc msg]
      (let [sentence->compound (comp (map remove-line-breaks)
                                     (map #(.getSentimentAnalysis sa %))
                                     (map #(get % "compound")))
            mean-compound (transduce sentence->compound
                                     stats/mean
                                     (get-sentences (:body msg)))]
        (conj acc (assoc msg :avg-sentiment mean-compound)))))

#'drafts.sentiment_analysis/add-sentiment

In [19]:
;; Score every message whose :body has a count below 4000; longer bodies are
;; left out before any sentiment work is done.
(def sentiment
    (transduce (filter #(< (count (:body %)) 4000))
               add-sentiment
               msgs))

#'drafts.sentiment_analysis/sentiment

# Plot Sentiment Over Time

In [20]:
;; Peek at the first ten scored messages, showing only the send date and score.
(map #(select-keys % [:date-sent :avg-sentiment])
     (take 10 sentiment))

({:date-sent #inst "2000-11-21T21:16:00.000-00:00", :avg-sentiment 0.08433029845808492} {:date-sent #inst "2001-02-20T22:15:00.000-00:00", :avg-sentiment 0.16147499904036522} {:date-sent #inst "2001-01-11T20:16:00.000-00:00", :avg-sentiment 0.06797179616151904} {:date-sent #inst "2000-07-17T07:50:00.000-00:00", :avg-sentiment 0.005300000309944153} {:date-sent #inst "2001-02-06T13:15:00.000-00:00", :avg-sentiment 0.01825161374384357} {:date-sent #inst "2000-08-17T11:48:00.000-00:00", :avg-sentiment 0.0} {:date-sent #inst "2001-03-26T11:08:00.000-00:00", :avg-sentiment 0.0} {:date-sent #inst "2000-09-27T11:12:00.000-00:00", :avg-sentiment 0.48599998156229657} {:date-sent #inst "2001-05-13T15:36:00.000-00:00", :avg-sentiment 0.21230000044618333} {:date-sent #inst "2000-10-18T09:47:00.000-00:00", :avg-sentiment 0.025407693133904383})

In [21]:
(defn same-day?
    "True when `t1` and `t2` are equal after each is floored to the day with
    clj-time's `floor`."
    [t1 t2]
    (let [start-of-day #(t/floor % t/day)]
        (t/equal? (start-of-day t1) (start-of-day t2))))

#'drafts.sentiment_analysis/same-day?

In [22]:
;; Transducer: scored message -> {:date <send date floored to the day>,
;;                                :avg-sentiment <unchanged score>}.
;; The date goes through clj-time (from-date, floor to day) and is converted
;; back with to-date.
(def xform-get-time-data
    (map (fn [msg]
           (let [day (-> (:date-sent msg)
                         (c/from-date)
                         (t/floor t/day)
                         (c/to-date))]
             (hash-map :date day
                       :avg-sentiment (:avg-sentiment msg))))))

#'drafts.sentiment_analysis/xform-get-time-data

In [23]:
;; Preview the day-level transform on the first five scored messages.
(->> sentiment
     (take 5)
     (eduction xform-get-time-data)
     (pprint))

({:date #inst "2000-11-21T00:00:00.000-00:00", :avg-sentiment 0.08433029845808492} {:date #inst "2001-02-20T00:00:00.000-00:00", :avg-sentiment 0.16147499904036522} {:date #inst "2001-01-11T00:00:00.000-00:00", :avg-sentiment 0.06797179616151904} {:date #inst "2000-07-17T00:00:00.000-00:00", :avg-sentiment 0.005300000309944153} {:date #inst "2001-02-06T00:00:00.000-00:00", :avg-sentiment 0.01825161374384357})


nil

In [24]:
(defn reduce-daily-sentiment
    "Reducing function that groups scores by date and averages each group.

    Arities:
      []       initial accumulator, an empty map.
      [acc]    completion step: returns a sorted map from each date to the
               stats/mean of the scores gathered for it.
      [acc x]  appends the :avg-sentiment of `x` to the vector of scores
               stored under the :date of `x`, starting a new vector the first
               time a date is seen."
    ([] {})
    ([acc]
     (into (sorted-map)
           (map (fn [[date scores]]
                  [date (transduce identity stats/mean scores)]))
           acc))
    ([acc x]
     (update acc (:date x) (fnil conj []) (:avg-sentiment x))))

#'drafts.sentiment_analysis/reduce-daily-sentiment

In [25]:
;; Sorted map of day -> mean sentiment: each scored message is reduced to its
;; day and score, then grouped and averaged by `reduce-daily-sentiment`.
(def average-sentiment-data (transduce xform-get-time-data reduce-daily-sentiment sentiment))

#'drafts.sentiment_analysis/average-sentiment-data

In [26]:
;; Number of distinct days that have an averaged score.
(count average-sentiment-data)

556

In [27]:
(defn average
  "Arithmetic mean of the numbers in `coll`: their sum divided by their count.
  An empty `coll` divides zero by zero and throws ArithmeticException."
  [coll]
  (/ (reduce + coll)
     (count coll)))

(defn moving-average
  "Simple moving average of `coll` over a trailing window of `period` items.

  Returns a lazy seq with exactly one entry per element of `coll`. The first
  `(dec period)` entries are nil because the window is not yet full; each
  later entry is the `average` of that element and the `(dec period)`
  elements before it. When `coll` has fewer than `period` elements, every
  entry is nil."
  [period coll]
  (let [padded (lazy-cat (repeat (dec period) nil)
                         (map average (partition period 1 coll)))]
    ;; Walk `coll` alongside the padded averages so the result can never be
    ;; longer than the input. Without this, a `coll` shorter than
    ;; `(dec period)` would still yield `(dec period)` nils.
    (map (fn [_ avg] avg) coll padded)))

#'drafts.sentiment_analysis/moving-average

In [56]:
;; One row per day for plotting, in the key order of `average-sentiment-data`:
;;   :date           the day as an ISO-8601 UTC instant string
;;   :avg-sentiment  that day's mean sentiment
;;   :moving-avg     30-day trailing average of the daily means (nil until the
;;                   window is full)
;; The date is rendered through `.toInstant` instead of `str` on the Date:
;; Date.toString depends on the JVM's default time zone and is not ISO-8601,
;; while the plot treats this field as temporal.
(def time-series-data
    (let [dates  (keys average-sentiment-data)
          scores (vals average-sentiment-data)]
        (map (fn [^java.util.Date date score smoothed]
               (hash-map :date (str (.toInstant date))
                         :avg-sentiment score
                         :moving-avg smoothed))
             dates
             scores
             (moving-average 30 scores))))
         

#'drafts.sentiment_analysis/time-series-data

In [58]:
;; (def line-plot
;;   {:data {:values time-series-data}
;;    :width 400
;;    :height 400
;;    :encoding {:x {:field "date", :type "temporal"}
;;               :y {:field "moving-avg"}}
;;    :mark {:type "line" :stroke "red"}})

;; Vega-Lite spec with two line layers over `time-series-data`, sharing the
;; temporal "date" x axis: the daily mean in light blue and the moving average
;; in green. Both y fields are declared quantitative explicitly, because
;; Vega-Lite versions before 5 require a `type` on every field definition.
(def layered-line-plot
    {:width 600
     :height 600
     :data {:values time-series-data}
     :layer [{:mark {:type "line", :stroke "lightblue"}
              :encoding {:x {:field "date", :type "temporal"}
                         :y {:field "avg-sentiment", :type "quantitative"}}},
             {:mark {:type "line", :stroke "green"}
              :encoding {:x {:field "date", :type "temporal"}
                         :y {:field "moving-avg", :type "quantitative"}}}]})

;; Render the plot: the layered spec is passed to oz/view!.
;; The single-line variant below is kept disabled.
;; (oz/view! line-plot)
(oz/view! layered-line-plot)