## Text Analysis Workshop

## 1. Install and Import Packages 

In [1]:
#import all the necessary packages for the workshop

import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
import nltk 

# later in the workshop...
from sklearn.feature_extraction.text import CountVectorizer # used to create a bag-of-words
from nltk.corpus import stopwords as stopwords # allows us to remove stopwords 
from wordcloud import WordCloud # wordcloud visualisation library
from textblob import TextBlob # used for sentiment analysis

# allows us to run matplotlib inline 
%matplotlib inline

In [None]:
# Setup notes only: this cell evaluates a string literal and installs nothing.
# Run the commands below in a terminal before starting the notebook.
# NOTE(review): the import cell also needs numpy, pandas, scikit-learn and
# wordcloud, which these notes do not cover -- install them the same way.
"""
to install nltk, in terminal run this:
$ pip3 install nltk

$ python3
> import nltk
> nltk.download("stopwords")

to install matplotlib:
$ python3 -mpip install matplotlib

to install textblob:
$ sudo pip3 install textblob
"""


## 2. Load data 

In [2]:
# Read the exported tweet CSV, resolved relative to the current working directory.
csv_path = './apple_tweets - apple_tweets.csv'
tweets = pd.read_csv(csv_path)

Let's understand what the data looks like...

In [None]:
# Preview the first rows to see which columns the CSV provides.
tweets.head()

In [None]:
# (rows, columns) of the loaded frame.
tweets.shape

## 3. Data Cleaning & Transformation

First, we copy each tweet into a new `raw` column so that the original text is preserved while we clean the `text` column.

In [3]:
# Keep an untouched copy of each tweet; the cleaning cells that follow overwrite 'text'.
tweets['raw'] = tweets['text']

### Lower case all text 

In [None]:
# Tweet 0 before lower-casing
tweets.loc[0, 'text']

In [None]:
# Normalise case so that differently-cased spellings count as the same token.
lowercased = tweets['text'].str.lower()
tweets['text'] = lowercased
# Tweet 0 after lower-casing
tweets.loc[0, 'text']


### Remove URLs

In [None]:
# Tweet 54 before URL removal
tweets.loc[54, 'text']

In [None]:
# Strip links: "http" followed by everything up to the next whitespace.
# Pattern source: https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave every URL in place.
# The raw string keeps "\S" from being read as an (invalid) Python escape.
tweets['text'] = tweets['text'].str.replace(r'http\S+', '', regex=True)

In [None]:
# Tweet 54 after URL removal
tweets.loc[54, 'text']

### Remove punctuation

In [None]:
# Tweet 0 before punctuation removal
tweets.loc[0, 'text']

In [None]:
# Drop every character that is neither a word character (\w) nor whitespace (\s).
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so no punctuation would be removed.
# The raw string keeps "\w" / "\s" from being read as invalid Python escapes.
tweets['text'] = tweets['text'].str.replace(r'[^\w\s]', '', regex=True)
# after
tweets['text'][0]

### Remove stop words

In [None]:
# Tweet 0 before stop-word removal
tweets.loc[0, 'text']

In [None]:
from nltk.corpus import stopwords as stopwords

stop = stopwords.words('english')
# Membership is tested once per token of every tweet, so use a set (O(1) lookup)
# instead of scanning the stop-word list each time.
stop_set = set(stop)
# NOTE(review): punctuation was stripped in an earlier cell, so contractions
# such as "don't" may now read "dont" and slip past this filter -- confirm
# whether that matters for the analysis.
tweets['text'] = tweets['text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_set)
)

tweets['text'][0]

## 4. Bag-of-Words model

There are many techniques we can use to model our corpus. One of the most common, and easiest to implement, is the bag-of-words model.

This, in essence, involves turning our corpus into a matrix in which the rows represent the tweets (also called 'documents') and the columns represent our words, or terms. This is aptly called a document-term matrix.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and build the document-term matrix in one call.
vec = CountVectorizer()
bag_of_words = vec.fit_transform(tweets['text'])

# Column totals: how often each term occurs across all tweets.
sum_words = bag_of_words.sum(axis=0)

# Pair every term with its total, then order from most to least frequent.
words_freq = [(term, sum_words[0, col]) for term, col in vec.vocabulary_.items()]
words_freq.sort(key=lambda pair: pair[1], reverse=True)

# ignoring top word (which is "apple" in this case); the slice keeps ranks 2-30
top_words = words_freq[1:30]

for term, total in top_words:
    print(term, total)
    

## 5. Visualizing Data

### Bar plot for the most common words

In [None]:
words_df = pd.DataFrame(top_words, columns=['word', 'count'])

# Create the figure and axes up front and hand the axes to pandas. Without
# ax=..., DataFrame.plot opens a new default-sized figure, so a figure created
# beforehand stays blank and its figsize never reaches the chart.
fig, ax = plt.subplots(figsize=(30, 20))
words_df.plot(kind='bar', x='word', y='count', ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel("Top words")
ax.set_ylabel("Word count")
plt.show()

### Word cloud

In [None]:
from wordcloud import WordCloud

# wordcloud requires a dict with { word : frequency }; counts are cast to int
words_dict = {term: int(total) for term, total in top_words}

cloud_builder = WordCloud(width=1000, height=500, background_color="white")
wordcloud = cloud_builder.generate_from_frequencies(words_dict)

# Render the generated cloud as an image without axis ticks.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

## 6. Gauging Sentiment

The `sentiment` property of a TextBlob returns two values: polarity and subjectivity.
Polarity is a float in the range [-1, 1], where 1 means a positive statement and -1 means a negative statement.
Subjective sentences generally express personal opinion, emotion, or judgment, whereas objective sentences focus on factual information. Subjectivity is also a float, in the range [0, 1].

Good writeup for how sentiment is calculated: https://planspace.org/20150607-textblob_sentiment/ 

In [None]:
from textblob import TextBlob

# Score the preserved original tweets ('raw'), not the cleaned 'text' column.
# Stop-word removal drops negations ("not", "no") and intensifiers ("very"),
# which TextBlob uses to flip or scale polarity, so scoring cleaned text can
# turn "not good" into "good".
sentiments = [TextBlob(tweet).sentiment.polarity for tweet in tweets['raw']]

tweets['sentiments'] = sentiments

In [None]:
# Rank tweets from highest to lowest polarity and renumber the rows from 0.
positive = (
    tweets
    .sort_values(by='sentiments', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Print the 20 highest-scoring tweets, one per line.
for tweet_text in positive['text'].iloc[:20]:
    print(tweet_text)

In [None]:
# Rank tweets from lowest to highest polarity and renumber the rows from 0.
negative = (
    tweets
    .sort_values(by='sentiments', ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Print the 20 lowest-scoring tweets, one per line.
for tweet_text in negative['text'].iloc[:20]:
    print(tweet_text)