![Baylor Libraries Banner](https://github.com/Josh-Been/Sentiment-Per-Line/blob/master/Capture.PNG?raw=true "Baylor University Libraries")

# Facebook-Group-Analyzer

You will need a Facebook account to register for a Facebook application ID and application secret. Register here: https://developers.facebook.com/apps

This Jupyter Notebook downloads the posts of a Facebook group or fan page, along with the number of comments, likes, and shares for each post. It then creates bar charts showing overall and monthly social media metrics: conversation rate (comments per post), amplification rate (shares per post), and applause rate (likes per post).


**An extra-special thank you** to Max Woolf for his fantastic tutorial *How to Scrape Data From Facebook Page Posts for Statistical Analysis* (http://minimaxir.com/2015/07/facebook-scraper/). Significant functions from the tutorial are included here with only small adjustments.

## Enter the Following Information and then Run this Code Block:

1. Facebook Application ID
2. Facebook Application Secret
3. ID Number of the Facebook Group or Fan Page - paste the Facebook URL into https://lookup-id.com/ to quickly find the ID number

In [None]:
####################
# Fill in information below
#
# app_id / app_secret: credentials of the registered Facebook application.
#     They are joined as "app_id|app_secret" to form the access token that is
#     appended to every Graph API request URL in this notebook.
#     NOTE(review): avoid saving or sharing the notebook with real values filled in.
# page_id: ID of the group or fan page. It is used as the Graph API node in the
#     request URLs and as the name of the output folder.
app_id = ''
app_secret = ''
page_id = ''
#
####################

# imported libraries
import os, urllib2, time, json, datetime, pygal, copy
import unicodecsv as csv
from pygal.style import LightSolarizedStyle, CleanStyle
from IPython.display import SVG, HTML
import numpy as np
import pandas as pd

# Prefix used on every progress message printed by this notebook.
cursor = '  >>  '

# Output folder: <current working directory>/<page_id>, written with forward
# slashes. It is created on first use.
dir_loc = '/'.join([os.getcwd().replace('\\', '/'), page_id])
if not os.path.isdir(dir_loc):
    os.makedirs(dir_loc)

print('%s files will be saved to %s' % (cursor, dir_loc))

## Run this Code Block to Download Facebook Posts to .CSV Table

In [None]:

# Access token sent with every Graph API request: "<app id>|<app secret>".
access_token = '|'.join([app_id, app_secret])

# Monthly per-post averages, keyed by metric name. Each chart cell stores its
# own list here and the final comparison cell reads them all.
dict_freq = dict()

# HTML page that wraps a rendered pygal SVG (substituted for {pygal_render})
# and loads the pygal tooltip scripts so the chart is interactive.
html_pygal = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.io/pygal.js/latest/svg.jquery.js"></script>
  <script type="text/javascript" src="http://kozea.github.io/pygal.js/latest/pygal-tooltips.js"></script>
    <!-- ... -->
  </head>
  <body>
    <figure>
      {pygal_render}
    </figure>
  </body>
</html>
"""

def conf_ratio(d,col):
    """Aggregate per-post counts into monthly totals and monthly averages.

    Args:
        d: Mapping of publish time ('%Y-%m-%d %H:%M:%S' string) to a count
            for that post.
        col: Metric name. It no longer influences the result and is kept so
            existing callers do not need to change.

    Returns:
        A 5-tuple ``(sums, rates, names, q1, q2)``:
            sums  - list with the total count for each calendar month,
            rates - list with the mean count per post for each month
                    (NaN for a month that has no posts),
            names - list of 'YYYY-MM' labels, one per month,
            q1    - overall total, as a string,
            q2    - mean of the monthly rates, as a string.

    Raises:
        ValueError: if a key of ``d`` does not match the expected time format
            (raised by ``pd.to_datetime``).
    """
    # Nothing was collected: hand back empty series together with the summary
    # strings an empty total ('0') and an empty mean ('nan') convert to.
    if not d:
        return [], [], [], '0', 'nan'

    # Materialise the items so keys and values stay paired on Python 2 and 3.
    items = list(d.items())
    stamps = pd.to_datetime([key for key, _ in items], format='%Y-%m-%d %H:%M:%S')

    # Index the counts by calendar day (time of day dropped).
    series = pd.Series([value for _, value in items], index=stamps.normalize())

    # Month-end bins. An offset object is passed instead of the 'M' alias
    # because that alias is deprecated in recent pandas while its replacement
    # 'ME' is unknown to older releases; the object form works on both.
    monthly = series.resample(pd.offsets.MonthEnd())
    ser_sum = monthly.sum()
    ser_ratio = monthly.mean()

    frequencies_sum = ser_sum.values.tolist()
    frequencies_ratio = ser_ratio.values.tolist()

    # One 'YYYY-MM' label per monthly bin.
    names = [str(stamp.date())[:7] for stamp in ser_sum.index]

    # Overall figures shown in the chart titles.
    q1 = str(ser_sum.sum())
    q2 = str(ser_ratio.mean())
    return frequencies_sum, frequencies_ratio, names, q1, q2

def testFacebookPageData(page_id, access_token):
    """Request the page node from the Graph API (v2.4) and parse the reply.

    The decoded JSON is discarded and nothing is returned, so the call is
    only useful as a check that the request and the JSON parse succeed; any
    error raised by urllib2 or json propagates to the caller.

    Args:
        page_id: Graph API node ID of the page or group.
        access_token: Token appended to the request URL.
    """
    url = ''.join([
        "https://graph.facebook.com/v2.4",
        "/",
        page_id,
        "/?access_token=%s" % access_token,
    ])

    # Single attempt, no retry.
    response = urllib2.urlopen(urllib2.Request(url))
    json.loads(response.read())

def request_until_succeed(url):
    """Fetch ``url``, retrying every 5 seconds until an HTTP 200 arrives.

    Every failed attempt is reported on stdout, followed by a 5 second pause.
    That covers both an exception raised while opening the URL and a response
    whose status is not 200 (which previously retried immediately and
    silently). There is no retry limit: the call blocks for as long as the
    failure persists.

    Args:
        url: Full request URL.

    Returns:
        The raw body of the first response with status 200.
    """
    req = urllib2.Request(url)
    while True:
        try:
            response = urllib2.urlopen(req)
        except Exception as e:
            problem = e
        else:
            status = response.getcode()
            if status == 200:
                return response.read()
            problem = 'HTTP status %s' % status

        print('%s %s' % (cursor, problem))
        time.sleep(5)

        # NOTE(review): the URL is printed verbatim and the URLs built in this
        # file carry the access token, so it ends up in the notebook output.
        print('%s %s' % (cursor, "Error for URL %s: %s" % (url, datetime.datetime.now())))

def testFacebookPageFeedData(page_id, access_token):
    """Request the page's /feed edge from the Graph API (v2.4) and parse it.

    The decoded JSON is discarded and nothing is returned. The request goes
    through request_until_succeed, so the call does not return until a
    response has been obtained.

    Args:
        page_id: Graph API node ID of the page or group.
        access_token: Token appended to the request URL.
    """
    url = ''.join([
        "https://graph.facebook.com/v2.4",
        "/",
        page_id,
        "/feed",
        "/?access_token=%s" % access_token,
    ])

    json.loads(request_until_succeed(url))
    

def getFacebookPageFeedData(page_id, access_token, num_statuses):
    """Download the first page of a feed, including engagement summaries.

    Args:
        page_id: Graph API node ID of the page or group.
        access_token: Token appended to the request URL.
        num_statuses: Value sent as the ``limit`` query parameter.

    Returns:
        The decoded JSON response.
    """
    # Fields requested for each post; likes and comments are requested with
    # limit(1).summary(true) because only their summary totals are read later.
    query = "/?fields=message,link,created_time,type,name,id,likes.limit(1).summary(true),comments.limit(1).summary(true),shares&limit=%s&access_token=%s" % (num_statuses, access_token) # changed
    url = ''.join(["https://graph.facebook.com", "/", page_id, "/feed", query])

    return json.loads(request_until_succeed(url))

def clean(item):
    """Return ``item`` with every character outside code points 1..126 removed.

    NUL (0), DEL (127) and everything above 127 are dropped; all other
    characters are kept in their original order.
    """
    kept = []
    for ch in item:
        code = ord(ch)
        if code > 0 and code < 127:
            kept.append(ch)
    return ''.join(kept)

def processFacebookPageFeedStatus(status):
    """Flatten one post from the feed into a CSV row and record its metrics.

    Args:
        status: Dict for a single post as decoded from the Graph API reply.
            'id', 'type' and 'created_time' must be present; every other
            field is optional.

    Returns:
        A 9-tuple ``(status_id, status_message, link_name, status_type,
        status_link, status_published, num_likes, num_comments, num_shares)``.
        For a post created in 2003 or earlier the tuple holds nine empty
        strings and no metrics are recorded.

    Side effects:
        Stores the comment, share and like counts in the module-level dicts
        rate_conversion, rate_amplification and rate_applause, keyed by the
        formatted publish time.
    """
    global rate_conversion, rate_amplification, rate_applause

    # Top-level fields; the optional ones fall back to an empty string.
    status_id = status['id']
    raw_message = status['message'].encode('utf-8') if 'message' in status else ''
    status_message = clean(raw_message)
    link_name = status['name'].encode('utf-8') if 'name' in status else ''

    status_type = status['type']
    status_link = status['link'] if 'link' in status else ''

    # created_time arrives as UTC text with a literal +0000 suffix.
    published_utc = datetime.datetime.strptime(status['created_time'], '%Y-%m-%dT%H:%M:%S+0000')

    # A record dated 1927 was once returned by the API, so anything created
    # in 2003 or earlier is treated as bogus and becomes a blank row.
    if published_utc.year <= 2003:
        return ('',) * 9

    # Shift to CST (UTC-6) and format the way spreadsheet programs read best.
    published_cst = published_utc + datetime.timedelta(hours=-6)
    status_published = published_cst.strftime('%Y-%m-%d %H:%M:%S')

    # Nested counters; an absent key means zero.
    if 'likes' in status:
        num_likes = status['likes']['summary']['total_count']
    else:
        num_likes = 0
    if 'comments' in status:
        num_comments = status['comments']['summary']['total_count']
    else:
        num_comments = 0
    if 'shares' in status:
        num_shares = status['shares']['count']
    else:
        num_shares = 0

    # NOTE(review): the key has one-second resolution, so two posts published
    # within the same second overwrite each other in these dicts.
    rate_conversion[status_published] = num_comments
    rate_amplification[status_published] = num_shares
    rate_applause[status_published] = num_likes

    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_likes, num_comments, num_shares)

def scrapeFacebookPageFeedStatus(page_id, access_token):
    """Download every post of the feed and write them to a CSV file.

    The process working directory is changed to ``dir_loc`` first, so the
    file '<page_id>_facebook_statuses.csv' is created there. The feed is
    read 100 posts per request, following the 'next' paging link until the
    API stops supplying one. Progress is printed after every 100 posts.

    Args:
        page_id: Graph API node ID of the page or group.
        access_token: Token appended to the request URL.
    """
    header = ["status_id", "status_message", "link_name", "status_type", "status_link",
              "status_published", "num_likes", "num_comments", "num_shares"]

    os.chdir(dir_loc)
    with open('%s_facebook_statuses.csv' % page_id, 'wb') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)

        num_processed = 0   # running count of posts written so far
        scrape_starttime = datetime.datetime.now()

        print('%s %s' % (cursor, "Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime)))

        page = getFacebookPageFeedData(page_id, access_token, 100)

        while True:
            for status in page['data']:
                writer.writerow(processFacebookPageFeedStatus(status))

                # occasional progress output shows the scrape has not stalled
                num_processed += 1
                if num_processed % 100 == 0:
                    print('%s %s' % (cursor, "%s Statuses Processed: %s" % (num_processed, datetime.datetime.now())))

            # Stop as soon as the reply carries no usable 'next' link.
            paging = page['paging'] if 'paging' in page else {}
            next_url = paging['next'] if 'next' in paging else None
            if not next_url:
                break
            page = json.loads(request_until_succeed(next_url))

        elapsed = datetime.datetime.now() - scrape_starttime
        print('%s %s' % (cursor, 'Done!' + cursor + '%s Statuses Processed in %s' % (num_processed, elapsed)))

# Per-post metric stores, keyed by the post's formatted publish time:
# comments, shares and likes respectively. processFacebookPageFeedStatus
# fills them while the scrape below runs.
rate_conversion, rate_amplification, rate_applause = {}, {}, {}

scrapeFacebookPageFeedStatus(page_id, access_token)


## Bar Chart - Conversation Counts (total comments)

In [None]:
%matplotlib inline

bar_conversations_sum = pygal.Bar(style=LightSolarizedStyle, width=800, height=600, show_legend=False, human_readable=True, title='Conversations Total per Month\n(total comments)\nOverall: '+q1)
bar_conversations_ratio = pygal.Bar(style=LightSolarizedStyle, width=800, height=600, show_legend=False, human_readable=True, title='Conversations Ratio per Month\n(comments per post)\nOverall: '+q2)

col='Conversations'
frequency_sum, frequency_ratio, names, q1, q2 = conf_ratio(rate_conversion,col)
dict_freq[col]=frequency_ratio

for i in range(0,len(frequency_sum)-1):
    bar_conversations_sum.add(names[i], frequency_sum[i])
    bar_conversations_ratio.add(names[i], frequency_ratio[i])
    
bar_conversations_sum.render_to_file(page_id+'conversations_sum.svg')
bar_conversations_ratio.render_to_file(page_id+'conversations_ratio.svg')
HTML(html_pygal.format(pygal_render=bar_conversations_sum.render()).decode('utf-8'))

# Conversation Rates

In [None]:
# Display the monthly comments-per-post bar chart inside the HTML wrapper.
HTML(html_pygal.format(pygal_render=bar_conversations_ratio.render()).decode('utf-8'))

# Amplification Counts

In [None]:
# Monthly share totals and shares-per-post averages.
col = 'Amplifications'
frequency_sum, frequency_ratio, names, q1, q2 = conf_ratio(rate_amplification, col)
dict_freq[col] = frequency_ratio

# The two bar charts differ only in their title.
bar_options = dict(style=LightSolarizedStyle, width=800, height=600, show_legend=False, human_readable=True)
bar_amplifications_sum = pygal.Bar(title='Amplifications Total per Month\n(total shares)\nOverall: '+q1, **bar_options)
bar_amplifications_ratio = pygal.Bar(title='Amplifications Ratio per Month\n(shares per post)\nOverall: '+q2, **bar_options)

# One bar per month; the final month is left out of both charts.
for month, total, rate in zip(names[:-1], frequency_sum[:-1], frequency_ratio[:-1]):
    bar_amplifications_sum.add(month, total)
    bar_amplifications_ratio.add(month, rate)

# Save both charts as SVG files, then display the totals chart.
bar_amplifications_sum.render_to_file(page_id+'amplifications_sum.svg')
bar_amplifications_ratio.render_to_file(page_id+'amplifications_ratio.svg')

HTML(html_pygal.format(pygal_render=bar_amplifications_sum.render()).decode('utf-8'))

# Amplification Rates

In [None]:
# Display the monthly shares-per-post bar chart inside the HTML wrapper.
HTML(html_pygal.format(pygal_render=bar_amplifications_ratio.render()).decode('utf-8'))

# Applause Counts

In [None]:
# Monthly like totals and likes-per-post averages.
col = 'Applause'
frequency_sum, frequency_ratio, names, q1, q2 = conf_ratio(rate_applause, col)
dict_freq[col] = frequency_ratio

# The two bar charts differ only in their title.
bar_options = dict(style=LightSolarizedStyle, width=800, height=600, show_legend=False, human_readable=True)
bar_applause_sum = pygal.Bar(title='Applause Total per Month\n(total likes)\nOverall: '+q1, **bar_options)
bar_applause_ratio = pygal.Bar(title='Applause Ratio per Month\n(likes per post)\nOverall: '+q2, **bar_options)

# One bar per month; the final month is left out of both charts.
for month, total, rate in zip(names[:-1], frequency_sum[:-1], frequency_ratio[:-1]):
    bar_applause_sum.add(month, total)
    bar_applause_ratio.add(month, rate)

# Save both charts as SVG files, then display the totals chart.
bar_applause_sum.render_to_file(page_id+'applause_sum.svg')
bar_applause_ratio.render_to_file(page_id+'applause_ratio.svg')

HTML(html_pygal.format(pygal_render=bar_applause_sum.render()).decode('utf-8'))

# Applause Rates

In [None]:
# Display the monthly likes-per-post bar chart inside the HTML wrapper.
HTML(html_pygal.format(pygal_render=bar_applause_ratio.render()).decode('utf-8'))

# Compare Three Ratios

In [None]:
# Work on a private copy of the monthly averages gathered by the chart cells.
d = copy.deepcopy(dict_freq)

c_list, app_list, amp_list = [], [], []
line_dict = {}

for k, v in d.items():
    # Choose the series this metric belongs to; unknown keys are skipped.
    if 'Conversation' in k:
        series = c_list
    elif k == 'Applause':
        series = app_list
    elif 'Ampli' in k:
        series = amp_list
    else:
        continue
    # Anything that is not >= 0 (negative values, and NaN for months without
    # posts) is plotted as 0.0.
    series.extend(float(i) if i >= 0 else float(0) for i in v)
    line_dict[k] = series

e = copy.deepcopy(line_dict)

# The constructor title is replaced immediately; 'Facebook Ratios' is the
# title that ends up on the chart.
line_chart = pygal.Line(style=CleanStyle, width=800, height=600, legend_at_bottom=True, legend_at_bottom_columns=6, human_readable=True, title='Applause Total per Month')
line_chart.title = 'Facebook Ratios'

# One line per metric, covering every month in its series.
for k, v in e.items():
    line_chart.add(k, v)
line_chart.render_to_file(page_id+'three_metrics.svg')

HTML(html_pygal.format(pygal_render=line_chart.render()).decode('utf-8'))