In [None]:
import pandas as pd
import numpy as np
import statistics as stat
import requests
import csv
import re
import nltk
import time
import warnings
warnings.filterwarnings('ignore')

from lxml import html
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#Fetch the 'vader_lexicon' resource through nltk; this only needs to be downloaded once if you never have before
nltk.download('vader_lexicon')


%matplotlib inline

This code looks into the site data for Lobste.rs and adds the features of engagement score and comment
sentiment to get a better idea of the users who are most active and produce the most engagement and
positivity on the whole. These features will be combined with survey data to create profiles of certain types of
users on the Lobste.rs site.

In [None]:
#The dataset is not included within the repository; please make sure you have it at this path before running
df = pd.read_csv(
    '../uf-data-analytics-project/lobsters_full_2017_cleaned.csv',
    index_col='Unnamed: 0',
)

The engagement score metric is a combination of all major actions that a user can take in response to a post. We chose to include all responses, negative and positive, in order to identify those who will bring in the most users who are likely to respond to calls to action placed by marketers on
the site.

In [None]:
#add engagement score feature: upvotes, downvotes and comment count added together for each post
df["engagement_score"] = (
    df["upvotes"]
    .add(df["downvotes"])
    .add(df["comment_count"])
)

In [None]:
#save original df
#take a real copy: a plain `og_df = df` only creates a second name for the same object,
#so any later in-place change to df would silently change the "saved" frame as well
og_df = df.copy()

In [None]:
#preview the first rows of the dataframe, including the new engagement_score column
df.head()

In [None]:
#remove all rows without comments to speed up the sentiment analysis and because we want to look at popular posts
#this will not affect the other features
has_no_comments = df['comment_count'] < 1
df = df.drop(df.loc[has_no_comments].index, axis=0)

Functions

We decided to use comment sentiment score as our other feature in order to identify those who create 
positive discussion on the site, something that is more valuable in the eyes of advertising firms. 
These two features can, and most likely will, overlap in some users; those users will be the most profitable to cultivate and study more closely.

In [None]:
def remove_html_tags(text):
    """Removes html tags from a string.

    Everything from a ``<`` up to the nearest following ``>`` is deleted.
    The pattern is applied with ``re.DOTALL`` so that a tag whose attributes
    are wrapped over several lines is removed too; without that flag ``.``
    stops at a newline and such tags would be left in the text.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every ``<...>`` span removed.
    """
    return re.sub(r'<.*?>', '', text, flags=re.DOTALL)

In [None]:
#one analyzer shared by every call, created lazily the first time it is needed
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, building it on first use."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def sentiment_comment_text(url):
    """Takes Lobste.rs URL and returns text and sentiment score of post comments.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pandas.Series
        Two values, in this order: the text of every ``<p>`` element on the
        page joined with single spaces, and the ``compound`` value of the
        analyzer's polarity scores for that text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page is
        never scored as if it were comment text.
    """
    page = requests.get(url, timeout=100)
    page.raise_for_status()
    #brew the soup; name the parser explicitly so the result does not depend on
    #which parsers happen to be installed (lxml is already imported by this notebook)
    soup = BeautifulSoup(page.text, 'lxml')
    #every <p> element is treated as comment text
    #NOTE(review): this also picks up any non-comment <p> on the page -- confirm against the site markup
    paragraphs = soup.find_all('p')
    #get_text() drops the tags and decodes entities; joining the paragraphs avoids the
    #"[", "]" and ", " artifacts that str() of the whole result list would leave behind
    comment_text = ' '.join(paragraph.get_text() for paragraph in paragraphs)
    #run the sentiment analysis
    sentiment_score = _get_sentiment_analyzer().polarity_scores(comment_text)['compound']
    return pd.Series((comment_text, sentiment_score))

In [None]:
#Applying Functions
#time the whole pass over the dataframe
start = time.time()
#each call returns (comment_text, comment_sentiment) for one comments_url
scraped = df['comments_url'].apply(sentiment_comment_text)
#adding the comment_text and comment_sentiment columns to the dataframe
df[['comment_text','comment_sentiment']] = scraped
end = time.time()
elapsed = end - start
print(elapsed, 'seconds for cell to run')

Stats

In [None]:
#grouping the data by username and summing the engagement scores will highlight the whales
total_engagement_score_by_usn = df.groupby('username')['engagement_score'].sum()

In [None]:
#grouping by comment sentiment shows those who have produces the mose positive comments on the site
total_comment_sentiment = df.groupby('username')['comment_sentiment']

In [None]:
#count the total posts by each username, dropping those who have not posted often 
total_posts_by_usn = df.groupby('username')['engagement_score'].count()
df= total_posts_by_usn
total_posts_by_usn_ = df.drop(df[df < 10].index)

In [None]:
#Find average post engagement per username, if they have posted 10 or more times
avg_engagement_by_user = total_engagement_score_by_usn / total_posts_by_usn
avg_engagement_by_user = avg_engagement_by_user.dropna()

In [None]:
#Find average commentiment per post per username if they have posted 10 or more times
avg_postsent_by_user = total_comment_sentiment / total_posts_by_usn
avg_postsent_by_user = avg_postsent_by_user.dropna()

In [None]:
#df = og_df

In [None]:
#find the standard deviation of the engagement score
stdev_avg_engagement = stat.stdev(avg_engagement_by_user)
print(stdev_avg_engagement)

In [None]:
#find the standard deviation of the comment sentiment
stdev_comsent = stat.stdev(avg_postsent_by_user)
print(stdev_comsent)

In [None]:
#mean of the engagement score
mean_avg_engagement = stat.mean(avg_engagement_by_user)
print(mean_avg_engagement)

In [None]:
#mean of the comment sentiment
mean_comsent = stat.mean(avg_postsent_by_user)
print(mean_comsent)

In [None]:
#find engagement score standard deviation levels
one_eng_stdev = stdev_avg_engagement + mean_avg_engagement
two_eng_stdev = (stdev_avg_engagement*2) + mean_avg_engagement
three_eng_stdev = (stdev_avg_engagement*3) + mean_avg_engagement

In [None]:
stdev1df = df.drop(df[df < one_eng_stdev].index)
stdev2df = df.drop(df[df < two_eng_stdev].index)
stdev3df = df.drop(df[df < three_eng_stdev].index)

In [None]:
#find comment sentiment standard deviation levels
one_comsent_stdev = stdev_comsent +mean_comsent
two_comsent_stdev = (stdev_comsent*2) + mean_comsent
three_comsent_stdev = (stdev_comsent*3) + mean_comsent
stdev1dfcs = df.drop(df[df < one_comsent_stdev].index)
stdev2dfcs = df.drop(df[df < two_comsent_stdev].index)
stdev3dfcs = df.drop(df[df < three_comsent_stdev].index)

In [None]:
#the correlation between our two created featuers
#there is some positive correlaiton but it is not strong
df['engagement_score'].corr(df['comment_sentiment'])

User Data: Usernames and scores for each division for analysis

In [None]:
#total engagement score per username
print(total_engagement_score_by_usn)

In [None]:
#total comment sentiment per username
print(total_comment_sentiment)

In [None]:
#number of posts by username
print(total_posts_by_usn)

In [None]:
#average post engagement per username
print(avg_engagement_by_user)

In [None]:
#average post sentiment by user for use in determining posters that create most positive community reaction
print(avg_postsent_by_user)

In [None]:
#Usernames 1 or more standard deviations from the average post engagement score mean
print(stdev1df)

In [None]:
#Usernames 2 or more standard deviations from the average post engagement score mean
print(stdev2df)

In [None]:
#Usernames 3 or more standard deviations from the average post engagement score mean
print(stdev3df)

In [None]:
#Usernames 1 or more standard deviations from the average post comment sentiment mean
print(stdev1dfcs)

In [None]:
#Usernames 2 or more standard deviations from the average post comment sentiment mean
print(stdev2dfcs)

In [None]:
#Usernames 3 or more standard deviations from the average post comment sentiment mean
print(stdev3dfcs)

Visualizations

These visualizations give us good insight into the makeup of the users and those who produce the most popular posts. You can see that a small section of targeted users makes up most of the posting and has the highest sentiment scores.

In [None]:
#engagement score for all posts
df['engagement_score'].hist()

In [None]:
#total comment sentiment by username
total_comment_sentiment.hist()

In [None]:
#total engagement score by username 
total_engagement_score_by_usn.hist()

In [None]:
#total posts in dataset by username
total_posts_by_usn.hist()

In [None]:
#average post engagement score by username
avg_engagement_by_user.hist()

In [None]:
#average post comment sentiment by username
avg_postsent_by_user.hist()

In [None]:
#average post engagement scores by username 1 or more standard deviations above the mean
stdev1df.hist()

In [None]:
#average post engagement scores by username 2 or more standard deviations above the mean
stdev2df.hist()

In [None]:
#average post engagement scores by username 3 or more standard deviations above the mean
stdev3df.hist()

In [None]:
#average post comment sentiment by username 1 or more standard deviations above the mean
stdev1dfcs.hist()

In [None]:
#average post comment sentiment by username 2 or more standard deviations above the mean
stdev1dfcs.hist()

In [None]:
#average post comment sentiment by username 3 or more standard deviations above the mean
stdev1dfcs.hist()