# Introduction
Investigation of bitcoin-focused reddit groups

### Standard Imports

In [2]:
# Basic functions
import os

# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Visualizations
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')

### Specific Imports

In [3]:
import bz2, json
import psaw
from psaw import PushshiftAPI
from copy import deepcopy
import datetime as dt

In [4]:
from tqdm import tqdm_notebook as tqdm, trange

In [None]:
from social_media import 

# Get Data

## Get Subreddits

In [5]:
raw_data_path = '../data/raw'
subreddit_file = 'subreddits_basic.csv'
subreddit_path = os.path.join(raw_data_path, subreddit_file)

In [6]:
def format_cols(df):
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
    return df

def lowercase_cols(df, cols):
    for col in cols:
        df[col] = df[col].str.lower()
    return df

def col_to_datetime(df, col, unit='s'):
    df[col] = pd.to_datetime(df[col], unit=unit)
    return df

def contains_str(df, col, _str, regex=True, match=False):
    col_fmt = df[col].fillna("").str
    if match:
        condition = col_fmt.match(_str)
    else:
        condition = col_fmt.contains(_str, regex=regex)
    df = df.loc[condition]
    return df

def isin_col(df, col, strings):
    str_list = [term.lower() for term in strings]
    condition = df[col].isin(str_list)
    return df.loc[condition]

### Subreddit Sample

In [7]:
subreddit_data = (
    pd.read_csv(subreddit_path)
    .pipe(format_cols)
    .rename(columns={'creation_epoch':'created', 'subreddit_name':'name', 'number_of_subscribers':'subscribers'})
    .pipe(lowercase_cols, ['name'])
    .pipe(col_to_datetime, 'created')
    .set_index('reddit_base36_id')
)

subreddit_data.head()

Unnamed: 0_level_0,base10_id,created,name,subscribers
reddit_base36_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
t5_vf2,40718.0,2006-01-19 19:49:21,nsfw,936786
t5_21n6,95442.0,2006-02-19 06:55:30,features,1396
t5_21nj,95455.0,2006-02-19 07:11:36,request,547
t5_21of,95487.0,2006-02-19 08:02:34,olympics,397323
t5_22i0,96552.0,2006-02-20 06:22:35,de,108579


In [8]:
def create_regex_filter(terms):
    filt = terms[0].lower()
    for term in terms:
        filt += '|' + term.lower() 
    return filt

In [9]:
sr_list = [
    'CryptoMarkets',
    'BitcoinMarkets',
    'CryptoCurrencyTrading',
    'ethtrader',
    'cryptotrading',
    'CryptoCurrency',
    'Bitcoin',
    'Ethereum',
    'BTC',
    'litecointraders',
]

In [10]:
filter_str = create_regex_filter(sr_list)
crypto_subreddits = (
    subreddit_data
    .pipe(isin_col, 'name', sr_list)
    .replace('None', 0)
    .dropna()
    .astype({'subscribers':int})
    .sort_values('subscribers', ascending=False)
    .head(30)
)

## Subreddit Responses

In [11]:
import datetime as dt
from dateutil.relativedelta import relativedelta

In [13]:
relativedelta(months=6, days=0)

relativedelta(months=+6)

In [15]:
str(dt.datetime(2019, 11, 2))

'2019-11-02 00:00:00'

## Get all Subreddit Data and Save to Disk

In [85]:
def get_start_end_date(months, days=0):
    end_date = dt.datetime.utcnow()
    start_date = end_date - relativedelta(months=months, days=days)
    return start_date, end_date

def format_date_str(dt):
    return dt.strftime('%d%b%Y')

def get_cache(subreddit, start_date, end_date):    
    api = PushshiftAPI()
    start_epoch=int(dt.datetime.timestamp(start_date))
    end_epoch=int(dt.datetime.timestamp(end_date))
    work = api.search_comments(
    #     q = "notch",
        subreddit = sr,
        after= start_epoch,
        before= end_epoch
    )
    cache = []
    for c in work:
        cache.append(c)
    
    return cache
        
def get_response_list(cache):
    response_list = []
    for i in trange(len(cache)):
        response_dict = cache[i].d_
        response_list.append(response_dict)
    
    return response_list
    
def get_response_df(response_list):
    keep_cols = [
        'author',
        'author_flair_text',
        'body',
        'created_utc',
        'is_submitter',
        'score',
        'no_follow',
        'id',
        'parent_id'   
    ]
    
    response_df = (
        pd.DataFrame(response_list)
        .filter(keep_cols)
        .pipe(lowercase_cols, ['body'])
#         .pipe(remove_stopwords, 'body')
#         .assign(body = lambda x: x['body'].apply(lambda x: _tokenize(x))) #tokenize

    #     .pipe(remove_punctuation, ['body'])
        .assign(created = lambda x: pd.to_datetime(x['created_utc'], unit='s'))
        .set_index('id')
    )
    
    return response_df

In [86]:
start_date, end_date = get_start_end_date(months=6, days=0)
start_str = format_date_str(start_date)
end_str = format_date_str(end_date)

print(f'Getting data from {start_str} to {end_str}..')

cache_dict = {}
for sr in tqdm(sr_list[:]):
    # Get Data
    cache = get_cache(sr, start_date, end_date)        
    response_list = get_response_list(cache)
    response_df = get_response_df(response_list)
    
    # Save Data
    filename = f'r_{sr} response data {start_str}-{end_str}.csv'
    save_path = os.path.join(raw_data_path, filename)
    response_df.to_csv(save_path)

Getting data from 09May2019 to 09Nov2019..


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

100%|████████████████████████████████| 17427/17427 [00:00<00:00, 792279.65it/s]
100%|██████████████████████████████| 151596/151596 [00:00<00:00, 846716.03it/s]
100%|██████████████████████████████████| 5361/5361 [00:00<00:00, 669993.85it/s]
100%|██████████████████████████████| 166215/166215 [00:00<00:00, 818607.27it/s]
100%|████████████████████████████████████| 539/539 [00:00<00:00, 538525.45it/s]
100%|██████████████████████████████| 558133/558133 [00:00<00:00, 901466.11it/s]
100%|██████████████████████████████| 527558/527558 [00:00<00:00, 864667.17it/s]
100%|████████████████████████████████| 35338/35338 [00:00<00:00, 785112.85it/s]
100%|██████████████████████████████| 226471/226471 [00:00<00:00, 817402.76it/s]
100%|████████████████████████████████████| 455/455 [00:00<00:00, 454382.93it/s]





# Exploratory Data Analysis

## Data Cleaning Imports

In [50]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

## Construct DataFrame of Reponses

In [48]:
import string

def remove_punctuation(df, col):
    punct = string.punctuation.replace('|', '') + r'\n\n' + '”' + '’'
    transtab = str.maketrans(dict.fromkeys(punct, ''))
    
    translated = (
        df[col]
        .str.translate(transtab)
        .replace('”|’', '', regex=True)
    )
    df[col] = translated
    return df

def _tokenize(x, remove_duplicates=True):
    tokenizer = RegexpTokenizer(r'\w+')
    raw_tokens = tokenizer.tokenize(x)
    tokens = set(raw_tokens) if remove_duplicates else raw_tokens
    return list(tokens)

def remove_stopwords2(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

def remove_stopwords(df, col):
    to_remove = stopwords.words('english')
    pat = r'\b(?:{})\b'.format('|'.join(to_remove))
    df[col] = df[col].str.replace(pat, '', regex=True)
    return df

## Concentration of Author Postings

In [111]:
import sys
sys.executable

'C:\\Users\\brian\\Anaconda3\\envs\\social_media\\python.exe'

In [80]:
import ast

sr = sr_list[0]

filename = f'r_{sr} response data {start_str}-{end_str}.csv'
save_path = os.path.join(raw_data_path, filename)
df = pd.read_csv(save_path, index_col=0, converters={'body':ast.literal_eval})

In [109]:
author_counts = (
    df
    .query('author != "[deleted]"')['author']
    .value_counts()
    .pipe(lambda x: x[x>=6])
)
author_counts_norm = author_counts / len(author_counts)

In [103]:
def hhi(array):
    return np.square(array).sum()

In [110]:
hhi(author_counts_norm)

0.6414000000000001

## Popular words

In [84]:
flattened = pd.Series([item for sublist in df['body'] for item in sublist])
flattened.value_counts().iplot(kind='bar')

# Sentiment Analysis