In [1]:
# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline

In [2]:
import os
import json
import gzip
import datetime
from functools import partial
from dateutil import parser, tz

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects (used for the Sentiment140 date parsing).
tqdm.pandas()
# Default size applied to every figure created in this notebook.
plt.rcParams['figure.figsize'] = (12, 8)

# Stanford Sentiment Treebank v2

https://www.kaggle.com/atulanandjha/stanford-sentiment-treebank-v2-sst2

In [None]:
# Phrase dictionary: pipe-separated, no header row, so the two columns are named here.
text = (
    pd.read_csv("../data/sst2/dictionary.txt", sep='|', header=None)
    .set_axis(['text', 'id'], axis='columns')
)
text.head()

In [None]:
# Distribution of phrase lengths. `.str.len()` passes missing values through as NaN,
# whereas `apply(len)` raises a TypeError on them.
text['text'].str.len().plot.hist(bins=30)
plt.xlabel('Text Length (characters)')
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appends a stray "None" line.
text.info()
plt.show()

In [None]:
# Sentiment labels: pipe-separated; the first line is consumed as the header and
# the two columns are then renamed.
labels = (
    pd.read_csv("../data/sst2/sentiment_labels.txt", sep='|')
    .set_axis(['id', 'sentiment'], axis='columns')
)
labels.head()

In [None]:
# Distribution of the continuous sentiment score.
labels['sentiment'].plot.hist(bins=30)
plt.xlabel('Sentiment (0-1)')
# info() prints directly and returns None; print(...) would add a stray "None".
labels.info()
plt.show()

In [None]:
# Attach each phrase's sentiment score via the shared phrase id.
sst2 = text.merge(labels, how='inner', on='id')
# info() prints directly and returns None; print(...) would add a stray "None".
sst2.info()
sst2.head()

In [None]:
# Top: mean sentiment at each phrase length with a fitted regression line.
# Bottom: number of phrases observed at each length.
fig, (ax1, ax2) = plt.subplots(2, 1)
experiment = sst2.copy()
experiment['len'] = experiment['text'].str.len()
experiment = experiment.groupby('len')

text_reg = experiment['sentiment'].mean()
# GroupBy.size() returns the row count per group directly; apply(len) invokes a
# Python callable once per group to obtain the same numbers.
text_count = experiment.size()

sns.regplot(x=text_reg.index, y=text_reg, ax=ax1)
ax1.set_title('Linear Regression of Text Length against Sentiment')
ax1.set_xlabel('')

ax2.plot(text_count.index, text_count)
ax2.set_title('Frequency of samples at each length')
ax2.set_xlabel('Text Length')
ax2.set_ylabel('Frequency')
plt.show()

# Drop the scratch objects so they do not linger in the kernel.
del experiment, text_reg, text_count

In [None]:
# For every bin count k in 2..10, add a "<k>lab" column holding the integer code
# of the sentiment bin (pd.cut with k bins) each phrase falls into.
for n_bins in range(2, 11):
    binned = pd.cut(sst2['sentiment'], n_bins)
    sst2[f"{n_bins}lab"] = binned.cat.codes
# Index and order the frame by phrase id.
sst2 = sst2.set_index('id').sort_index()

In [None]:
# Persist the processed SST-2 frame (the id index is written as the first column),
# then release it from kernel memory.
sst2.to_csv('../data/sst2/sst2_2_10.csv')
del sst2

# Sentiment140

https://www.kaggle.com/kazanova/sentiment140

In [None]:
# Load the headerless Sentiment140 CSV, keeping columns 0, 1, 2, 4 and 5.
sentiment140 = pd.read_csv('../data/sentiment140/sentiment140.csv', encoding='latin-1', header=None, usecols=[0, 1, 2, 4, 5])
sentiment140.columns = ['sentiment', 'id', 'date', 'author', 'text']

# Map the 'PDT' abbreviation to a real zone. The IANA key is spelled with an
# underscore; the space-separated spelling is not a valid key and may fail to
# resolve on some platforms, in which case gettz() returns None and the
# timestamps are silently parsed as naive datetimes.
date_parser = partial(parser.parse, tzinfos={'PDT': tz.gettz('America/Los_Angeles')})
sentiment140['date'] = sentiment140['date'].progress_apply(date_parser)
sentiment140['sentiment'] = sentiment140['sentiment'].astype(int).values >> 2  # Convert [0, 4] label to [0, 1]
sentiment140.set_index('id', inplace=True)
sentiment140.sort_index(inplace=True)
# info() prints directly and returns None; print(...) would add a stray "None".
sentiment140.info()
sentiment140.head()

In [None]:
# 2x2 overview: text-length histogram, tweets per day, mean sentiment per text
# length (with regression line) and mean sentiment per day.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
experiment = sentiment140.copy()
experiment['day'] = experiment['date'].dt.date
experiment['len'] = experiment['text'].str.len()
grouped = experiment.groupby('day')

experiment['len'].plot.hist(bins=30, ax=ax1)
# GroupBy.size() yields the per-day row count directly, replacing the per-group
# Python call made by apply(len).
grouped.size().plot(ax=ax2)
grouped['sentiment'].mean().plot(ax=ax4)
text_reg = experiment.groupby('len')['sentiment'].mean()
sns.regplot(x=text_reg.index, y=text_reg, ax=ax3)

ax1.set_title('Histogram of Text Lengths')
ax1.set_xlabel('Text Length')
ax2.set_title('Number of Tweets per Day')
ax2.set_xlabel('Date')
ax2.set_ylabel('Frequency')
ax3.set_title('Linear Regression of Text Length against Sentiment')
ax3.set_xlabel('Text Length')
ax3.set_ylabel('Sentiment')
ax4.set_title('Average Sentiment per Day')
ax4.set_xlabel('Date')
ax4.set_ylabel('Sentiment')
# Stretch the subplot area beyond the default figure bounds to space the panels out.
plt.subplots_adjust(top=1.2, right=1.5)
plt.show()

# Drop every scratch object created by this cell, including text_reg.
del experiment, grouped, text_reg

In [None]:
# Write the binary-labelled frame using the same latin-1 encoding it was read
# with, then release it from kernel memory.
sentiment140.to_csv('../data/sentiment140/sentiment140_binary.csv', encoding='latin-1')
del sentiment140

# Amazon Product Reviews (5-core)

https://nijianmo.github.io/amazon/index.html

* We can use the product rating as a proxy for the sentiment of the review
* Since this dataset is much larger than I need/could possibly use, I was pretty strict with cleaning
   * Must be verified purchases
   * Removed duplicates on reviewerID and productID

In [None]:
def parse(path):
    """Lazily yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file containing one JSON document per line.

    Yields:
        The object produced by ``json.loads`` for each line, in file order.
    """
    # The context manager closes the gzip handle when the generator is
    # exhausted or closed; the bare open() leaked it.
    with gzip.open(path, 'r') as g:
        for l in g:
            yield json.loads(l)

def clean(df):
    """Filter and normalise a frame of raw Amazon review records.

    Steps, in order: keep rows whose ``verified`` flag is true, drop repeated
    (reviewerID, asin) pairs, derive ``date`` from ``unixReviewTime`` (seconds),
    and drop rows missing any of reviewerID / asin / overall / reviewText.

    Args:
        df: DataFrame holding at least the columns referenced above plus
            reviewerName, summary and vote.

    Returns:
        A new DataFrame restricted to the eight output columns. The input
        frame is not modified.

    Raises:
        KeyError: If a required column is absent from ``df``.
    """
    output_cols = ['date', 'reviewerID', 'asin', 'reviewerName', 'overall', 'reviewText', 'summary', 'vote']
    # Every step returns a new frame, so nothing is assigned into a filtered
    # slice (the source of SettingWithCopyWarning in the in-place version).
    cleaned = (
        df[df['verified']]
        .drop_duplicates(subset=['reviewerID', 'asin'])
        .assign(date=lambda frame: pd.to_datetime(frame['unixReviewTime'], unit='s'))
        .dropna(subset=['reviewerID', 'asin', 'overall', 'reviewText'])
    )
    return cleaned[output_cols]


# Fields whose combined values identify a single review record.
UNIQUE_COLS = ('reviewerID', 'asin')

def getDF(path):
    """Read a gzip JSON-lines review file into a cleaned DataFrame.

    Records are de-duplicated on ``UNIQUE_COLS`` while streaming so repeated
    reviews never accumulate in memory; records lacking any of those fields
    are skipped. The surviving records are passed through ``clean``.
    """
    seen_keys = set()
    records = []
    for record in tqdm(parse(path)):
        try:
            key = tuple(record[col] for col in UNIQUE_COLS)
        except KeyError:
            # A record without the identifying fields cannot be de-duplicated; skip it.
            continue
        if key not in seen_keys:
            seen_keys.add(key)
            records.append(record)
    print('data loaded')
    frame = pd.DataFrame(records)
    print('dataframe created\ncleaning...')
    return clean(frame)

def get_df_chunks(path, limit=3000000):
    """Split a large gzip JSON-lines review file into several cleaned CSVs.

    Records are buffered until ``limit`` of them have been read, then cleaned
    and written to ``<prefix>_<n>.csv``, where ``<prefix>`` is ``path`` without
    its last eight characters (the ``.json.gz`` suffix). This keeps peak memory
    bounded by one chunk.

    Args:
        path: Path to the ``.json.gz`` input file.
        limit: Maximum number of raw records per chunk.

    Returns:
        The number of chunk files written (0 when the input has no records).
    """
    path_template = path[:-8]  # strip the 8-character '.json.gz' suffix
    chunk = 0
    data = []

    def _flush(records, number):
        # Clean one buffered batch and write it out as chunk `number`.
        clean(pd.DataFrame(records)).to_csv(f"{path_template}_{number}.csv", index=False)

    for d in tqdm(parse(path)):
        data.append(d)
        if len(data) >= limit:
            chunk += 1
            print(f'saving chunk {chunk}')
            _flush(data, chunk)
            data = []
    # Only flush a non-empty remainder: an empty DataFrame has no 'verified'
    # column, so clean() would raise KeyError whenever the record count is an
    # exact multiple of `limit` (or the file is empty).
    if data:
        chunk += 1
        _flush(data, chunk)
    return chunk

In [None]:
# Convert every gzip review dump in the Amazon folder into a cleaned CSV that
# sits next to it and is named after the part of the filename before the first dot.
for file in os.listdir('../data/amazon'):
    if not file.endswith('.json.gz'):
        continue
    print(file)
    df = getDF(os.path.join('../data/amazon', file))
    print(df.shape)
    stem = file.split('.')[0]
    df.to_csv(os.path.join('../data/amazon', f"{stem}.csv"), index=False)

In [None]:
# The books dump is processed in chunks (books_<n>.csv) instead of in one pass;
# the cell output is the value returned by get_df_chunks.
get_df_chunks('../data/amazon/books.json.gz')

In [None]:
# Recombine the per-chunk book CSVs. Each chunk was de-duplicated on its own,
# so duplicates spanning chunks are removed here before the dates are parsed.
files = glob('../data/amazon/books_*.csv')
df = (
    pd.concat([pd.read_csv(file) for file in tqdm(files)])
    .drop_duplicates(subset=['reviewerID', 'asin'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    [['date', 'reviewerID', 'asin', 'reviewerName', 'overall', 'reviewText', 'summary', 'vote']]
)

In [None]:
# Summarise dtypes, non-null counts and memory use of the merged books frame.
df.info()

In [None]:
# Write the merged, de-duplicated book reviews to a single CSV without the index.
df.to_csv('../data/amazon/books.csv', index=False)

# IMDB Movie Reviews Dataset

In [None]:
imdb = pd.read_csv('../data/imdb/imdb.csv')
# Encode the textual label as 1 for 'positive' and 0 for anything else.
imdb['sentiment'] = (imdb['sentiment'] == 'positive').astype(np.int8)
# info() prints directly and returns None; print(...) would add a stray "None".
imdb.info()
imdb.head()

In [None]:
# Top: mean sentiment at each review length with a fitted regression line.
# Bottom: number of reviews observed at each length.
fig, (ax1, ax2) = plt.subplots(2, 1)
experiment = imdb.copy()
experiment['len'] = experiment['review'].str.len()
experiment = experiment.groupby('len')

text_reg = experiment['sentiment'].mean()
# GroupBy.size() returns the row count per group directly; apply(len) invokes a
# Python callable once per group to obtain the same numbers.
text_count = experiment.size()

sns.regplot(x=text_reg.index, y=text_reg, ax=ax1)
ax1.set_title('Linear Regression of Text Length against Sentiment')
ax1.set_xlabel('')

ax2.plot(text_count.index, text_count)
ax2.set_title('Frequency of samples at each length')
ax2.set_xlabel('Text Length')
ax2.set_ylabel('Frequency')
plt.show()

# Drop the scratch objects so they do not linger in the kernel.
del experiment, text_reg, text_count

In [None]:
# Persist the binary-labelled IMDB frame without the index, then release it.
imdb.to_csv('../data/imdb/imdb_binary.csv', index=False)
del imdb

# Twitter US Airline Sentiment

https://www.kaggle.com/crowdflower/twitter-airline-sentiment

Not good for this project, only gives text (10 preset choices) for negative responses. Would lead to sampling bias and the preselected responses are too short.

In [None]:
tweets = pd.read_csv('../data/twitter_airlines/tweets.csv')
# info() prints directly and returns None; print(...) would add a stray "None".
tweets.info()
tweets.head()

In [None]:
# List the distinct values recorded in the 'negativereason' column.
tweets['negativereason'].unique()

# Bag of Words Meets Popcorn Dataset

https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [None]:
popcorn_train = pd.read_csv('../data/popcorn/train.csv')
popcorn_test = pd.read_csv('../data/popcorn/test.csv')
# info() prints directly and returns None; print(...) would add a stray "None"
# after each report.
popcorn_train.info()
popcorn_test.info()
popcorn_train.head()

In [None]:
# Top: mean sentiment at each review length with a fitted regression line.
# Bottom: number of reviews observed at each length.
fig, (ax1, ax2) = plt.subplots(2, 1)
experiment = popcorn_train.copy()
experiment['len'] = experiment['review'].str.len()
experiment = experiment.groupby('len')

text_reg = experiment['sentiment'].mean()
# GroupBy.size() returns the row count per group directly; apply(len) invokes a
# Python callable once per group to obtain the same numbers.
text_count = experiment.size()

sns.regplot(x=text_reg.index, y=text_reg, ax=ax1)
ax1.set_title('Linear Regression of Text Length against Sentiment')
ax1.set_xlabel('')

ax2.plot(text_count.index, text_count)
ax2.set_title('Frequency of samples at each length')
ax2.set_xlabel('Text Length')
ax2.set_ylabel('Frequency')
plt.show()

# Drop the scratch objects so they do not linger in the kernel.
del experiment, text_reg, text_count

In [None]:
# Reload the saved IMDB export (the in-memory frame was deleted earlier) so it
# can be compared against the popcorn reviews.
imdb = pd.read_csv('../data/imdb/imdb_binary.csv')
imdb.head()

In [None]:
# Count popcorn training reviews whose text matches an IMDB review exactly.
popcorn_train['review'].isin(imdb['review']).sum()

There is huge **exact** overlap with the imdb dataset so I'm going to assume most if not all of this is duplicated from that dataset.

# OpinRank hotel and car reviews

http://kavita-ganesan.com/entity-ranking-data/#.W4jjE5MzbUJ

In [None]:
def _get_csvs(path):
    for a, b, c, in os.walk(path):
        for i in c:
            if i.endswith('.csv'):
                yield os.path.join(a, i)

# Peek at the first CSV found (if any) to see what the OpinRank files contain.
first_csv = next(_get_csvs('../data/opinrank'), None)
if first_csv is not None:
    print(pd.read_csv(first_csv).head())

**Also unusable for this project since it doesn't really contain text**

# Yelp NYC Dataset

359k Yelp reviews for restaurants in NYC

http://odds.cs.stonybrook.edu/yelpnyc-dataset/

https://www.kaggle.com/ahtxham/yelpnyc-labelled-dataset-from-shebuti

In [None]:
reviews = pd.read_csv('../data/yelpnyc/yelp.csv')
ratings = pd.read_csv('../data/yelpnyc/yelp_meta.csv')
# info() prints directly and returns None; print(...) would add a stray "None"
# after each report.
reviews.info()
ratings.info()
reviews.head()

In [None]:
# Join review text with its metadata, then drop the duplicated date column the
# merge produces from the metadata side.
yelpnyc = (
    reviews
    .merge(ratings, how='inner', on=['Review_id', 'Product_id'])
    .drop('Review_Date_y', axis=1)
)
# Columns are renamed by position — assumes the merged frame has exactly these
# six columns in this order.
yelpnyc.columns = ['review_id', 'product_id', 'date', 'text', 'rating', 'label']
yelpnyc = yelpnyc.assign(
    # Strip the mis-decoded non-breaking-space sequence from the review text.
    text=yelpnyc['text'].str.replace("Â\xa0", ''),
    date=pd.to_datetime(yelpnyc['date']),
    label=((yelpnyc['label'] + 1) / 2).astype(np.int8),  # [-1, 1] label to [0, 1]
)
yelpnyc.info()
yelpnyc.head()

In [None]:
# 2x2 overview: text-length histogram, reviews per day, mean label per text
# length (with regression line) and mean label per day.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
experiment = yelpnyc.copy()
experiment['day'] = experiment['date'].dt.date
experiment['len'] = experiment['text'].str.len()
grouped = experiment.groupby('day')

experiment['len'].plot.hist(bins=30, ax=ax1)
# GroupBy.size() yields the per-day row count directly, replacing the per-group
# Python call made by apply(len).
grouped.size().plot(ax=ax2)
grouped['label'].mean().plot(ax=ax4)
text_reg = experiment.groupby('len')['label'].mean()
sns.regplot(x=text_reg.index, y=text_reg, ax=ax3)

ax1.set_title('Histogram of Text Lengths')
ax1.set_xlabel('Text Length')
ax2.set_title('Number of Reviews per Day')
ax2.set_xlabel('Date')
ax2.set_ylabel('Frequency')
ax3.set_title('Linear Regression of Text Length against Sentiment')
ax3.set_xlabel('Text Length')
ax3.set_ylabel('Sentiment')
ax4.set_title('Average Sentiment per Day')
ax4.set_xlabel('Date')
ax4.set_ylabel('Sentiment')
# Stretch the subplot area beyond the default figure bounds to space the panels out.
plt.subplots_adjust(top=1.2, right=1.5)
plt.show()

# Drop every scratch object created by this cell, including text_reg.
del experiment, grouped, text_reg

In [None]:
# Persist the binary-labelled Yelp NYC frame without the index, then release it.
yelpnyc.to_csv('../data/yelpnyc/yelpnyc_binary.csv', index=False)
del yelpnyc

In [5]:
import wget

In [12]:
# Scrape the Pushshift daily-comments directory page; the first HTML table on
# it is the file listing (Filename, Type, Size, Date Modified).
foo = pd.read_html('https://files.pushshift.io/reddit/comments/daily/')[0]
foo

Unnamed: 0,Filename,Type,Size (bytes),Date Modified
0,RC_2018-01-01.xz,LZMA2 Compressed Reddit Comments (JSON objects),201491588,Feb 13 2018 1:52 AM
1,RC_2018-01-02.xz,LZMA2 Compressed Reddit Comments (JSON objects),257462668,Feb 13 2018 1:52 AM
2,RC_2018-01-03.xz,LZMA2 Compressed Reddit Comments (JSON objects),272128832,Feb 13 2018 1:52 AM
3,RC_2018-01-04.xz,LZMA2 Compressed Reddit Comments (JSON objects),263591508,Feb 13 2018 1:53 AM
4,RC_2018-01-05.xz,LZMA2 Compressed Reddit Comments (JSON objects),276650320,Feb 13 2018 1:53 AM
...,...,...,...,...
193,RC_2020-04-16.gz,Reddit Comments (JSON objects),1192552544,Sep 1 2020 10:08 AM
194,RC_2020-04-17.gz,Reddit Comments (JSON objects),1163581603,Sep 1 2020 10:09 AM
195,RC_2020-04-18.gz,Reddit Comments (JSON objects),1098207690,Sep 1 2020 10:09 AM
196,par_recovery,<Directory> File,<Directory>,Mar 17 2018 8:06 PM


In [19]:
import bz2
import json
import lzma
import gzip
import zstandard
import io

In [22]:
# Fields copied from every accepted Reddit comment into the output table.
COLS = ['id', 'created_utc', 'retrieved_on', 'subreddit', 'author', 'body']

def valid_line(line, subreddits):
    """Return True when a decoded comment should be kept.

    A comment qualifies when it belongs to one of ``subreddits``, has a body
    other than the '[deleted]' / '[removed]' placeholders, and carries the
    created_utc, id, retrieved_on and author fields.
    """
    in_scope = 'subreddit' in line and line['subreddit'] in subreddits
    has_text = 'body' in line and line['body'] not in ('[deleted]', '[removed]')
    is_complete = all(key in line for key in ('created_utc', 'id', 'retrieved_on', 'author'))
    return in_scope and has_text and is_complete

def clean_reddit(path, subreddits=('investing', 'stocks', 'wallstreetbets'), cols=COLS):
    """Filter a gzip-compressed Reddit comment dump down to selected subreddits.

    Every line is decoded as JSON, kept when ``valid_line`` accepts it, reduced
    to ``cols`` and de-duplicated on (id, subreddit). A non-empty result is
    written as CSV next to the input, named after the lower-cased part of the
    filename before its first dot.

    Args:
        path: Path to the gzip file with one JSON comment per line.
        subreddits: Subreddit names to keep.
        cols: Comment fields to retain, in output column order.

    Returns:
        The filtered DataFrame (empty, but with the ``cols`` columns, when no
        comment matched).
    """
    # os.path keeps the output beside the input even for a bare filename; the
    # manual '/' splitting produced '/<name>.csv' (filesystem root) in that case.
    directory, filename = os.path.split(path)
    out_path = os.path.join(directory, filename.split('.')[0].lower() + '.csv')
    print(f"Loading file: {path}")
    data = []
    with gzip.open(path) as f:
        for line in tqdm(f):
            d = json.loads(line)
            if valid_line(d, subreddits):
                data.append({k: d[k] for k in cols})
    # Naming the columns explicitly keeps them present when nothing matched;
    # otherwise drop_duplicates(subset=...) raises KeyError on the column-less
    # frame and the "No valid rows" branch below is never reached.
    df = pd.DataFrame(data, columns=list(cols))
    df = df.drop_duplicates(subset=['id', 'subreddit'])
    if df.empty:
        print(f'No valid rows found, not saving file: {path}')
    else:
        print(df.shape)
        print(f"saving to {out_path}")
        df.to_csv(out_path)
    return df

def clean_reddit_zst(path, subreddits=('investing', 'stocks', 'wallstreetbets'), cols=COLS):
    """Filter a Zstandard-compressed Reddit comment dump down to selected subreddits.

    Every line is decoded as JSON, kept when ``valid_line`` accepts it, reduced
    to ``cols`` and de-duplicated on (id, subreddit). A non-empty result is
    written as CSV next to the input, named after the lower-cased part of the
    filename before its first dot.

    Args:
        path: Path to the .zst file with one UTF-8 JSON comment per line.
        subreddits: Subreddit names to keep.
        cols: Comment fields to retain, in output column order.

    Returns:
        The filtered DataFrame (empty, but with the ``cols`` columns, when no
        comment matched).
    """
    # os.path keeps the output beside the input even for a bare filename; the
    # manual '/' splitting produced '/<name>.csv' (filesystem root) in that case.
    directory, filename = os.path.split(path)
    out_path = os.path.join(directory, filename.split('.')[0].lower() + '.csv')
    print(f"Loading file: {path}")
    data = []
    with open(path, 'rb') as f:
        dctx = zstandard.ZstdDecompressor()
        stream_reader = dctx.stream_reader(f)
        # Closing the text wrapper also closes the decompression reader it wraps,
        # so both are released even if a line fails to parse.
        with io.TextIOWrapper(stream_reader, encoding='utf-8') as text_stream:
            for line in tqdm(text_stream):
                d = json.loads(line)
                if valid_line(d, subreddits):
                    data.append({k: d[k] for k in cols})
    # Naming the columns explicitly keeps them present when nothing matched;
    # otherwise drop_duplicates(subset=...) raises KeyError on the column-less
    # frame and the "No valid rows" branch below is never reached.
    df = pd.DataFrame(data, columns=list(cols))
    df = df.drop_duplicates(subset=['id', 'subreddit'])
    if df.empty:
        print(f'No valid rows found, not saving file: {path}')
    else:
        print(df.shape)
        print(f"saving to {out_path}")
        df.to_csv(out_path)
    return df

In [3]:
push_dir = '../scraped_data/pushshift/'

# NOTE(review): the [87:-2] slice is a hard-coded window into the sorted listing;
# presumably it selects the gzip dumps clean_reddit can read — confirm before re-running.
for file in foo['Filename'].sort_values().values[87:-2]:
    file_path = os.path.join(push_dir, file)
    if not os.path.exists(file_path):
        file_url = "https://files.pushshift.io/reddit/comments/daily/" + file
        print(f"Downloading {file_url}...")
        print(f"File downloaded {wget.download(file_url, out=push_dir)}")
    print("Cleaning file...")
    # info() prints its report immediately and returns None, so embedding it in
    # an f-string showed the report first, followed by "File cleaned:\nNone".
    cleaned = clean_reddit(file_path)
    print("File cleaned:")
    cleaned.info()
    del cleaned
    # The raw dump is no longer needed once its filtered CSV has been written.
    print(f"Deleting file: {file_path}")
    os.remove(file_path)

In [3]:
# Stack every CSV found directly inside the Pushshift folder into one frame.
push_dir = '../scraped_data/pushshift/'
merged = pd.concat([pd.read_csv(os.path.join(push_dir, i)) for i in os.listdir(push_dir) if i.endswith('.csv')])
merged.info()

  if (await self.run_code(code, result,  async_=asy)):


<class 'pandas.core.frame.DataFrame'>
Int64Index: 14666703 entries, 0 to 17553
Data columns (total 7 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Unnamed: 0    object 
 1   id            object 
 2   created_utc   float64
 3   retrieved_on  float64
 4   subreddit     object 
 5   author        object 
 6   body          object 
dtypes: float64(2), object(5)
memory usage: 895.2+ MB


In [4]:
# Drop incomplete rows and the 'Unnamed: 0' column (presumably the index column
# saved with each per-file CSV), convert both epoch-second columns to
# datetimes, and order the comments chronologically.
merged.dropna(inplace=True)
merged.drop('Unnamed: 0', axis=1, inplace=True)
merged['created_utc'] = pd.to_datetime(merged['created_utc'], unit='s')
merged['retrieved_on'] = pd.to_datetime(merged['retrieved_on'], unit='s')
merged.sort_values('created_utc', inplace=True)
# info() prints directly and returns None; print(...) would add a stray "None".
merged.info()
merged

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14666328 entries, 0 to 102545
Data columns (total 6 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   created_utc   datetime64[ns]
 2   retrieved_on  datetime64[ns]
 3   subreddit     object        
 4   author        object        
 5   body          object        
dtypes: datetime64[ns](2), object(4)
memory usage: 783.3+ MB
None


Unnamed: 0,id,created_utc,retrieved_on,subreddit,author,body
0,c6ulr69,2012-11-01 00:06:35,2015-04-29 06:24:26,investing,clituna,Check Craigslist. That's where we sell a lot o...
1,c6uls3f,2012-11-01 00:08:29,2015-04-29 06:24:38,investing,Is_this_thing_on,I couldn't tell you. Their numbers look pretty...
2,c6ulsz0,2012-11-01 00:10:25,2015-04-29 06:24:48,investing,kage860,Short : AMZN\n\nRational: Rich Valuation
3,c6uluh1,2012-11-01 00:13:35,2015-04-29 06:25:08,investing,yobria,If the market thought X was a shitty investmen...
4,c6ulw23,2012-11-01 00:17:01,2015-06-30 14:09:56,investing,Is_this_thing_on,Don't totally discount game publishers though....
...,...,...,...,...,...,...
102543,fnu4ipt,2020-04-18 23:59:58,2020-07-05 00:55:14,wallstreetbets,ZAYN91,Calls on WSB.
102541,fnu4ip7,2020-04-18 23:59:58,2020-07-05 00:55:14,wallstreetbets,twotomatoes,HELLLLLLPPPPP
102544,fnu4iq0,2020-04-18 23:59:58,2020-07-05 00:55:14,wallstreetbets,Andrew_the_giant,Aw hell I'm in. What's to lose
102546,fnu4isv,2020-04-18 23:59:59,2020-07-05 00:55:15,wallstreetbets,ZAYN91,Calls on WSB.


In [5]:
# Write the cleaned comments to one CSV; the target 'cleaned' directory must already exist.
merged.to_csv('../scraped_data/pushshift/cleaned/comments_2012_11_2020_4_18.csv')