In [50]:
import ijson
import itertools
import numpy as np
import sys
import pandas as pd

The json is larger, so we use ijson here to load the data.

In [3]:
with open('comments.json', 'r') as f:
    comments = next(ijson.items(f, ''))

Take a look at the variables.

In [13]:
[i for i in comments['7u4r6']]

['text', 'author', 'score', 'ups', 'downs', 'date', 'created_utc', 'subreddit']

There are total of 12704751 comments.

In [109]:
len(comments)

12704751

Take a look at the first comment

In [111]:
comments['7u4r6']

{'text': 'Upvote For Simultaneous "Million Person" Marches on Wall Street And D.C.',
 'author': '[deleted]',
 'score': 48,
 'ups': 104,
 'downs': 56,
 'date': '2009-02',
 'created_utc': 1233540251,
 'subreddit': 'Economics'}

Partition data into small json files. Each json file has around 1000000 data, which are easier to manipulate.

In [73]:
# for i in np.arange(len(comments),1000000):
d = dict(itertools.islice(comments.items(), 11000000, len(comments)))
with open('comments' + str(12) + '.json', 'w') as f:
    f.write("%s\n" % d)

Take the first 1M comments to conduct EDA.

In [112]:
d = dict(itertools.islice(comments.items(), 1000000))
df = pd.DataFrame.from_dict(d).T
df.head()

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit
7u4r6,"Upvote For Simultaneous ""Million Person"" March...",[deleted],48,104,56,2009-02,1233540251,Economics
c07ewjj,Economics (29654 subscribers),pfft,14,14,0,2009-02,1233549003,Economics
7u4a5,Children in the Czech Republic are happier and...,[deleted],29,48,19,2009-02,1233533923,worldnews
c07ey0j,"Of course it's a ""less of a country"", those pe...",joe24pack,1,1,0,2009-02,1233553378,worldnews
7u1ht,Here we go again: Israeli PM vows 'sharp respo...,[deleted],14,23,9,2009-02,1233502066,worldnews


In [113]:
df.describe()

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit
count,1000000,999987,1000000,1000000,1000000,1000000,1000000,1000000
unique,978681,499132,11604,10598,3473,98,993066,9647
top,Why?,[deleted],1,-1,0,2017-04,1430446877,AskReddit
freq,242,40417,136837,142501,856724,56779,3,53655


Lowercase

In [116]:
df['text'] = df['text'].apply(lambda x: x.lower())

In [117]:
df.head()

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit
7u4r6,"upvote for simultaneous ""million person"" march...",[deleted],48,104,56,2009-02,1233540251,Economics
c07ewjj,economics (29654 subscribers),pfft,14,14,0,2009-02,1233549003,Economics
7u4a5,children in the czech republic are happier and...,[deleted],29,48,19,2009-02,1233533923,worldnews
c07ey0j,"of course it's a ""less of a country"", those pe...",joe24pack,1,1,0,2009-02,1233553378,worldnews
7u1ht,here we go again: israeli pm vows 'sharp respo...,[deleted],14,23,9,2009-02,1233502066,worldnews


Text length

In [119]:
df['len'] = df['text'].apply(lambda x: len(x.split(" ")))
df.head()

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit,len
7u4r6,"upvote for simultaneous ""million person"" march...",[deleted],48,104,56,2009-02,1233540251,Economics,11
c07ewjj,economics (29654 subscribers),pfft,14,14,0,2009-02,1233549003,Economics,3
7u4a5,children in the czech republic are happier and...,[deleted],29,48,19,2009-02,1233533923,worldnews,29
c07ey0j,"of course it's a ""less of a country"", those pe...",joe24pack,1,1,0,2009-02,1233553378,worldnews,41
7u1ht,here we go again: israeli pm vows 'sharp respo...,[deleted],14,23,9,2009-02,1233502066,worldnews,16


remove punctuation

In [122]:
import string
for i in string.punctuation:
    df['text'] = df['text'].apply(lambda x: x.replace(i, ""))

In [123]:
df

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit,len
7u4r6,upvote for simultaneous million person marches...,[deleted],48,104,56,2009-02,1233540251,Economics,11
c07ewjj,economics 29654 subscribers,pfft,14,14,0,2009-02,1233549003,Economics,3
7u4a5,children in the czech republic are happier and...,[deleted],29,48,19,2009-02,1233533923,worldnews,29
c07ey0j,of course its a less of a country those people...,joe24pack,1,1,0,2009-02,1233553378,worldnews,41
7u1ht,here we go again israeli pm vows sharp respons...,[deleted],14,23,9,2009-02,1233502066,worldnews,16
...,...,...,...,...,...,...,...,...,...
db4tzyy,must be training on artificial turf its bad fo...,pcavana17,1,-1,-1,2016-12,1481614451,FifaCareers,11
5hfp5r,conservation effort spreads seeds of destructi...,Crazy-Red-Fox,20,20,0,2016-12,1481311103,environmental_science,9
db0zycn,i do like the deceptive title of the article,officeroffkilter,1,-1,-1,2016-12,1481392382,environmental_science,9
db0412u,they give you a full refund and almost every p...,darkknightxda,5,-1,-1,2016-12,1481324578,GalaxyNote7,23


In [124]:
df.sort_values('ups')

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit,len
d5xrok6,please dance around the issue but never addres...,Ppitm1,-926,-926,0,2016-07,1469925152,AdviceAnimals,14
cmsw38i,my wife would of never said yes if i gave her ...,clwu,-592,-592,0,2014-12,1418400517,ImGoingToHellForThis,14
d5k8k3v,being on tinder shouldnt be a reason to break ...,jbloom3,-540,-540,0,2016-07,1469056347,AdviceAnimals,12
d566giu,he knows he shouldnt have said that and feels ...,waaavvvy,-448,-448,0,2016-07,1468115299,relationships,46
cioeyqy,your username seems really douchey right now,Danger716,-431,-431,0,2014-07,1404484720,nfl,7
...,...,...,...,...,...,...,...,...,...
5fk6rm,found an old linerider course i made in middle...,spitonem,124195,124195,0,2016-11,1480443010,gifs,10
5agzmb,me as the official obiwan kenboni,StanGibson18,127815,127815,0,2016-11,1477970696,pics,6
5by1wy,should have been bernie,Zykium,143013,143013,0,2016-11,1478662058,pics,4
5bx4bx,thanks obama,Itsjorgehernandez,196587,196587,0,2016-11,1478651245,pics,2


In [125]:
df.sort_values('downs')

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit,len
db0652l,no phone on the market that id like to purchas...,-r-i-p-,1,-1,-1,2016-12,1481327658,GalaxyNote7,13
dan3sew,aquafresh,Rajincajun01,6,-1,-1,2016-12,1480562942,NASCAR,1
dan2g5e,lets get barbasol back in the sport too,piper06w,7,-1,-1,2016-12,1480561123,NASCAR,8
damyrk1,buckshot jones,Doogert,6,-1,-1,2016-12,1480556226,NASCAR,2
dan94ag,dang that really mattered huh,Alxytho,2,-1,-1,2016-12,1480571193,GlobalOffensive,5
...,...,...,...,...,...,...,...,...,...
xnags,my girlfriend came home from afghanistan and p...,LEX-URA,1893,40671,38778,2012-08,1344037324,pics,8
si2jx,i love the design of these stairs and how they...,redditMEred,4097,43280,39183,2012-04,1334855379,pics,15
xrms4,maybe nasa should broadcast the olympics,SharkAttack123,4151,43620,39469,2012-08,1344264642,funny,6
obg8v,stopped they must be on this all depends,reddit,4791,50433,45642,2012-01,1326235621,blog,8


Scores are calculated by ups - downs

In [126]:
df.sort_values('score')

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit,len
d5xrok6,please dance around the issue but never addres...,Ppitm1,-926,-926,0,2016-07,1469925152,AdviceAnimals,14
d95wg5d,itt calling animals it instead of the provided...,Narrative_Causality,-888,-1,-1,2016-10,1477333097,Unexpected,9
cmsw38i,my wife would of never said yes if i gave her ...,clwu,-592,-592,0,2014-12,1418400517,ImGoingToHellForThis,14
d5k8k3v,being on tinder shouldnt be a reason to break ...,jbloom3,-540,-540,0,2016-07,1469056347,AdviceAnimals,12
d566giu,he knows he shouldnt have said that and feels ...,waaavvvy,-448,-448,0,2016-07,1468115299,relationships,46
...,...,...,...,...,...,...,...,...,...
64q56l,flying united,USBrock,141443,-1,-1,2017-04,1491908603,funny,2
5by1wy,should have been bernie,Zykium,143013,143013,0,2016-11,1478662058,pics,4
63pnhz,this image is now illegal in russia,GorillaS0up,164442,-1,-1,2017-04,1491439074,pics,7
5bx4bx,thanks obama,Itsjorgehernandez,196587,196587,0,2016-11,1478651245,pics,2


In [129]:
df.sort_values('score')['text'][2]

'my wife would of never said yes if i gave her that cheap ring'

A column to indicate positive/negative score

In [135]:
df['sign'] = df['score'].apply(lambda x: 1 if x >= 0 else -1)

import sentiment table

In [139]:
sen = pd.read_csv('vader_lexicon.txt', 
                   sep='\t',
                   usecols=[0, 1], 
                   header=None, 
                   names=['token', 'polarity'],
                   index_col='token'
                  )
sen.head()

Unnamed: 0_level_0,polarity
token,Unnamed: 1_level_1
$:,-1.5
%),-0.4
%-),-1.5
&-:,-0.4
&:,-0.7


In [140]:
tidy_format = (
    df['text']
    .str.split(expand=True)
    .stack()
    .reset_index(level=1)
    .rename(columns={'level_1': 'num', 0: 'word'})
)
tidy_format.head()

Unnamed: 0,num,word
7u4r6,0,upvote
7u4r6,1,for
7u4r6,2,simultaneous
7u4r6,3,million
7u4r6,4,person


calculate the sentiment score for the data

In [141]:
df['polarity'] = (
    tidy_format
    .merge(sen, how='left', left_on='word', right_index=True)
    .reset_index()
    .loc[:, ['index', 'polarity']]
    .groupby('index')
    .sum()
    .fillna(0)
)
df.head()

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit,len,sign,polarity
7u4r6,upvote for simultaneous million person marches...,[deleted],48,104,56,2009-02,1233540251,Economics,11,1,0.0
c07ewjj,economics 29654 subscribers,pfft,14,14,0,2009-02,1233549003,Economics,3,1,0.0
7u4a5,children in the czech republic are happier and...,[deleted],29,48,19,2009-02,1233533923,worldnews,29,1,7.3
c07ey0j,of course its a less of a country those people...,joe24pack,1,1,0,2009-02,1233553378,worldnews,41,1,4.5
7u1ht,here we go again israeli pm vows sharp respons...,[deleted],14,23,9,2009-02,1233502066,worldnews,16,1,-1.9


In [142]:
df.groupby('sign').describe()

Unnamed: 0_level_0,len,len,len,len,len,len,len,len,polarity,polarity,polarity,polarity,polarity,polarity,polarity,polarity
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
sign,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
-1,63587.0,11.304748,11.365108,1.0,6.0,9.0,15.0,1607.0,63586.0,0.148852,2.467892,-54.1,-1.1,0.0,1.5,28.9
1,936413.0,11.003799,8.12714,1.0,6.0,9.0,14.0,1680.0,936407.0,0.066369,2.214928,-112.7,-0.8,0.0,1.3,89.4


In [144]:
df.sort_values('score')

Unnamed: 0,text,author,score,ups,downs,date,created_utc,subreddit,len,sign,polarity
d5xrok6,please dance around the issue but never addres...,Ppitm1,-926,-926,0,2016-07,1469925152,AdviceAnimals,14,-1,1.3
d95wg5d,itt calling animals it instead of the provided...,Narrative_Causality,-888,-1,-1,2016-10,1477333097,Unexpected,9,-1,0.0
cmsw38i,my wife would of never said yes if i gave her ...,clwu,-592,-592,0,2014-12,1418400517,ImGoingToHellForThis,14,-1,1.7
d5k8k3v,being on tinder shouldnt be a reason to break ...,jbloom3,-540,-540,0,2016-07,1469056347,AdviceAnimals,12,-1,0.0
d566giu,he knows he shouldnt have said that and feels ...,waaavvvy,-448,-448,0,2016-07,1468115299,relationships,46,-1,-0.4
...,...,...,...,...,...,...,...,...,...,...,...
64q56l,flying united,USBrock,141443,-1,-1,2017-04,1491908603,funny,2,1,1.8
5by1wy,should have been bernie,Zykium,143013,143013,0,2016-11,1478662058,pics,4,1,0.0
63pnhz,this image is now illegal in russia,GorillaS0up,164442,-1,-1,2017-04,1491439074,pics,7,1,-2.6
5bx4bx,thanks obama,Itsjorgehernandez,196587,196587,0,2016-11,1478651245,pics,2,1,1.9


In [145]:
df.to_csv('sample_df.csv')