In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

### Load the full dataframe

In [2]:
df = pd.read_csv('/Volumes/external/Sangeetha-Project/Original files/reviews1.csv.zip', compression = 'zip',
                 names=['reviwerId', 'asin', 'review', 'rating',
                        'summary', 'unixtime', 'pos_votes', 'total_votes']) \
    .drop(columns=['review', 'summary', 'unixtime', 'pos_votes', 'total_votes']) \
    .astype({'rating': 'int8'})

In [3]:
df.head()

Unnamed: 0,reviwerId,asin,rating
0,A10000012B7CGYKOMPQ4L,000100039X,5
1,A2S166WSCFIFP5,000100039X,5
2,A1BM81XB4QHOA3,000100039X,5
3,A1MOSTXNIO5MPJ,000100039X,5
4,A2XQ5LZHTD4AFT,000100039X,5


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3010000 entries, 0 to 3009999
Data columns (total 3 columns):
reviwerId    object
asin         object
rating       int8
dtypes: int8(1), object(2)
memory usage: 48.8+ MB


### Checking the density of the data

In [5]:
print('Reviewers under 5: ', (df.reviwerId.value_counts() < 5).mean())
print('Mean reviewers: ', df.reviwerId.value_counts().mean())
print('Median reviewers: ', df.reviwerId.value_counts().median())
print('-----------------------------------------------')
print('Books under 5: ', (df.asin.value_counts() < 5).mean())
print('Mean books: ', df.asin.value_counts().mean())
print('Median books: ', df.asin.value_counts().median())

Reviewers under 5:  0.6687836133393875
Mean reviewers:  6.074365571868221
Median reviewers:  3.0
-----------------------------------------------
Books under 5:  0.0
Mean books:  32.62978741856104
Median books:  12.0


In [6]:
id_counts = df.reviwerId.value_counts()
id_counts

AFVQZQ8PW0L       14072
A14OJS0VWMOSWO     3884
A2F6N60Z96CAJI     3852
A1K1JW1C5CUSUZ     1814
A320TMDV6KCFU      1718
                  ...  
AOX748JW0I28R         1
AXIXQEGLL4VMO         1
A2J14AK9URRP76        1
A1Q4F7N5P880EO        1
A2B3QVGHNDFUQW        1
Name: reviwerId, Length: 495525, dtype: int64

In [7]:
book_counts= df.asin.value_counts()
book_counts

030758836X    7440
0439023483    6717
0375831002    4864
038536315X    4604
0439023513    4440
              ... 
0441712347       5
0393323994       5
0345431820       5
0373812167       5
0312319304       5
Name: asin, Length: 92247, dtype: int64

### Attempting to make the data denser

In [8]:
id_counts_remove = list(id_counts[id_counts <3].index)
print(len(id_counts_remove))
id_counts_remove[:5]

189424


['A2V1NBLN9MUG4A',
 'A3419POCSDI5FO',
 'A1MHTZNZT97D6H',
 'A24IMAYPSKXB5N',
 'A1AFQWG2H2H7BT']

In [9]:
df[df.reviwerId.isin(id_counts_remove)]

Unnamed: 0,reviwerId,asin,rating
2,A1BM81XB4QHOA3,000100039X,5
8,A3FI0744PG1WYG,000100039X,5
24,AUTNO7VDY4H4A,000100039X,5
41,A3BHSR8LON67NA,000100039X,5
52,AW1Z7GYIBWB4K,000100039X,5
...,...,...,...
3009988,A3I044ICARNZK5,0578124114,5
3009992,A36ZD9ZRVXQGQ,0578124114,5
3009994,AQN0BSMCD7W1W,0578124114,5
3009996,A2N513SD0BR6VC,0578124114,5


In [10]:
df_dense = df.drop(index = df[df.reviwerId.isin(id_counts_remove)].index)

### Analysing the denser dataframe

In [11]:
df_dense.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2728640 entries, 0 to 3009999
Data columns (total 3 columns):
reviwerId    object
asin         object
rating       int8
dtypes: int8(1), object(2)
memory usage: 65.1+ MB


In [12]:
print('Reviewers under 5: ', (df_dense.reviwerId.value_counts() < 5).mean())
print('Mean reviewers: ', df_dense.reviwerId.value_counts().mean())
print('Median reviewers: ', df_dense.reviwerId.value_counts().median())
print('-----------------------------------------------')
print('Books under 5: ', (df_dense.asin.value_counts() < 5).mean())
print('Mean books: ', df_dense.asin.value_counts().mean())
print('Median books: ', df_dense.asin.value_counts().median())

Reviewers under 5:  0.4638174981460368
Mean reviewers:  8.914181920346552
Median reviewers:  5.0
-----------------------------------------------
Books under 5:  0.06259485711807813
Mean books:  29.580677334027147
Median books:  11.0


In [13]:
df_dense.asin.value_counts()

030758836X    6613
0439023483    5766
0375831002    4199
038536315X    3962
0316055433    3907
              ... 
0557482305       1
0345498143       1
0545237459       1
0385267029       1
0240820606       1
Name: asin, Length: 92244, dtype: int64

In [14]:
df_dense.reviwerId.value_counts()

AFVQZQ8PW0L       14072
A14OJS0VWMOSWO     3884
A2F6N60Z96CAJI     3852
A1K1JW1C5CUSUZ     1814
A320TMDV6KCFU      1718
                  ...  
A13AMTIGRKFG9K        3
AY7S97W40WAXJ         3
A22E78WHDI2AUZ        3
A17S3074CRAMZP        3
A2F87G3MVXRLNT        3
Name: reviwerId, Length: 306101, dtype: int64

In [15]:
len(df_dense.asin.unique())

92244

In [16]:
len(df_dense.reviwerId.unique())

306101

### Attempting to write this new dataframe to csv

In [17]:
dense_index = df_dense.index
dense_index

Int64Index([      0,       1,       3,       4,       5,       6,       7,
                  9,      10,      11,
            ...
            3009985, 3009986, 3009987, 3009989, 3009990, 3009991, 3009993,
            3009995, 3009997, 3009999],
           dtype='int64', length=2728640)

In [18]:
df.loc[dense_index]

Unnamed: 0,reviwerId,asin,rating
0,A10000012B7CGYKOMPQ4L,000100039X,5
1,A2S166WSCFIFP5,000100039X,5
3,A1MOSTXNIO5MPJ,000100039X,5
4,A2XQ5LZHTD4AFT,000100039X,5
5,A3V1MKC2BVWY48,000100039X,5
...,...,...,...
3009991,A2U7WBWEY9LV1K,0578124114,5
3009993,A1U1B6KJPGA4IP,0578124114,4
3009995,A33KNBGC6UM133,0578124114,3
3009997,A11VVB5KUO817E,0578124114,5


In [19]:
df

Unnamed: 0,reviwerId,asin,rating
0,A10000012B7CGYKOMPQ4L,000100039X,5
1,A2S166WSCFIFP5,000100039X,5
2,A1BM81XB4QHOA3,000100039X,5
3,A1MOSTXNIO5MPJ,000100039X,5
4,A2XQ5LZHTD4AFT,000100039X,5
...,...,...,...
3009995,A33KNBGC6UM133,0578124114,3
3009996,A2N513SD0BR6VC,0578124114,5
3009997,A11VVB5KUO817E,0578124114,5
3009998,AGPSU8WMSWFCZ,0578124114,5


In [20]:
reader = pd.read_csv('/Volumes/external/Sangeetha-Project/Original files/reviews1.csv.zip', compression='zip',
                 names=['reviwerId', 'asin', 'review', 'rating',
                        'summary', 'unixtime', 'pos_votes', 'total_votes'], chunksize = 10000)
counter= 10000
prev = 0
for temp_df in reader:
    
    indices_to_keep = list(df_dense.loc[prev:counter].index)
    print(len(indices_to_keep), min(indices_to_keep), max(indices_to_keep))
    
    temp_df.loc[temp_df.index.intersection(indices_to_keep)].to_csv('/Volumes/external/Sangeetha-Project/book_reviews.csv', 
              columns = ['reviwerId', 'asin', 'review', 'rating',
                        'summary', 'unixtime', 'pos_votes', 'total_votes'],
                          header=False, mode = 'a', index=False)
    print(prev, counter)
    prev = counter+1
    counter+=10000
    
    

9037 0 10000
0 10000
9254 10001 20000
10001 20000
8801 20001 30000
20001 30000
8423 30002 39998
30001 40000
8803 40002 50000
40001 50000
9107 50002 60000
50001 60000
9283 60001 70000
60001 70000
9173 70001 80000
70001 80000
9196 80002 90000
80001 90000
9263 90001 100000
90001 100000
9240 100001 110000
100001 110000
9268 110001 120000
110001 120000
8983 120001 130000
120001 130000
9173 130001 140000
130001 140000
9139 140001 150000
140001 150000
9176 150001 159999
150001 160000
9271 160001 170000
160001 170000
9166 170001 180000
170001 180000
9151 180001 189999
180001 190000
9143 190001 200000
190001 200000
9224 200001 210000
200001 210000
9225 210001 220000
210001 220000
9184 220001 230000
220001 230000
9192 230001 240000
230001 240000
9157 240001 250000
240001 250000
9162 250001 260000
250001 260000
9154 260001 270000
260001 270000
9176 270001 280000
270001 280000
9244 280001 290000
280001 290000
9247 290001 300000
290001 300000
9144 300001 310000
300001 310000
9165 310002 320000
3100

2330001 2340000
8727 2340001 2349999
2340001 2350000
7839 2350001 2360000
2350001 2360000
8054 2360001 2370000
2360001 2370000
8427 2370001 2380000
2370001 2380000
8764 2380001 2390000
2380001 2390000
9219 2390001 2400000
2390001 2400000
9140 2400001 2410000
2400001 2410000
9166 2410001 2420000
2410001 2420000
9132 2420001 2430000
2420001 2430000
9319 2430001 2440000
2430001 2440000
9524 2440001 2450000
2440001 2450000
9280 2450001 2459997
2450001 2460000
8885 2460003 2469999
2460001 2470000
8940 2470001 2480000
2470001 2480000
9062 2480002 2490000
2480001 2490000
9293 2490002 2500000
2490001 2500000
8605 2500001 2510000
2500001 2510000
8928 2510001 2520000
2510001 2520000
8937 2520001 2530000
2520001 2530000
9045 2530002 2539999
2530001 2540000
8993 2540007 2550000
2540001 2550000
8927 2550003 2560000
2550001 2560000
8863 2560001 2570000
2560001 2570000
9239 2570001 2580000
2570001 2580000
9205 2580001 2590000
2580001 2590000
9059 2590001 2599999
2590001 2600000
8714 2600001 2610000
2

### Testing the csv file to see if the copy went well

In [21]:
test_df = pd.read_csv('/Volumes/external/Sangeetha-Project/book_reviews.csv',
                 names=['reviwerId', 'asin', 'review', 'rating',
                        'summary', 'unixtime', 'pos_votes', 'total_votes']) \
    .drop(columns=['review', 'summary', 'unixtime','total_votes']).astype({'rating':'int8'})

In [22]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2728360 entries, 0 to 2728359
Data columns (total 4 columns):
reviwerId    object
asin         object
rating       int8
pos_votes    int64
dtypes: int64(1), int8(1), object(2)
memory usage: 65.0+ MB


In [23]:
test_df[test_df.rating.isnull()]

Unnamed: 0,reviwerId,asin,rating,pos_votes


In [24]:
print('Reviewers under 5: ', (test_df.reviwerId.value_counts() < 5).mean())
print('Mean reviewers: ', test_df.reviwerId.value_counts().mean())
print('Median reviewers: ', test_df.reviwerId.value_counts().median())
print('-----------------------------------------------')
print('Books under 5: ', (test_df.asin.value_counts() < 5).mean())
print('Mean books: ', test_df.asin.value_counts().mean())
print('Median books: ', test_df.asin.value_counts().median())

Reviewers under 5:  0.4638991705352155
Mean reviewers:  8.913267189587751
Median reviewers:  5.0
-----------------------------------------------
Books under 5:  0.06267074281254065
Mean books:  29.577641906248644
Median books:  11.0


In [26]:
test_df.describe()

Unnamed: 0,rating,pos_votes
count,2728360.0,2728360.0
mean,4.153319,3.821344
std,1.104071,28.08396
min,1.0,0.0
25%,4.0,0.0
50%,5.0,1.0
75%,5.0,2.0
max,5.0,23311.0


In [27]:
test_df.pos_votes.max()

23311

### Cleaning the negative values of the pos_votes and total_votes

In [87]:
test_df.loc[95:110]

Unnamed: 0,reviwerId,asin,rating,pos_votes
95,AMFSDTN7LJ4EM,000100039X,5,1
96,A3Q5B6YEMGK4HI,000100039X,5,0
97,AYZLXLVMJGTJV,000100039X,5,1
98,AG74166FELW9U,000100039X,5,0
99,A2ZZHMT58ZMVCZ,000100039X,5,-49
100,A5K32C2QZN3WL,000100039X,4,4
101,A1VSHT4FFP768T,000100039X,5,2
102,A4KYR93C6D5AI,000100039X,5,1
103,AQDD7PB1Z8ND,000100039X,5,0
104,A157UENZPTI1TD,000100039X,5,0


In [72]:
test_df[test_df.pos_votes < 0]

Unnamed: 0,reviwerId,asin,rating,pos_votes
99,A2ZZHMT58ZMVCZ,000100039X,5,-49
105,A3W43PSHRIG8KV,000100039X,4,-100
1163,A2PR6NXG0PA3KY,0002007770,2,-55
1497,ALLJT7S5QOFFZ,0002007770,3,-40
1640,A22ABFEL815EY7,0002007770,3,-18
...,...,...,...,...
2724046,A7H4LNXXJ3DM6,0575088885,3,-31
2724343,A3DQWFWINN3V5A,0575097469,5,-109
2725911,A259Q7R79WRGOY,0578032147,2,-69
2725979,A2X7NBUETXC19E,0578032147,5,-102


In [74]:
df[(df.reviwerId == 'A2ZZHMT58ZMVCZ')&(df.asin == '000100039X')]

Unnamed: 0,reviwerId,asin,rating
114,A2ZZHMT58ZMVCZ,000100039X,5


In [75]:
check_df = df = pd.read_csv('/Volumes/external/Sangeetha-Project/Original files/reviews1.csv.zip', compression = 'zip',
                 names=['reviwerId', 'asin', 'review', 'rating',
                        'summary', 'unixtime', 'pos_votes', 'total_votes'], nrows = 1000) \
    .drop(columns=['review', 'summary', 'unixtime']) \
    .astype({'rating': 'int8', 'pos_votes': 'int8', 'total_votes': 'int8'})

In [76]:
check_df[(check_df.reviwerId == 'A2ZZHMT58ZMVCZ')&(check_df.asin == '000100039X')]

Unnamed: 0,reviwerId,asin,rating,pos_votes,total_votes
114,A2ZZHMT58ZMVCZ,000100039X,5,-49,-41


In [81]:
import gzip 

data = []
counter = 0

with gzip.open('/Users/Sangeetha/Downloads/reviews_Books_5.json.gz') as f:
    for l in f:
        #To limit the number of rows
        if counter < 1000:
            data.append(json.loads(l.strip()))
            counter+=1
        else: 
            break

    og_df = pd.DataFrame.from_dict(data)        

In [85]:
og_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A10000012B7CGYKOMPQ4L,000100039X,Adam,"[0, 0]",Spiritually and mentally inspiring! A book tha...,5.0,Wonderful!,1355616000,"12 16, 2012"
1,A2S166WSCFIFP5,000100039X,"adead_poet@hotmail.com ""adead_poet@hotmail.com""","[0, 2]",This is one my must have books. It is a master...,5.0,close to god,1071100800,"12 11, 2003"
2,A1BM81XB4QHOA3,000100039X,"Ahoro Blethends ""Seriously""","[0, 0]",This book provides a reflection that you can a...,5.0,Must Read for Life Afficianados,1390003200,"01 18, 2014"
3,A1MOSTXNIO5MPJ,000100039X,Alan Krug,"[0, 0]",I first read THE PROPHET in college back in th...,5.0,Timeless for every good and bad time in your l...,1317081600,"09 27, 2011"
4,A2XQ5LZHTD4AFT,000100039X,Alaturka,"[7, 9]",A timeless classic. It is a very demanding an...,5.0,A Modern Rumi,1033948800,"10 7, 2002"


In [93]:
og_df[(og_df.reviewerID == 'A2ZZHMT58ZMVCZ')&(og_df.asin == '000100039X')]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
114,A2ZZHMT58ZMVCZ,000100039X,"L. Power ""nlp trainer""","[207, 215]",A prophet has waited twelve years in a coastal...,5.0,Deeper than you can imagine,1137456000,"01 17, 2006"


In [97]:
og_df[(og_df.reviewerID == 'A3W43PSHRIG8KV')&(og_df.asin == '000100039X')]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
120,A3W43PSHRIG8KV,000100039X,Mary Seale,"[156, 167]",I first became aware of Kahlil Gibran when I r...,4.0,Insightful Prose,957571200,"05 6, 2000"


In [106]:
og_df_subset = og_df.loc[95:125].copy()

In [107]:
og_df_subset

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
95,AAFLZI7MX9UIG,000100039X,J. M. DELEON,"[0, 0]",Slow reading but full of Significant messages....,5.0,Wonderful,1381104000,"10 7, 2013"
96,A1FQPOYRBTTK1,000100039X,"John H. Eagan ""Author: The Enlightenment, Wha...","[0, 0]",The Prophet by Kahlil Gibran is a book that's ...,5.0,Interesting book,1235865600,"03 1, 2009"
97,A1JBOMEZBH80JU,000100039X,"Joshua A. ""I love switching back and forth be...","[0, 0]",This is a classic of world literature. Everyo...,5.0,"A must read for every young person, a joy and ...",1385942400,"12 2, 2013"
98,A2DFYTW62P25Z,000100039X,J. R. Willoughby DDS,"[0, 0]",I was in my twenties when I first read The Pro...,5.0,Through four decades,1402704000,"06 14, 2014"
99,A2MPRPCAQLTR3L,000100039X,"Judith Land ""Adoption Detective | First Lilac...","[0, 0]",A book to be treasured. A tremendous poet deal...,5.0,Equalled only by Shakespeare!,1310688000,"07 15, 2011"
100,A2W6WXEUAVM3E,000100039X,JuJuBee,"[3, 5]",I read it in an hour. Kahlil Gibran is definit...,5.0,Timeless,1046304000,"02 27, 2003"
101,A7GT0WQKKDP0V,000100039X,"Julie Jordan Scott ""Writer, Life Coach - Owne...","[2, 2]",Reading a classic such as &quot;The Prophet&qu...,5.0,To Be Experienced as Well as Read,1065139200,"10 3, 2003"
102,ATAPOART4QGVO,000100039X,Kate,"[3, 15]",Maybe I just wasn't in the right mood for a he...,3.0,Now I'm Sleepy,1183420800,"07 3, 2007"
103,A2052JNVUPRTMT,000100039X,Kathy Adams,"[97, 103]",Gibran gets right down to the bedrock of what ...,5.0,Pure Wisdom,973814400,"11 10, 2000"
104,ADT39C7GOEAWW,000100039X,"Kenny ""crochet guy""","[0, 0]","The book came in great condition, as well as, ...",4.0,Satisfied.,1312588800,"08 6, 2011"


In [108]:
og_df_subset['x'] = [x for [x,y] in og_df_subset.helpful]
og_df_subset['y'] = [y for [x,y] in og_df_subset.helpful]
og_df_subset.drop(columns = ['helpful', 'reviewTime', 'reviewerName'], inplace=True)
og_df_subset

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,x,y
95,AAFLZI7MX9UIG,000100039X,Slow reading but full of Significant messages....,5.0,Wonderful,1381104000,0,0
96,A1FQPOYRBTTK1,000100039X,The Prophet by Kahlil Gibran is a book that's ...,5.0,Interesting book,1235865600,0,0
97,A1JBOMEZBH80JU,000100039X,This is a classic of world literature. Everyo...,5.0,"A must read for every young person, a joy and ...",1385942400,0,0
98,A2DFYTW62P25Z,000100039X,I was in my twenties when I first read The Pro...,5.0,Through four decades,1402704000,0,0
99,A2MPRPCAQLTR3L,000100039X,A book to be treasured. A tremendous poet deal...,5.0,Equalled only by Shakespeare!,1310688000,0,0
100,A2W6WXEUAVM3E,000100039X,I read it in an hour. Kahlil Gibran is definit...,5.0,Timeless,1046304000,3,5
101,A7GT0WQKKDP0V,000100039X,Reading a classic such as &quot;The Prophet&qu...,5.0,To Be Experienced as Well as Read,1065139200,2,2
102,ATAPOART4QGVO,000100039X,Maybe I just wasn't in the right mood for a he...,3.0,Now I'm Sleepy,1183420800,3,15
103,A2052JNVUPRTMT,000100039X,Gibran gets right down to the bedrock of what ...,5.0,Pure Wisdom,973814400,97,103
104,ADT39C7GOEAWW,000100039X,"The book came in great condition, as well as, ...",4.0,Satisfied.,1312588800,0,0


### Reuploading the original json file as csv

In [161]:
reader = pd.read_json('/Users/Sangeetha/Downloads/reviews_Books_5.json.gz',
                      compression = 'gzip', lines=True, chunksize=10000)

counter = 0
for data in reader:
    if counter < 10000:
        data['helpful_votes'] = [x for [x, y] in data.helpful]
        data['total_votes'] = [y for [x, y] in data.helpful]
        data.drop(columns=['reviewTime', 'helpful', 'reviewerName'], inplace=True)
        data.to_csv('/Volumes/external/Sangeetha-Project/reviews_clean.csv',
                          header=False, index=True, mode='a')
    else:
        break
        
    counter+=10000
    print(counter)

Index(['reviewerID', 'asin', 'reviewText', 'overall', 'summary',
       'unixReviewTime', 'helpful_votes', 'total_votes'],
      dtype='object')
0
10000


### Checking the pos_votes isn't negative -TURNS OUT I WAS CASTING WRONG INT TYPE

In [174]:
check_df = pd.read_csv('/Volumes/external/Sangeetha-Project/reviews_clean.csv',
                 names=['reviwerId', 'asin', 'review', 'rating',
                        'summary', 'unixtime', 'pos_votes', 'total_votes'], nrows = 1000) \
    .drop(columns=['review', 'summary', 'unixtime']) \
    .astype({'rating': 'int8', 'pos_votes': 'int8', 'total_votes': 'int8'})

In [175]:
check_df[(check_df.reviwerId == 'A2ZZHMT58ZMVCZ')&(check_df.asin == '000100039X')]

Unnamed: 0,reviwerId,asin,rating,pos_votes,total_votes
114,A2ZZHMT58ZMVCZ,000100039X,5,-49,-41
