In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

In [2]:
df = pd.read_csv('/Volumes/external/Sangeetha-Project/reviews2.csv',
                 names=['reviwerId', 'asin', 'review', 'rating',
                        'summary', 'unixtime', 'pos_votes', 'total_votes']) \
    .drop(columns=['review', 'summary', 'unixtime', 'pos_votes', 'total_votes']) \
    .astype({'rating': 'int8'})

In [16]:
df.head()

Unnamed: 0,reviwerId,asin,rating
0,A11GG623AD89YT,578124114,4
1,A17APVES3TT17Y,578124114,4
2,A3T53I4UUNBKLK,578124114,5
3,AW7JJRAMYZY1X,578124114,3
4,A1WFOGD2IDKJPS,578124114,4


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 3 columns):
reviwerId    object
asin         object
rating       int8
dtypes: int8(1), object(2)
memory usage: 48.6+ MB


In [18]:
small_df = df.iloc[:1000].copy()

In [19]:
small_df.head()

Unnamed: 0,reviwerId,asin,rating
0,A11GG623AD89YT,578124114,4
1,A17APVES3TT17Y,578124114,4
2,A3T53I4UUNBKLK,578124114,5
3,AW7JJRAMYZY1X,578124114,3
4,A1WFOGD2IDKJPS,578124114,4


In [20]:
small_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
reviwerId    1000 non-null object
asin         1000 non-null object
rating       1000 non-null int8
dtypes: int8(1), object(2)
memory usage: 16.7+ KB


In [21]:
print('Reviewers under 5: ', (df.reviwerId.value_counts() < 5).mean())
print('Mean reviewers: ', df.reviwerId.value_counts().mean())
print('Median reviewers: ', df.reviwerId.value_counts().median())
print('-----------------------------------------------')
print('Books under 5: ', (df.asin.value_counts() < 5).mean())
print('Mean books: ', df.asin.value_counts().mean())
print('Median books: ', df.asin.value_counts().median())

Reviewers under 5:  0.6959542415268299
Mean reviewers:  5.440528674306378
Median reviewers:  3.0
-----------------------------------------------
Books under 5:  0.0
Mean books:  24.50379808870375
Median books:  10.0


In [22]:
(small_df.asin.value_counts() == 1).mean()

0.0

In [23]:
counts = small_df.asin.value_counts()

In [24]:
list(counts[counts == 5].index)

['0578124114', '0578126842']

In [25]:
id_counts = df.reviwerId.value_counts()

In [26]:
book_counts= df.asin.value_counts()
book_counts

147674355X    3725
1469984202    3230
0849922070    2925
0857521012    2779
1442359315    2580
              ... 
0982816405       5
1423127528       5
0819217816       5
0761127844       5
0825305543       5
Name: asin, Length: 122430, dtype: int64

In [27]:
id_counts_1 = list(id_counts[id_counts <3].index)
id_counts_1[:5]
len(id_counts_1)

213263

In [28]:
df[df.reviwerId.isin(id_counts_1)]

Unnamed: 0,reviwerId,asin,rating
7,A2BNVNPOTJJ2ZL,0578124688,2
10,A9JYJNPG5N8GI,0578124688,5
17,A1PZ2M1BH1OKMW,0578124688,5
22,A2G7RU33OJQOVK,0578124688,3
24,AXYHUMYGJRIXB,0578124696,4
...,...,...,...
2999978,A33C6NQ1MNEPYG,1481948067,1
2999984,A1RLK341TRY9UC,1481948067,4
2999986,AZ2SS29M7Y5K0,1481948377,5
2999989,A20ZWS1BZU9T0A,1481948377,5


In [29]:
df_dense = df.drop(index = df[df.reviwerId.isin(id_counts_1)].index)

In [30]:
df_dense.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2671202 entries, 0 to 2999999
Data columns (total 3 columns):
reviwerId    object
asin         object
rating       int8
dtypes: int8(1), object(2)
memory usage: 63.7+ MB


In [31]:
print('Reviewers under 5: ', (df_dense.reviwerId.value_counts() < 5).mean())
print('Mean reviewers: ', df_dense.reviwerId.value_counts().mean())
print('Median reviewers: ', df_dense.reviwerId.value_counts().median())
print('-----------------------------------------------')
print('Books under 5: ', (df_dense.asin.value_counts() < 5).mean())
print('Mean books: ', df_dense.asin.value_counts().mean())
print('Median books: ', df_dense.asin.value_counts().median())

Reviewers under 5:  0.5042022273875216
Mean reviewers:  7.899365377904742
Median reviewers:  4.0
-----------------------------------------------
Books under 5:  0.08969719742205305
Mean books:  21.819445692394403
Median books:  9.0


In [32]:
df_dense.asin.value_counts()

147674355X    3344
1469984202    2642
0849922070    2409
144235948X    2335
1455548987    2062
              ... 
0967841445       1
0789201399       1
0758229461       1
0915807025       1
0984049363       1
Name: asin, Length: 122423, dtype: int64

In [33]:
df_dense.reviwerId.value_counts()

AFVQZQ8PW0L       7174
A14OJS0VWMOSWO    6673
A2F6N60Z96CAJI    1734
A2VKWLCNZF4ZVB    1161
A328S9RN3U5M68    1113
                  ... 
A4CNYOX1VONFB        3
A1R5K4YDY3ZFTX       3
AJQXSJC9O6PEQ        3
A2EORY4JQPBCQJ       3
A3VRAJQI7LBXMA       3
Name: reviwerId, Length: 338154, dtype: int64

In [34]:
len(df_dense.asin.unique())

122423

In [35]:
len(df_dense.reviwerId.unique())

338154