In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data preprocessing

In [None]:
# Read the book-review export into a DataFrame and preview the first rows.
reviews_csv = '/content/Amazonbookreviews.csv'
df = pd.read_csv(reviews_csv)
df.head(5)

Unnamed: 0,reviewerID,asin,reviewername,helpful,reviewerText,overallRating,Summary,UnixreviewTime,reviewTime
0,A2SUAM1J3GNN3B,23456.0,SNEHA,2,I bought this for my husband who plays the pia...,4.0,SUBWAY SURF,,
1,A2SUAN1J3GNN0C,879877.0,KARTHII,3,good book to read,3.0,PILOT,,
2,A2SUAN1J3GNN7B,23456.0,SHASHANK,3,high cost,2.5,DRIVE IN HEAVEN,,
3,A5SUAN1J3GNN9B,67578.0,NITIN,3,good useof book,2.2,SPORTS HEAVEN,,
4,A1SUAM1J3GNN3B,123.0,PRANOTI,3,I m happy with the book,3.0,HEAVEN OF PEARLS,,


In [None]:
# Count the missing values in every column.
df.isna().sum()

reviewerID         2
asin               2
reviewername       1
helpful            0
reviewerText       0
overallRating      0
Summary            0
UnixreviewTime    45
reviewTime        45
dtype: int64

# Data subsets

In [None]:
# Remove the two timestamp columns (45 missing values each) by name rather
# than by position, so re-running this cell cannot strip further columns.
df = df.drop(columns=['UnixreviewTime', 'reviewTime'], errors='ignore')

In [None]:
# Inspect the dtype pandas inferred for each remaining column.
df.dtypes

reviewerID        object
asin             float64
reviewername      object
helpful            int64
reviewerText      object
overallRating    float64
Summary           object
dtype: object

In [None]:
# Discard every row that has a missing value in any column.
# The result is rebound to `df` instead of using inplace=True, which keeps the
# step chainable and follows the recommended pandas style.
df = df.dropna(how='any')

In [None]:
# Cast the product id and the rating from float to integer in a single pass.
# Note: the integer cast drops the fractional part, so a rating of 2.5 becomes 2.
integer_columns = {'asin': 'int', 'overallRating': 'int'}
df = df.astype(integer_columns)

In [None]:
# Re-check the dtypes to confirm asin and overallRating are now integer columns.
df.dtypes

reviewerID       object
asin              int64
reviewername     object
helpful           int64
reviewerText     object
overallRating     int64
Summary          object
dtype: object

# Merge Data

In [None]:
# Show the smallest and largest rating present in the data.
rating_values = df['overallRating']
min(rating_values), max(rating_values)

(2, 5)

In [None]:
# Bucket the integer ratings into two groups: low ('1-3') and high ('4-5').
# Bin edges [1, 3, 5] with right=True give the intervals (1, 3] and (3, 5].
# include_lowest=True closes the first interval on the left so that a rating of
# exactly 1 is labelled '1-3' instead of silently becoming NaN.
# (The variable name `lables` is kept as-is in case later cells reference it.)
lables = ['1-3', '4-5']
rating_bin_edges = [1, 3, 5]
df['overallRating group'] = pd.cut(
    df['overallRating'],
    bins=rating_bin_edges,
    labels=lables,
    right=True,
    include_lowest=True,
)
df.head()

Unnamed: 0,reviewerID,asin,reviewername,helpful,reviewerText,overallRating,Summary,overallRating group
0,A2SUAM1J3GNN3B,23456,SNEHA,2,I bought this for my husband who plays the pia...,4,SUBWAY SURF,4-5
1,A2SUAN1J3GNN0C,879877,KARTHII,3,good book to read,3,PILOT,1-3
2,A2SUAN1J3GNN7B,23456,SHASHANK,3,high cost,2,DRIVE IN HEAVEN,1-3
3,A5SUAN1J3GNN9B,67578,NITIN,3,good useof book,2,SPORTS HEAVEN,1-3
4,A1SUAM1J3GNN3B,123,PRANOTI,3,I m happy with the book,3,HEAVEN OF PEARLS,1-3


# Sort Data

In [None]:
# Order the reviews from lowest to highest rating and preview the first rows.
sortedDf = df.sort_values(by='overallRating', ascending=True)
sortedDf.head(5)

Unnamed: 0,reviewerID,asin,reviewername,helpful,reviewerText,overallRating,Summary,overallRating group
38,SISISWER4657893,111,RIYAA,3,098O,2,reader is giood,1-3
37,LIHJKD87634O21,678,KHANN,3,3434567,2,stare the raeder,1-3
2,A2SUAN1J3GNN7B,23456,SHASHANK,3,high cost,2,DRIVE IN HEAVEN,1-3
3,A5SUAN1J3GNN9B,67578,NITIN,3,good useof book,2,SPORTS HEAVEN,1-3
20,VABI9876ERTT1,9876,MANISH,2,fdghi,2,ONE FOR ALLL,1-3


# Transposing Data

In [None]:
# Swap rows and columns: each review becomes a column and each field a row.
transposed_df = sortedDf.T
transposed_df.head(5)

Unnamed: 0,38,37,2,3,20,19,18,17,36,16,...,25,24,23,22,42,12,10,28,43,9
reviewerID,SISISWER4657893,LIHJKD87634O21,A2SUAN1J3GNN7B,A5SUAN1J3GNN9B,VABI9876ERTT1,FAJI234GAJI987,AAAJI37DFG378I,MIKI23412341QW,MAHI2348900122,SISISWER4657893,...,MIKISDF1232XAD,NIKI12NIKI1231,GHAU8768QWER,MIKISDF1232XAD,QWCVFED123,HGKIUYE345SFDG,9345GHIKLFLLED,CADERVG3450EREW,123LKIUHNFG,1234ZSDFREQWEG
asin,111,678,23456,67578,9876,22456,987,123,22245,876,...,1,2,321,879,11111,987,321456,11,9800,78761
reviewername,RIYAA,KHANN,SHASHANK,NITIN,MANISH,DIPTI,PANKAJ,SALMAN,KANGANA,RAJESH,...,JACOB,PRAJWAL,SUMEET,AISHWARYA,DHONI,MOHIT,RAJU,MORRIS,MAHI,SATISH
helpful,3,3,3,3,2,2,2,2,3,2,...,3,3,2,2,2,2,2,3,2,2
reviewerText,098O,3434567,high cost,good useof book,fdghi,faskjas,reader is giood,stare the raeder,67667,money is dis,...,0.888888888,,,,HELLO DEAR,the dealer is charged more money,no mini,staff is good,DAER PLEASE READ BOOK,many errors


# Reshape Data

In [None]:
# Lay the table's cells out row by row and regroup them ten per row.
# -1 lets NumPy infer the row count (32 for the 320 cells the original call
# assumed) instead of hard-coding it, so the cell keeps working when the number
# of rows changes, as long as the total cell count is divisible by 10.
# Note: despite its name, `reshaped_df` is a NumPy array, not a DataFrame.
reshaped_df = df.values.reshape((-1, 10))
print(reshaped_df)

[['A2SUAM1J3GNN3B' 23456 'SNEHA' 2
  'I bought this for my husband who plays the piano. He is having a wonderful time playing these old hymns. The music is at times hard to read because we think the book was published for singing from more than playing from. Great purchase though!'
  4 'SUBWAY SURF' '4-5' 'A2SUAN1J3GNN0C' 879877]
 ['KARTHII' 3 'good book to read' 3 'PILOT ' '1-3' 'A2SUAN1J3GNN7B' 23456
  'SHASHANK' 3]
 ['high cost' 2 'DRIVE IN HEAVEN' '1-3' 'A5SUAN1J3GNN9B' 67578 'NITIN' 3
  'good useof book' 2]
 ['SPORTS HEAVEN' '1-3' 'A1SUAM1J3GNN3B' 123 'PRANOTI' 3
  'I m happy with the book' 3 'HEAVEN OF PEARLS' '1-3']
 ['BA2345WERSDFRER' 236789 'SHALINI' 3
  'im not happy with the delay of product' 3 'GODS GIFT' '1-3'
  'ASDF345671QWERT' 224442]
 ['SHIKHA' 3 'Wonderful book' 3 'LIFE OF PIE' '1-3' 'MNHJ879WERFGH' 98767
  'AKSHAY' 2]
 ['The music is at times hard to read because we think the book was published for singing from more than playing from. Great purchase though!'
  3 'IND