In [13]:
import pandas as pd
# let pandas render up to 6000 rows before truncating a displayed frame
pd.set_option('display.max_rows', 6000)

# raw reviews table, read with pandas' default parsing
reviews = pd.read_csv('reviews.csv')

In [14]:
# column labels of the reviews DataFrame

reviews.columns

Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')

In [15]:
# first 5 rows of reviews

reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,18666,50158511,2015-10-10,1559265,Jody,Very nice flat. We had easy transportation to ...
1,18674,4808211,2013-05-27,4841196,Caron,"Great location. Clean, spacious flat. Would re..."
2,18674,10660311,2014-03-02,11600277,Juan Carlos,Mi mejor recomendación para este departamento....
3,18674,41087522,2015-08-04,35231385,Shlomi,"Big apartment, well equipped.\r\nVery good ser..."
4,18674,81000756,2016-06-20,23223644,Joost,The Check in was fast and flexible. The price ...


In [16]:
# relative frequency of each value in the reviewer_name column, most common first

# rename_axis / reset_index(name=...) set both output column labels explicitly,
# so the result does not depend on how value_counts names its index and values
# (that naming differs between pandas versions).
(
    reviews['reviewer_name']
    .value_counts(normalize=True)
    .rename_axis('reviewer_name')
    .reset_index(name='frequency')
)

Unnamed: 0,reviewer_name,frequency
0,David,0.006150
1,Laura,0.004847
2,Daniel,0.004826
3,Michael,0.004695
4,Anna,0.004472
...,...,...
84352,Vashti,0.000002
84353,Arife,0.000002
84354,Jakob Jonathan,0.000002
84355,Kristýna Eva,0.000002


In [17]:
# relative frequency of each value in the comments column, most common first

# rename_axis / reset_index(name=...) set both output column labels explicitly,
# so the result does not depend on how value_counts names its index and values
# (that naming differs between pandas versions).
(
    reviews['comments']
    .value_counts(normalize=True)
    .rename_axis('comment')
    .reset_index(name='frequency')
)

Unnamed: 0,comment,frequency
0,.,0.000990
1,The host canceled this reservation the day bef...,0.000570
2,The host canceled this reservation 2 days befo...,0.000446
3,Great location!,0.000355
4,Great location,0.000347
...,...,...
634151,I've much enjoyed the stay at Noemi's place. H...,0.000002
634152,Jana was a lovely host. She was very responsiv...,0.000002
634153,The room is small but nice and has everything ...,0.000002
634154,This was a great spot! My friend and I are cur...,0.000002


In [18]:
# determining missingness among columns

def missingness(df):
    """Return the percentage of missing values in each column of ``df``.

    The result is a Series indexed by column label, ordered by the number of
    missing values in the column, largest first.
    """
    null_counts = df.isna().sum()
    ranked = null_counts.sort_values(ascending=False)
    return ranked / len(df) * 100

# percentage of missing values in each column of reviews, highest first
missingness(reviews)

comments         0.044709
reviewer_name    0.000000
reviewer_id      0.000000
date             0.000000
id               0.000000
listing_id       0.000000
dtype: float64

In [19]:
# determining repeating columns

def repeating_cols(df):
    """Measure how often each pair of columns in ``df`` holds the same value.

    Every unordered pair of columns is compared row by row. Two cells match
    when they compare equal, or when both are missing; a plain ``==`` would
    treat NaN as different from NaN, so truly duplicated columns that contain
    missing values could never reach 100 %.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.

    Returns
    -------
    list of [ratio, label]
        ``ratio`` is the percentage of rows on which the two columns match and
        ``label`` is ``"<left> - <right>"``. The list is sorted in descending
        order (by ratio, then by label). A frame with no rows yields a ratio
        of 0.0 for every pair.
    """
    labels = list(df.columns)
    n_rows = len(df)
    match_ratios = []

    for left_pos in range(len(labels)):
        # positional access keeps each operand a Series even if labels repeat
        left = df.iloc[:, left_pos]
        for right_pos in range(left_pos + 1, len(labels)):
            right = df.iloc[:, right_pos]
            if n_rows == 0:
                # no rows to compare; avoid 0 / 0 producing NaN, which would
                # make the sort below ill-defined
                ratio = 0.0
            else:
                same = (left == right) | (left.isna() & right.isna())
                ratio = same.sum() / n_rows * 100
            # f-string formatting also works for non-string column labels
            match_ratios.append([ratio, f'{labels[left_pos]} - {labels[right_pos]}'])

    match_ratios.sort(reverse = True)

    return match_ratios

# percentage of rows on which each pair of reviews columns holds equal values
repeating_cols(reviews)

[[0.00015363838747294045, 'reviewer_name - comments'],
 [0.0, 'reviewer_id - reviewer_name'],
 [0.0, 'reviewer_id - comments'],
 [0.0, 'listing_id - reviewer_name'],
 [0.0, 'listing_id - reviewer_id'],
 [0.0, 'listing_id - id'],
 [0.0, 'listing_id - date'],
 [0.0, 'listing_id - comments'],
 [0.0, 'id - reviewer_name'],
 [0.0, 'id - reviewer_id'],
 [0.0, 'id - date'],
 [0.0, 'id - comments'],
 [0.0, 'date - reviewer_name'],
 [0.0, 'date - reviewer_id'],
 [0.0, 'date - comments']]

In [20]:
# column dtypes as inferred by read_csv, before any conversion

reviews.dtypes

listing_id        int64
id                int64
date             object
reviewer_id       int64
reviewer_name    object
comments         object
dtype: object

In [21]:
# parse the date column from strings into datetimes; the unit is spelled out
# because pandas 2.x rejects the unit-less 'datetime64' dtype in astype
reviews = reviews.astype({'date': 'datetime64[ns]'})
reviews.dtypes

listing_id                int64
id                        int64
date             datetime64[ns]
reviewer_id               int64
reviewer_name            object
comments                 object
dtype: object

In [22]:
# preview the first rows after the date conversion
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,18666,50158511,2015-10-10,1559265,Jody,Very nice flat. We had easy transportation to ...
1,18674,4808211,2013-05-27,4841196,Caron,"Great location. Clean, spacious flat. Would re..."
2,18674,10660311,2014-03-02,11600277,Juan Carlos,Mi mejor recomendación para este departamento....
3,18674,41087522,2015-08-04,35231385,Shlomi,"Big apartment, well equipped.\r\nVery good ser..."
4,18674,81000756,2016-06-20,23223644,Joost,The Check in was fast and flexible. The price ...


In [23]:
# write the reviews frame to CSV, leaving out the row index
reviews.to_csv('reviews-clean.csv', index = False)

In [24]:
# write the reviews frame to JSON in 'split' orientation, leaving out the row index
# NOTE(review): no date_format is passed, so to_json's default encoding is used
# for the datetime 'date' column — confirm the consumer of reviews.json expects it
reviews.to_json('reviews.json', orient = 'split', index = False)