<a href="https://colab.research.google.com/github/sayarghoshroy/place2crash/blob/main/preproc_funcs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

import nltk
from nltk.tokenize import word_tokenize
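# word_tokenize (used in text_EDA) needs NLTK's tokenizer data;
# uncomment the next line on the first run in a fresh environment.
# nltk.download('punkt')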

import json

In [2]:
# Directory that holds the raw CSV and receives the generated JSON files.
# The path is relative to the working directory.
# NOTE(review): looks like a mounted Google Drive folder - confirm the mount
# step is performed before this notebook is run.
path = 'drive/My Drive/place2crash_data/'
# Name of the raw CSV file inside `path`.
file_name = 'data.csv'

In [3]:
# Load the raw CSV into a DataFrame. `path` already ends with '/', so plain
# string concatenation yields the full file location.
raw_data_df = pd.read_csv(path + file_name)

In [4]:
# Set to True to list every column of the CSV before any field is dropped.
view_raw_features = False

if view_raw_features:
  print('Raw features:')
  for head in raw_data_df.columns:
    print('-', head)

# Dropping certain metadata fields
# that would not contribute to price estimation.
# errors = 'ignore' keeps this cell re-runnable: `raw_data_df` is reassigned
# here, so on a second execution the columns are already gone and a strict
# drop would raise KeyError.
raw_data_df = raw_data_df.drop(['host_id', 'host_name', 'last_review', 'calculated_host_listings_count'],
                               axis = 1, errors = 'ignore')
# Fixed row order (by id) so the seeded shuffle later on is reproducible.
raw_data_df = raw_data_df.sort_values(by = 'id')

In [5]:
# Two-way lookup between column position and column header:
#   map_col_to_head[i]    -> header of column i
#   map_head_to_col[name] -> position of the column called `name`
map_col_to_head = list(raw_data_df.columns)
map_head_to_col = {head: index for index, head in enumerate(map_col_to_head)}

# Plain NumPy view of the table; columns are addressed through the maps above.
raw_data = raw_data_df.values

In [6]:
# Dataset dimensions: one row per datapoint, one column per retained field.
# `rows` is reused further down to size the train/test split.
rows, cols = raw_data.shape
print('# Features = ' + str(cols))
print('# Datapoints = ' + str(rows))

# Features = 12
# Datapoints = 48895


In [7]:
def numerical_EDA(attributes):
  """Summarise a numeric column, ignoring missing values.

  Args:
    attributes: array-like of values convertible to float; NaN marks a
      missing entry.

  Returns:
    dict with the keys 'nan_count', 'mean', 'var', 'sd', 'min', 'q1',
    'median', 'q3', 'max', 'iqr' and 'mode'. 'var' and 'sd' are the sample
    estimates (ddof = 1). 'mode' is the smallest of the most frequent values.
    When no non-missing value remains, every statistic except 'nan_count'
    is NaN.
  """
  attributes = np.asarray(attributes).astype('double')
  missing = np.isnan(attributes)
  nan_count = np.count_nonzero(missing)

  attributes = attributes[~ missing]

  if attributes.size == 0:
    # Nothing left to summarise: report NaN instead of failing inside
    # np.amin / np.quantile on an empty array.
    items = {'nan_count': nan_count}
    for key in ('mean', 'var', 'sd', 'min', 'q1', 'median',
                'q3', 'max', 'iqr', 'mode'):
      items[key] = np.nan
    return items

  # Both quartiles in a single pass; the IQR is derived from them.
  q1, q3 = np.quantile(attributes, [0.25, 0.75])

  # Mode via np.unique rather than scipy.stats.mode(...).mode[0]: newer SciPy
  # releases return a scalar mode for 1-D input, which cannot be indexed.
  # np.unique sorts its output and argmax picks the first maximum, so ties
  # resolve to the smallest value.
  values, counts = np.unique(attributes, return_counts = True)
  mode = values[np.argmax(counts)]

  items = {'nan_count': nan_count,
           'mean': np.mean(attributes),
           'var': np.var(attributes, ddof = 1),
           'sd': np.std(attributes, ddof = 1),
           'min': np.amin(attributes),
           'q1': q1,
           'median': np.median(attributes),
           'q3': q3,
           'max': np.amax(attributes),
           'iqr': q3 - q1,
           'mode': mode}

  return items

def categorical_EDA(attributes):
  """Tabulate the distinct values of a categorical column.

  Returns a dict with 'unique' (number of distinct values), 'unique_vals'
  (the distinct values, in the sorted order np.unique produces) and
  'unique_freqs' (occurrence count of each value, aligned with
  'unique_vals').
  """
  distinct, counts = np.unique(attributes, return_counts = True)
  return {'unique': distinct.shape[0],
          'unique_vals': distinct.tolist(),
          'unique_freqs': counts.tolist()}

def text_EDA(attributes):
  """Length statistics of a text column, in characters and in tokens.

  Every entry is converted with str() before it is measured, so a non-string
  entry is measured by its string form.
  NOTE(review): a missing value stored as float NaN would therefore be
  measured as the text 'nan' instead of showing up in 'nan_count' - confirm
  that this is intended.

  Returns:
    dict with the keys 'char' and 'token', each holding the numerical_EDA
    summary of the per-entry character counts and word_tokenize token counts
    respectively.
  """
  as_strings = [str(entry) for entry in np.ndarray.tolist(attributes)]

  char_counts = np.asarray([len(entry) for entry in as_strings])
  token_counts = np.asarray([len(word_tokenize(entry)) for entry in as_strings])

  char_summary = numerical_EDA(char_counts)
  token_summary = numerical_EDA(token_counts)
  return {'char': char_summary, 'token': token_summary}

In [8]:
# Set to False to skip printing the column listing.
view_features = True

if view_features:
  print('Features:')
  # One "<column index>: <header>" line per retained column; the index is the
  # same position that map_head_to_col stores for that header.
  for index, head in enumerate(map_col_to_head):
    print(str(index) + ': ' + head)

Features:
0: id
1: name
2: neighbourhood_group
3: neighbourhood
4: latitude
5: longitude
6: room_type
7: price
8: minimum_nights
9: number_of_reviews
10: reviews_per_month
11: availability_365


In [9]:
# Usage examples (uncomment a line to run it; each call returns a dict of summary values)

# numerical_EDA(raw_data[:, map_head_to_col['reviews_per_month']])
# numerical_EDA(raw_data[:, map_head_to_col['price']])
# categorical_EDA(raw_data[:, map_head_to_col['neighbourhood']])
# categorical_EDA(raw_data[:, map_head_to_col['neighbourhood_group']])
# text_EDA(raw_data[:, map_head_to_col['name']])

In [10]:
# Considering 10% of the data as a held-out test set
# Not used for any analysis

# np.random.shuffle permutes its argument in place. Start from a fresh copy of
# the DataFrame values each time, so that re-running this cell does not apply
# the seeded permutation to already shuffled rows and silently change the split.
np.random.seed(2022)
raw_data = raw_data_df.values.copy()
np.random.shuffle(raw_data)

# 90% of the rows (rounded down) go to training, the remainder is held out.
train_len = int(rows * 0.9)
test_len = rows - train_len

In [11]:
# The first `train_len` shuffled rows form the training split; every
# remaining row is held out as the test split.
train_data = raw_data[:train_len]
test_data = raw_data[train_len:]

In [12]:
# Saving Processed Items

def to_lists(data):
  """Convert a 2-D array into a list holding one plain Python list per row.

  This is the form that gets handed to json.dump when the splits are saved.
  """
  return [np.ndarray.tolist(data[row_id, :]) for row_id in range(data.shape[0])]

# Write both splits as JSON arrays of rows into the data directory.
# NOTE(review): json.dump writes float NaN as the bare token NaN, which is not
# strict JSON; if any column still holds NaN, confirm the consumers of these
# files accept it.
with open(path + 'train_data.json', 'w+') as f:
  json.dump(to_lists(train_data), f)

with open(path + 'test_data.json', 'w+') as f:
  json.dump(to_lists(test_data), f)

# Header -> column index lookup, needed to interpret the positional rows above.
with open(path + 'head_to_col_id.json', 'w+') as f:
  json.dump(map_head_to_col, f)

In [13]:
# Done