In [1]:
#!pip install mysql-connector-python

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import pymysql as mysql
import matplotlib.pyplot as plt
import os
import warnings
import getpass
import re  
from textblob import TextBlob

import mysql.connector as msql
from mysql.connector import Error
warnings.filterwarnings('ignore')

In [2]:
# Helper function to connect to db

def connect_to_msql(database=''):
    """Open a MySQL connection to ``localhost`` as user ``root``.

    The password is requested interactively through ``getpass`` on every call.

    Args:
        database: Name of the database to select. It is formatted into a
            string before being handed to the connector; the default is the
            empty string.

    Returns:
        The connection object returned by ``msql.connect``, or ``None`` when
        the attempt raises ``mysql.connector.Error`` (the error is printed,
        not re-raised).
    """
    # Bind the name up front so a failed connect returns None instead of
    # raising UnboundLocalError at the return statement.
    conn = None
    try:
        conn = msql.connect(
            host='localhost',
            user='root',
            password=getpass.getpass('Enter password:'),
            database=f'{database}',
        )
        print("Connected to MySQL...")
    except Error as e:
        print("Error while connecting to MySQL", e)
    return conn

# Function to find unique array values

def unique(array):
    """Print and return the sorted distinct values of ``array``.

    Args:
        array: Any array-like accepted by ``np.array``.

    Returns:
        The ndarray produced by ``np.unique`` for the input.
    """
    distinct = np.unique(np.array(array))
    print("Unique list values: ", distinct)
    return distinct

# Functions for sentiment analysis

def get_sentiment(comment):
    """Return the TextBlob sentiment polarity of ``comment``.

    Args:
        comment: Text to analyse; it is passed straight to ``TextBlob``.

    Returns:
        The ``sentiment.polarity`` value TextBlob reports for the text.
    """
    return TextBlob(comment).sentiment.polarity

def get_sentiment_label(score):
    """Map a polarity score to a two-way label.

    Args:
        score: Numeric polarity value.

    Returns:
        ``"positive"`` when ``score >= 0`` (zero included), otherwise
        ``"negative"``.
    """
    return "positive" if score >= 0 else "negative"

Clean Calendar and Export

ETA: 3 sec

In [4]:
# Load the raw calendar export.
calendar = pd.read_csv('Dataset/calendar.csv', index_col=False, delimiter=',')

# step 1: remove dollar sign and thousands separator, then convert to float.
# regex=False treats '$' as a literal character: as a regular expression it is
# the end-of-string anchor, and the default of `regex` differs between pandas
# versions, so the literal intent is spelled out explicitly.
calendar['price'] = (
    calendar['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# step 2: export csv
calendar.to_csv('./Clean/calendar.csv', encoding='utf-8', index=False)


Clean Reviews and Export

ETA: 30 sec

In [5]:
# Load the raw reviews and derive the sentiment columns in one chain.
# NOTE(review): some older pandas releases may treat an explicit value=None in
# replace(np.nan, None) as "no value given" and forward-fill instead — confirm
# against the installed pandas version.
reviews = (
    pd.read_csv('Dataset/reviews.csv', index_col=False, delimiter=',')
    .replace(np.nan, None)
    # step 1: replace None values with empty string so every comment is text
    .assign(comments=lambda frame: frame['comments'].replace([None], ""))
    # step 2: create new sentiment columns (score first, then its label)
    .assign(sentiment=lambda frame: frame["comments"].apply(get_sentiment))
    .assign(sentiment_label=lambda frame: frame["sentiment"].apply(get_sentiment_label))
)

# step 3: export csv
reviews.to_csv('./Clean/reviews.csv', encoding='utf-8', index=False)


Clean Listings and Export

ETA: 5 sec

In [6]:
# Load the raw listings export and turn NaN into None.
# NOTE(review): some older pandas releases may treat an explicit value=None in
# replace(np.nan, None) as "no value given" and forward-fill instead — confirm
# against the installed pandas version.
listings = pd.read_csv('Dataset/listings.csv', index_col=False, delimiter=',')
listings = listings.replace(np.nan, None)

# step 1: remove dollar sign and thousands separator, then convert to float for
#   price, weekly_price, security_deposit, cleaning_fee, and extra_people.
# regex=False treats '$' as a literal character rather than the regex
# end-of-string anchor, independent of the pandas version's default.

money_cols = [
    'price',
    'weekly_price',
    'security_deposit',
    'cleaning_fee',
    'extra_people'
]

for col in money_cols:
    listings[col] = (
        listings[col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

# step 2: remove percent sign and convert to float for
#   host_response_rate and host_acceptance_rate

rate_cols = [
    'host_response_rate',
    'host_acceptance_rate'
]

for col in rate_cols:
    listings[col] = listings[col].str.replace('%', '', regex=False).astype(float)

# step 3: convert t and f strings to 1 and 0 for boolean cols

bool_cols = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'is_location_exact',
    'has_availability',
    'requires_license',
    'instant_bookable',
    'require_guest_profile_picture',
    'require_guest_phone_verification'
]

for cols in bool_cols:
    listings[cols] = listings[cols].replace({ 't': 1, 'f': 0 })

# step 4: replace None values with empty string so the text columns can be
#   passed to get_sentiment

text_cols = [
    'summary',
    'space',
    'description'
]

for col in text_cols:
    listings[col] = listings[col].replace([None], "")

# step 5: create new sentiment columns (<column>_sentiment), appended in the
#   same order as text_cols

for col in text_cols:
    listings[f"{col}_sentiment"] = listings[col].apply(get_sentiment)

# step 6: export csv

listings.to_csv('./Clean/listings.csv', encoding='utf-8', index=False)