In [51]:
#!pip install mysql-connector-python
!pip install textblob

Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [52]:
# Import dependencies
import pandas as pd
import numpy as np
import pymysql as mysql
import os
import warnings
import getpass
import re  
from textblob import TextBlob

import mysql.connector as msql
from mysql.connector import Error

#Python package that provides interfaces to AWS including Amazon S3
import boto3

warnings.filterwarnings('ignore')

In [3]:
# Helper function to connect to db

def connect_to_msql(database=''):
    """Open a connection to the MySQL server on ``localhost`` as ``root``.

    The password is requested interactively with ``getpass`` so it is never
    written into the notebook.

    Parameters
    ----------
    database : str, optional
        Name of the database to select. Defaults to the empty string, which
        is passed through to the connector unchanged.

    Returns
    -------
    The object returned by ``msql.connect`` on success, or ``None`` when the
    connector raised ``Error`` (the error is printed, not re-raised).
    """
    # Bind the name up front: without this, a failed connect left `conn`
    # undefined and `return conn` raised UnboundLocalError.
    conn = None
    try:
        conn = msql.connect(
            host='localhost',
            user='root',
            password=getpass.getpass('Enter password:'),
            database=f'{database}',
        )
        print("Connected to MySQL...")
    except Error as e:
        print("Error while connecting to MySQL", e)
    return conn

# Function to find unique array values

def unique(array):
    """Print and return the sorted distinct values of ``array``.

    Parameters
    ----------
    array : array-like
        Any sequence that ``numpy`` can convert to an array.

    Returns
    -------
    numpy.ndarray
        The distinct values, as produced by ``np.unique``.
    """
    # Compute once and reuse for both the printout and the return value.
    distinct = np.unique(np.asarray(array))
    print("Unique list values: ", distinct)
    return distinct

# Functions for sentiment analysis

def get_sentiment(comment):
    """Return the TextBlob polarity score for ``comment``.

    Parameters
    ----------
    comment : str or None
        Text to score. ``None``, a float NaN and the empty string are all
        treated as "no text".

    Returns
    -------
    float
        ``blob.sentiment.polarity`` for real text, or ``0.0`` when there is
        no text to analyse.
    """
    # NaN is the only float that compares unequal to itself; this catches the
    # missing-value marker without needing an extra import.
    is_missing = comment is None or (isinstance(comment, float) and comment != comment)
    if is_missing or comment == "":
        # Nothing to analyse: score it as neutral instead of handing a
        # non-string to TextBlob.
        return 0.0
    blob = TextBlob(comment)
    return blob.sentiment.polarity

def get_sentiment_label(score):
    """Turn a numeric sentiment score into a two-class label.

    A score of zero or more is labelled "positive"; anything that fails the
    ``score >= 0`` comparison is labelled "negative".
    """
    return "positive" if score >= 0 else "negative"

Extract the Data from S3 Bucket

In [40]:
# Build an S3 service resource used to access and manage objects in S3 buckets.
# No credentials or region are passed here, so boto3 has to resolve them itself.
s3 = boto3.resource(service_name='s3')

In [41]:
# Rebind `s3` to a resource built with explicit credentials for us-east-1.
# Both keys are typed in through getpass (access key first, then the secret
# key) so neither value is stored in the notebook.
s3 = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=getpass.getpass('aws access key:'),
    aws_secret_access_key=getpass.getpass('aws secret access key:'),
)

aws access key:········
aws secret access key:········


In [42]:
# Print a summary line for every object stored in the raw-data bucket.
raw_bucket = s3.Bucket('airbnbseattle')
for obj in raw_bucket.objects.all():
    print(obj)

s3.ObjectSummary(bucket_name='airbnbseattle', key='calendar.csv')
s3.ObjectSummary(bucket_name='airbnbseattle', key='listings.csv')
s3.ObjectSummary(bucket_name='airbnbseattle', key='reviews.csv')


Clean Calendar and Export

ETA: 3 sec

In [48]:
# Get calendar dataset from airbnbseattle bucket
cal_object = s3.Bucket('airbnbseattle').Object('calendar.csv').get()
calendar = pd.read_csv(cal_object["Body"])

# step 1: remove dollar sign and thousands separator, then convert to float.
# regex=False forces a literal match: '$' is an end-of-string anchor when the
# pattern is treated as a regular expression, and the default for `regex`
# differs between pandas versions.
calendar['price'] = (
    calendar['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# step 2: export csv (create the output folder first so the write cannot fail
# on a fresh checkout)
os.makedirs('./Clean', exist_ok=True)
# NOTE(review): the DataFrame index is written as an unnamed first column here,
# whereas the reviews and listings exports pass index=False — confirm that the
# extra column is intended before changing it.
calendar.to_csv('./Clean/calendar.csv')

# step 3: Upload calendar.csv into the S3 bucket
bucket = s3.Bucket('airbnbseattleclean')
bucket.upload_file('./Clean/calendar.csv', 'calendar.csv')

Clean Reviews and Export

ETA: 30 sec

In [53]:
# Get reviews dataset from airbnbseattle bucket
rev_object = s3.Bucket('airbnbseattle').Object('reviews.csv').get()
reviews = pd.read_csv(rev_object["Body"])
# Map NaN -> None with the dict form. The positional form replace(np.nan, None)
# is read as "no replacement value given" by older pandas releases, which then
# fall back to forward-filling from the previous row.
reviews = reviews.replace({np.nan: None})

# step 1: replace missing comments with empty string

reviews['comments'] = reviews['comments'].fillna("")

# step 2: create new sentiment columns

reviews["sentiment"] = reviews["comments"].apply(get_sentiment)
reviews["sentiment_label"] = reviews["sentiment"].apply(get_sentiment_label)

# step 3: export csv (create the output folder first so the write cannot fail
# on a fresh checkout)
os.makedirs('./Clean', exist_ok=True)
reviews.to_csv('./Clean/reviews.csv', encoding='utf-8', index=False)

# step 4: Upload reviews.csv into the S3 bucket
bucket = s3.Bucket('airbnbseattleclean')
bucket.upload_file('./Clean/reviews.csv', 'reviews.csv')


Clean Listings and Export

ETA: 5 sec

In [54]:
# Get listings dataset from airbnbseattle bucket
list_object = s3.Bucket('airbnbseattle').Object('listings.csv').get()
listings = pd.read_csv(list_object["Body"])
# Map NaN -> None with the dict form. The positional form replace(np.nan, None)
# is read as "no replacement value given" by older pandas releases, which then
# fall back to forward-filling from the previous row.
listings = listings.replace({np.nan: None})

# step 1: remove dollar sign and thousands separator, then convert to float for
#   price, weekly_price, security_deposit, cleaning_fee, and extra_people.
# regex=False forces a literal match: '$' is an end-of-string anchor when the
# pattern is treated as a regular expression.

money_cols = [
    'price',
    'weekly_price',
    'security_deposit',
    'cleaning_fee',
    'extra_people'
]

for col in money_cols:
    listings[col] = (
        listings[col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

# step 2: remove percent sign and convert to float for
#   host_response_rate and host_acceptance_rate

percent_cols = ['host_response_rate', 'host_acceptance_rate']

for col in percent_cols:
    listings[col] = listings[col].str.replace('%', '', regex=False).astype(float)

# step 3: convert t and f strings to 1 and 0 for boolean cols
# (values other than 't'/'f' are left as they are)

bool_cols = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'is_location_exact',
    'has_availability',
    'requires_license',
    'instant_bookable',
    'require_guest_profile_picture',
    'require_guest_phone_verification'
]

for col in bool_cols:
    listings[col] = listings[col].replace({ 't': 1, 'f': 0 })

# step 4: replace missing text with empty string, and
# step 5: create the matching <column>_sentiment column for each text field

text_cols = ['summary', 'space', 'description']

for col in text_cols:
    listings[col] = listings[col].fillna("")
    listings[f"{col}_sentiment"] = listings[col].apply(get_sentiment)

# step 6: export csv (create the output folder first so the write cannot fail
# on a fresh checkout)

os.makedirs('./Clean', exist_ok=True)
listings.to_csv('./Clean/listings.csv', encoding='utf-8', index=False)

# step 7: Upload listings.csv into the S3 bucket
bucket = s3.Bucket('airbnbseattleclean')
bucket.upload_file('./Clean/listings.csv', 'listings.csv')