In [None]:
#Analysis of DogVacay host names

In [1]:
import requests
import pandas as pd
import sqlite3 as lite
import numpy as np
import statsmodels.api as sm
from collections import Counter
import itertools
import re
import difflib
import keyword

In [2]:
df = pd.read_csv('dog-vacay.csv')

In [3]:
df.shape

(7921, 18)

In [4]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,fee,names,ratings,repeat,response_time,review,services,title,boarding,sitting,daycare,checkups,walking,bathing,training,grooming,pickup/dropoff
0,25,patricia's pet heaven,4.5,9,1,11,1,"Phoenix, Arizona",1,0,0,0,0,0,0,0,0
1,35,**DOGGIE BED & BARKFEST**,5.0,1,2,45,3,"Phoenix, Arizona",1,0,1,0,0,1,0,0,0
2,20,Doggie Heaven,5.0,1,3,28,4,"Phoenix, Arizona",1,1,1,1,0,0,0,0,0
3,25,Stay at home mother and dog lover!,5.0,5,2,6,4,"Phoenix, Arizona",1,0,1,1,1,0,0,0,0
4,20,Kelrey's Doggy Dream Stay,4.5,3,1,3,4,"Phoenix, Arizona",1,0,1,0,0,1,0,0,1


In [6]:
#Split host names into words
df['names'] = df['names'].apply(lambda x: str(x).split())

In [7]:
#Combine all words into one list
list_ = df['names'].tolist()
merged_list = list(itertools.chain(*list_))

In [8]:
#Add occurrence of word to list w Counter
namecount = list(Counter(merged_list).items())

In [9]:
#Create dataframe
hostnames = pd.DataFrame(namecount)

In [10]:
#Add column names
hostnames.columns = ['word', 'occurrence']

In [11]:
#Sort
hostnames.sort('occurrence', ascending=False)

Unnamed: 0,word,occurrence
4127,Dog,1298
5132,Pet,761
1100,Care,540
5464,and,465
1641,Home,445
6166,&,445
3519,in,406
4524,Boarding,391
3807,Puppy,388
4835,Doggie,326


In [None]:
#In-progress: Clean data
#Remove prepositions, symbols, etc.

In [17]:
#df = df[df.name != 'Tina']
hostnames = hostnames[(hostnames.word != 'in') & 
                  (hostnames.word != '&') & 
                  (hostnames.word != 'and') & 
                  (hostnames.word != 'for') & 
                  (hostnames.word != 'The') & 
                  (hostnames.word != 'of') & 
                  (hostnames.word != 'a') & 
                  (hostnames.word != 'A') &
                  (hostnames.word != 'A') &
                  (hostnames.word != 'with') &
                  (hostnames.word != 'from') &
                  (hostnames.word != 'to') &
                  (hostnames.word != 'Your') &
                  (hostnames.word != 'your') &
                  (hostnames.word != 'the')]

In [19]:
#hostnames.sort('occurrence', ascending=False)

In [20]:
#Find similar words
difflib.get_close_matches('Dog', hostnames.word)

['Dog', 'Dogs', 'Doge']

In [None]:
#Review Quartiles 

In [17]:
#25%, 50%, 75% tiers based on reviews
df['tierreview'] = df['review'].map(lambda x: 1 if x<1 else 2 if 1<x<=7 else 3)

In [18]:
df.describe()

Unnamed: 0,fee,ratings,repeat,response_time,review,services,boarding,sitting,daycare,checkups,walking,bathing,training,grooming,pickup/dropoff,tierreview
count,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0,7921.0
mean,37.908471,2.949438,1.309178,1.3848,7.924252,3.351976,0.847873,0.467996,0.631991,0.578084,0.416235,0.18432,0.028532,0.012246,0.184699,1.956445
std,12.94106,2.444614,2.046919,0.688023,18.396807,0.97239,0.359167,0.499006,0.482294,0.493896,0.492965,0.38777,0.166497,0.109989,0.388078,0.876026
min,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,30.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,35.0,5.0,0.0,1.0,1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,45.0,5.0,2.0,2.0,7.0,4.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0
max,450.0,5.0,9.0,3.0,237.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0


In [19]:
df.groupby('tierreview').mean()

Unnamed: 0_level_0,fee,ratings,repeat,response_time,review,services,boarding,sitting,daycare,checkups,walking,bathing,training,grooming,pickup/dropoff
tierreview,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,38.325567,0.0,0.045045,1.264057,0.0,3.4548,0.759242,0.570674,0.593041,0.64461,0.496117,0.183597,0.028891,0.013669,0.164958
2,37.555799,4.960886,1.635667,1.405361,3.71116,3.328775,0.885667,0.445842,0.654814,0.555799,0.380197,0.184354,0.024617,0.0093,0.188184
3,37.665623,4.973556,2.517397,1.506959,19.479471,3.251566,0.923104,0.367084,0.6611,0.517745,0.349687,0.185108,0.030619,0.012526,0.204593


In [None]:
#In-progress: What words used by the 3 tiers of reviews?

In [24]:
tier3 = df[df.tierreview == 3]
tier2 = df[df.tierreview == 2]
tier1 = df[df.tierreview == 1]

In [25]:
namelist3 = tier3['names']
tier3_list = list(itertools.chain(*namelist3))
namelist2 = tier2['names']
tier2_list = list(itertools.chain(*namelist2))
namelist1 = tier1['names']
tier1_list = list(itertools.chain(*namelist1))

In [26]:
d3 = list(Counter(tier3_list).items())
dfnames3 = pd.DataFrame(d3)
dfnames3.columns = ['word', 'occurrence']
#dfnames3.sort('occurrence', ascending=False)

In [27]:
#df = df[df.name != 'Tina']
dfnames3 = dfnames3[(dfnames3.word != 'in') & 
                  (dfnames3.word != '&') & 
                  (dfnames3.word != 'and') & 
                  (dfnames3.word != 'for') & 
                  (dfnames3.word != 'The') & 
                  (dfnames3.word != 'of') & 
                  (dfnames3.word != 'a') & 
                  (dfnames3.word != 'A') &
                  (dfnames3.word != 'A') &
                  (dfnames3.word != 'with') &
                  (dfnames3.word != 'from') &
                  (dfnames3.word != 'to') &
                  (dfnames3.word != 'Your') &
                  (dfnames3.word != 'your') &
                  (dfnames3.word != 'the')]

In [28]:
dfnames3.sort('occurrence', ascending=False).head(10)

Unnamed: 0,word,occurrence
910,Dog,123
55,Home,119
1367,Pet,91
1080,Care,70
1558,Doggie,62
298,Puppy,48
1408,Paws,44
585,Love,44
325,Happy,40
551,Doggy,39


In [29]:
intercept = (dfnames3['occurrence'])/1600
dfnames3['intercept'] = intercept
dfnames3.sort('occurrence', ascending=False)

Unnamed: 0,word,occurrence,intercept
910,Dog,123,0.076875
55,Home,119,0.074375
1367,Pet,91,0.056875
1080,Care,70,0.043750
1558,Doggie,62,0.038750
298,Puppy,48,0.030000
1408,Paws,44,0.027500
585,Love,44,0.027500
325,Happy,40,0.025000
551,Doggy,39,0.024375


In [30]:
d2 = list(Counter(tier2_list).items())
dfnames2 = pd.DataFrame(d2)
dfnames2.columns = ['word', 'occurrence']
#dfnames2.sort('occurrence', ascending=False)

In [31]:
#df = df[df.name != 'Tina']
dfnames2 = dfnames2[(dfnames2.word != 'in') & 
                  (dfnames2.word != '&') & 
                  (dfnames2.word != 'and') & 
                  (dfnames2.word != 'for') & 
                  (dfnames2.word != 'The') & 
                  (dfnames2.word != 'of') & 
                  (dfnames2.word != 'a') & 
                  (dfnames2.word != 'A') &
                  (dfnames2.word != 'A') &
                  (dfnames2.word != 'with') &
                  (dfnames2.word != 'from') &
                  (dfnames2.word != 'Your') &
                  (dfnames2.word != 'your') &
                  (dfnames2.word != 'the')]

In [32]:
dfnames2.sort('occurrence', ascending=False).head(10)

Unnamed: 0,word,occurrence
284,Dog,119
747,Pet,68
150,Home,51
575,Puppy,51
1225,Doggie,46
883,Care,38
1033,Happy,37
644,Paws,35
635,dog,31
536,Love,29


In [33]:
d1 = list(Counter(tier1_list).items())
dfnames1 = pd.DataFrame(d1)
dfnames1.columns = ['word', 'occurrence']

In [34]:
#df = df[df.name != 'Tina']
dfnames1 = dfnames1[(dfnames1.word != 'in') & 
                  (dfnames1.word != '&') & 
                  (dfnames1.word != 'and') & 
                  (dfnames1.word != 'for') & 
                  (dfnames1.word != 'The') & 
                  (dfnames1.word != 'of') & 
                  (dfnames1.word != 'a') & 
                  (dfnames1.word != 'A') &
                  (dfnames1.word != 'A') &
                  (dfnames1.word != 'with') &
                  (dfnames1.word != 'from') &
                  (dfnames1.word != 'to') &
                  (dfnames1.word != 'Your') &
                  (dfnames1.word != 'your') &
                  (dfnames1.word != 'the')]

In [35]:
dfnames1.sort('occurrence', ascending=False).head(10)

Unnamed: 0,word,occurrence
1633,Dog,315
1054,Puppy,191
89,Home,163
2445,Pet,157
956,Doggy,103
1926,Care,97
1034,Love,92
2795,Doggie,82
560,Happy,75
347,dog,75


In [36]:
dfcombine = pd.concat([p1, p2, p3], axis=1, ignore_index=True)
dfcombine.columns = ['word', 'occurrence1', 'word', 'occurrence2', 'word', 'occurrence3']
#dfcombine.sort(['occurrence1', 'occurrence2', 'occurrence3'], ascending=[False, False, False])
dfcombine

NameError: name 'p1' is not defined