# Get rows with only latest transaction

- Use the zillow dataset
- Acquire the data from MySQL, using a Python module to connect and query. You will want to end with a single dataframe. Make sure to include the logerror and all available fields related to the properties. You will end up using all the tables in the database.
- Be sure to do the correct join (inner, outer, etc.). We do not want to eliminate properties purely because they may have a null value for airconditioningtypeid.

- Only include properties with a transaction in 2017, and include only the last transaction for each property (so no duplicate property IDs), along with the zestimate error and the date of the transaction.

- Only include properties that include a latitude and longitude value.

In [3]:
import pandas as pd
from env import get_db_url

In [4]:
# Query text: every property row joined to its transaction rows on parcelid.
# Filters applied in SQL:
#   * the parcel must have both latitude and longitude
#   * the transaction must be dated in 2017
# The year filter is written as a date range rather than LIKE '2017%' so the
# string contains no '%' character that a DB driver could interpret as a
# parameter placeholder.
sql = """
select *
from properties_2017
join predictions_2017 using(parcelid)
where latitude is not null
  and longitude is not null
  and transactiondate >= '2017-01-01'
  and transactiondate < '2018-01-01';
"""

In [5]:
# Connection string for the zillow database, built by the env helper
url = get_db_url('zillow')

# Run the query and load the whole result set into one dataframe
df = pd.read_sql(sql=sql, con=url)
df.head(n=5)

Unnamed: 0,parcelid,id,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate
0,14297519,1727539,,,,3.5,4.0,,,3.5,...,1023282.0,2016.0,537569.0,11013.72,,,60590630000000.0,0,0.025595,2017-01-01
1,17052889,1387261,,,,1.0,2.0,,,1.0,...,464000.0,2016.0,376000.0,5672.48,,,61110010000000.0,1,0.055619,2017-01-01
2,14186244,11677,,,,2.0,3.0,,,2.0,...,564778.0,2016.0,479489.0,6488.3,,,60590220000000.0,2,0.005383,2017-01-01
3,12177905,2288172,,,,3.0,4.0,,8.0,3.0,...,145143.0,2016.0,36225.0,1777.51,,,60373000000000.0,3,-0.10341,2017-01-01
4,10887214,1970746,1.0,,,3.0,3.0,,8.0,3.0,...,119407.0,2016.0,45726.0,1533.89,,,60371240000000.0,4,0.00694,2017-01-01


In [24]:
# Number of rows whose parcelid already appeared in an earlier row
# (repeat occurrences, so an id seen three times contributes two)
df['parcelid'].duplicated().sum()

199

In [77]:
# Row count per parcelid, keeping only the parcelids that occur more than once.
# The counts are computed a single time and filtered through a callable
# indexer, instead of running the same groupby/count twice.
df.groupby('parcelid').parcelid.count().loc[lambda n_rows: n_rows > 1]

parcelid
10722858     2
10732347     2
10739478     2
10744507     2
10753427     2
            ..
17251843     2
17280166     2
17282392     2
17295416     2
162960529    2
Name: parcelid, Length: 195, dtype: int64

In [25]:
# How many rows each parcelid has in df, most frequent first
(
    df.groupby('parcelid')['parcelid']
    .count()
    .sort_values(ascending=False)
)

parcelid
10857130     3
11991059     3
12612211     3
13083743     3
11739891     2
            ..
11876038     1
11876013     1
11875917     1
11875901     1
167689317    1
Name: parcelid, Length: 77381, dtype: int64

In [41]:
# parcelid of every repeat row (second and later occurrences of an id).
# The series itself is not de-duplicated: an id that occurs three times in df
# is listed twice here.
duplicate_parcelids = df.loc[df['parcelid'].duplicated(), 'parcelid']

In [43]:
# Every row (first occurrence included) whose parcelid occurs more than once
df.loc[df['parcelid'].isin(duplicate_parcelids)]

Unnamed: 0,parcelid,id,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate
116,11393337,2463969,,,,3.0,3.0,,4.0,3.0,...,43439.0,2016.0,22755.0,756.94,Y,14.0,6.037235e+13,116,0.015172,2017-01-03
117,11393337,2463969,,,,3.0,3.0,,4.0,3.0,...,43439.0,2016.0,22755.0,756.94,Y,14.0,6.037235e+13,117,0.086137,2017-06-08
623,14634203,2026522,1.0,,,2.0,3.0,,,2.0,...,289445.0,2016.0,213516.0,3490.16,,,6.059001e+13,624,-0.019491,2017-01-04
624,14634203,2026522,1.0,,,2.0,3.0,,,2.0,...,289445.0,2016.0,213516.0,3490.16,,,6.059001e+13,625,-0.061973,2017-08-04
1016,11721753,616260,,,,2.0,3.0,,6.0,2.0,...,205123.0,2016.0,163175.0,2627.48,,,6.037220e+13,1017,-0.011052,2017-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64252,17282392,2938730,,,,2.0,3.0,,,2.0,...,498684.0,2016.0,249342.0,5258.54,,,6.111007e+13,64277,-0.002968,2017-08-07
64253,17282392,2938730,,,,2.0,3.0,,,2.0,...,498684.0,2016.0,249342.0,5258.54,,,6.111007e+13,64278,0.901074,2017-08-25
64917,10984080,2876815,1.0,,,3.0,3.0,,8.0,3.0,...,354000.0,2016.0,141000.0,4347.32,Y,15.0,6.037104e+13,64943,0.001824,2017-08-09
64918,10984080,2876815,1.0,,,3.0,3.0,,8.0,3.0,...,354000.0,2016.0,141000.0,4347.32,Y,15.0,6.037104e+13,64944,0.860596,2017-09-12


In [83]:
# Keep the rows for repeated parcelids in their own frame for the next step
x = df.loc[df['parcelid'].isin(duplicate_parcelids)]
x

Unnamed: 0,parcelid,id,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate
116,11393337,2463969,,,,3.0,3.0,,4.0,3.0,...,43439.0,2016.0,22755.0,756.94,Y,14.0,6.037235e+13,116,0.015172,2017-01-03
117,11393337,2463969,,,,3.0,3.0,,4.0,3.0,...,43439.0,2016.0,22755.0,756.94,Y,14.0,6.037235e+13,117,0.086137,2017-06-08
623,14634203,2026522,1.0,,,2.0,3.0,,,2.0,...,289445.0,2016.0,213516.0,3490.16,,,6.059001e+13,624,-0.019491,2017-01-04
624,14634203,2026522,1.0,,,2.0,3.0,,,2.0,...,289445.0,2016.0,213516.0,3490.16,,,6.059001e+13,625,-0.061973,2017-08-04
1016,11721753,616260,,,,2.0,3.0,,6.0,2.0,...,205123.0,2016.0,163175.0,2627.48,,,6.037220e+13,1017,-0.011052,2017-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64252,17282392,2938730,,,,2.0,3.0,,,2.0,...,498684.0,2016.0,249342.0,5258.54,,,6.111007e+13,64277,-0.002968,2017-08-07
64253,17282392,2938730,,,,2.0,3.0,,,2.0,...,498684.0,2016.0,249342.0,5258.54,,,6.111007e+13,64278,0.901074,2017-08-25
64917,10984080,2876815,1.0,,,3.0,3.0,,8.0,3.0,...,354000.0,2016.0,141000.0,4347.32,Y,15.0,6.037104e+13,64943,0.001824,2017-08-09
64918,10984080,2876815,1.0,,,3.0,3.0,,8.0,3.0,...,354000.0,2016.0,141000.0,4347.32,Y,15.0,6.037104e+13,64944,0.860596,2017-09-12


In [84]:
# Latest transactiondate for each repeated parcelid, as a one-column
# dataframe indexed by parcelid
y = x.groupby("parcelid")["transactiondate"].max().to_frame()
y

Unnamed: 0_level_0,transactiondate
parcelid,Unnamed: 1_level_1
10722858,2017-07-28
10732347,2017-07-25
10739478,2017-03-31
10744507,2017-08-31
10753427,2017-03-17
...,...
17251843,2017-06-22
17280166,2017-06-15
17282392,2017-08-25
17295416,2017-05-16
