In [1]:
import pandas as pd
import csv
import mysql.connector
from shapely.geometry import Point, Polygon
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

### Property Valuation and Assessment Data:

#### Explain the meaning of each feature

- BBLE: It is a New York City real estate identification number.
- BORO: 1: Manhattan
- Block: Block number, representing the integer value of the block in which the property is located.
- LOT (Lot) : Lot number, indicating the integer value of the lot in which the property is located.
- EASEMENT: Land use right, which denotes the use right or restriction of land ownership.
- OWNER:Building owner.
- BLDGCL: Represents the class or building use code of a building
- TAXCLASS: Tax classification code, a string code used to identify the tax classification of the real estate.
- LTFRONT: The width of the lot in front of the real estate, expressed as an integer value in feet.
- LTDEPTH: The depth of a real estate plot, expressed as an integer value in feet.
- EXT: Extended information, which may be additional descriptions or features related to buildings or land.
- STORIES: The number of floors of the building
- FULLVAL: The full value of real estate, expressed as an integer value in US dollars.
- AVLAND: Land value, expressed as an integer value in US dollars.
- AVTOT: Total value, expressed as an integer value in US dollars.
- EXALND: Tax free land value, expressed as an integer value in US dollars.
- EXTOT: Total tax exemption value, expressed as an integer value in US dollars.
- EXCD1: Tax exemption class code (first assessment), a string code used to identify the tax exemption class.
- STADDR: Street address, a string representing the specific street address of the real estate.
- POSTCODE: Postal code denotes the postal code of the location of the real estate
- EXMPTCL: This is the tax exemption classification code used to identify the tax exemption category to which the property belongs.
- BLDFRONT: Width of the front of the building, expressed as an integer value in feet.
- BLDDEPTH: The depth of a building, expressed as an integer value in feet.
- AVLAND2: The total value of the second assessment, expressed as a floating point value in US dollars.
- AVTOT2: The total value of the second assessment, expressed as a floating point value in US dollars.
- EXLAND2: Tax land value (second assessment), expressed as a floating point value in US dollars.
- EXTOT2: Total duty-free value (second assessment), expressed as floating point value in US dollars.
- EXCD2 : Tax exemption class code (second assessment), a string code used to identify the tax exemption class.
- PERIOD : The time period in which the data was recorded, a string representing the time period in which the data was recorded.
- YEAR: Year of the data record, a string indicating the year of the record.
- VALTYPE: The value type of the data record, a string indicating the value type of the record.
- Borough: The administrative division in which the immovable property is located, a string indicating the administrative division.
- Latitude: The latitude of the real estate represents the latitude value expressed as a floating point value.
- Longitude: The longitude of real property, which represents the longitude value expressed as a floating point value.
- Community Board: Community board, denoting the administrative division unit of the district
- Council District: City Council District, which represents the division of the district in the city Council.
- Census Tract: Census area, which represents the division of the area in the census.
- BIN: The real property identification number, similar to the BBLE column, is used to uniquely identify each real property as a floating point value
- NTA: Community ID, a string indicating the community.
- New Georeferenced Column: New georeferenced column, a string column representing georeferenced information.

In [2]:
df = pd.read_csv('dataSource/Property_Valuation_and_Assessment_Data.csv')

In [3]:
df.head(5)

Unnamed: 0,BBLE,BORO,BLOCK,LOT,EASEMENT,OWNER,BLDGCL,TAXCLASS,LTFRONT,LTDEPTH,...,VALTYPE,Borough,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,NTA,New Georeferenced Column
0,1000163859,1,16,3859,,"CHEN, QI TOM",R4,2,0,0,...,AC-TR,,,,,,,,,
1,1000730028,1,73,28,,NYC DSBS,V1,4,183,52,...,AC-TR,,,,,,,,,
2,1000730029,1,73,29,,NYC DSBS,Y7,4,90,500,...,AC-TR,,,,,,,,,
3,1000297504,1,29,7504,,,R0,2,36,73,...,AC-TR,,,,,,,,,
4,1000360012,1,36,12,,NYC DSBS,Y7,4,534,604,...,AC-TR,,,,,,,,,


- Check how many rows and columns dataset has.

In [4]:
df.shape

(9845857, 40)

- Check if there are duplicate rows or columns

In [5]:
#remove whitespace in or around feature names
df.columns = df.columns.str.replace(' ', '')

#check to ensure whitespaces have been removed
df.columns

Index(['BBLE', 'BORO', 'BLOCK', 'LOT', 'EASEMENT', 'OWNER', 'BLDGCL',
       'TAXCLASS', 'LTFRONT', 'LTDEPTH', 'EXT', 'STORIES', 'FULLVAL', 'AVLAND',
       'AVTOT', 'EXLAND', 'EXTOT', 'EXCD1', 'STADDR', 'POSTCODE', 'EXMPTCL',
       'BLDFRONT', 'BLDDEPTH', 'AVLAND2', 'AVTOT2', 'EXLAND2', 'EXTOT2',
       'EXCD2', 'PERIOD', 'YEAR', 'VALTYPE', 'Borough', 'Latitude',
       'Longitude', 'CommunityBoard', 'CouncilDistrict', 'CensusTract', 'BIN',
       'NTA', 'NewGeoreferencedColumn'],
      dtype='object')

In [6]:
#check for duplicate rows

#Print the number of duplicates, without the original rows that were duplicated
print('Number of duplicate (excluding first) rows in the table is: ', df.duplicated().sum())

# Use "keep=False" to mark all duplicates as true, including the original rows that were duplicated.
print('Number of duplicate rows (including first) in the table is:', df[df.duplicated(keep=False)].shape[0])

Number of duplicate (excluding first) rows in the table is:  0
Number of duplicate rows (including first) in the table is: 0


- Check if there is a constant column

In [7]:
#Check the data of category type to see if there is a constant column
df_columns = df.columns
features_card = list(df[df_columns].columns.values)

print('{0:35}  {1}'.format("Feature", "Unique Values"))
print('{0:35}  {1}'.format("-------", "--------------- \n"))

for c in df_columns:
    print('{0:35}  {1}'.format(c, str(len(df[c].unique()))))

Feature                              Unique Values
-------                              --------------- 

BBLE                                 1128885
BORO                                 5
BLOCK                                13985
LOT                                  6548
EASEMENT                             15
OWNER                                1470317
BLDGCL                               218
TAXCLASS                             11
LTFRONT                              1328
LTDEPTH                              1391
EXT                                  4
STORIES                              129
FULLVAL                              579432
AVLAND                               171876
AVTOT                                395031
EXLAND                               83255
EXTOT                                243174
EXCD1                                146
STADDR                               861582
POSTCODE                             239
EXMPTCL                              15
BLDFRONT  

The above result shows that PERIOD, VALTYPE are constant columns, so delete these two columns

In [8]:
columns_to_drop = ["PERIOD", "VALTYPE"]
df = df.drop(columns_to_drop, axis=1)

- Check %Missing column and %null column

In [9]:
# Prepare %Missing column and %null column
categorical_missing = {'Feature':[], 'Missing%':[], 'Null%':[], 'Total%':[], '0%':[]}
for column in df.columns:
    categorical_missing['Feature'].append(column)
    categorical_missing['Missing%'].append(100*sum(df[column]=='Missing')/df.shape[0])
    categorical_missing['Null%'].append(100*(df[column].isnull().sum())/df.shape[0])
    categorical_missing['Total%'].append((100*sum(df[column]=='Missing')/df.shape[0])+(100*(df[column].isnull().sum())/df.shape[0]))
    categorical_missing['0%'].append(100 * len(df[df[column] == 0]) / df.shape[0])
pd.DataFrame(categorical_missing)

Unnamed: 0,Feature,Missing%,Null%,Total%,0%
0,BBLE,0.0,0.0,0.0,0.0
1,BORO,0.0,0.0,0.0,0.0
2,BLOCK,0.0,0.0,0.0,0.0
3,LOT,0.0,0.0,0.0,0.0
4,EASEMENT,0.0,99.580453,99.580453,0.0
5,OWNER,0.0,2.190576,2.190576,0.0
6,BLDGCL,0.0,0.0,0.0,0.0
7,TAXCLASS,0.0,0.0,0.0,0.0
8,LTFRONT,0.0,0.0,0.0,16.022221
9,LTDEPTH,0.0,0.0,0.0,16.84412


###### BORO

In [11]:
df['BORO'].unique()

array([1, 2, 3, 4, 5])

- BORO is a field indicating the administrative division of real estate. It is commonly used to identify boroughs in New York City.
- 1, Manhattan; 2, Brooklyn; 3, Queens; 4, Bronx; 5, Staten Island.
- Since this project focuses on Manhattan, the data with BORO value 1 is filtered out.

In [12]:
df = df[df['BORO'] == 1]

In [13]:
df.drop(labels=['BORO'],axis=1,inplace=True)

###### EASEMENT

In [14]:
df['EASEMENT'].unique()

array([nan, 'E', 'G', 'F', 'A', 'H', 'I', 'N', 'K'], dtype=object)

- In the EASEMENT column, these values indicate the easement status of the property. An easement is a specific right or restriction on the use of land without ownership. These values represent the various easement types that may exist in the area where the property is located. NaN indicates missing values, that is, no easement information is available.
- Considering that over 99.5 % of buildings have no easement, this attribute has almost zero impact on the analysis, so this should be removed.

In [15]:
df.drop(labels=['EASEMENT'],axis=1,inplace=True)

###### OWNER

- The owner will not affect the value of the building, and in order to ensure personal privacy，this should be removed.

In [16]:
df.drop(labels=['OWNER'],axis=1,inplace=True)

###### EXMPTCL

In [18]:
df['EXMPTCL'].unique()

array([nan, 'X1', 'X4', 'X8', 'X6', 'X5', 'X2', 'VI', 'X7', 'X3', 'X9',
       'KI'], dtype=object)

- In the EASEMENT column, these values indicate the tax exemption category to which the property belongs. Nan values are guessed as no tax-exempt status.
- Since over 98.5% of the values are missing, the impact of this attribute on the analysis is almost zero, so this should be removed.

In [19]:
df.drop(labels=['EXMPTCL'],axis=1,inplace=True)

###### AVLAND, AVLAND2 and AVTOT, AVTOT2

- Both are the first valuation and the second valuation. It is speculated that the second valuation is vacant because the second valuation is the same as the first valuation. Therefore, fill the vacant values of the second valuation with the first valuation.

In [20]:
df.loc[df['AVLAND2'].isnull(), 'AVLAND2'] = df['AVLAND']
df.loc[df['AVTOT2'].isnull(), 'AVTOT2'] = df['AVTOT']

- If AVLAND, AVLAND2, AVTOT, AVTOT2 and FULLVAL are all empty, the house value cannot be determined. So  this should be removed.

In [21]:
columns = ['FULLVAL', 'AVLAND', 'AVTOT', 'AVLAND2', 'AVTOT2']
df = df[~(df[columns] == 0).all(axis=1)]

###### EXLAND, EXLAND2 and EXTOT, EXTOT2

- Same reason as above

In [22]:
df.loc[df['EXLAND2'].isnull(), 'EXLAND2'] = df['EXLAND']
df.loc[df['EXTOT2'].isnull(), 'EXTOT2'] = df['EXTOT']

###### Borough

In [23]:
df['Borough'].unique()

array([nan, 'MANHATTAN'], dtype=object)

- The administrative division in which the immovable property is located, a string indicating the administrative division.
- We have filtered out the real estate belonging to Manhattan by BORO, and the real estate data we know to be retained belongs to Manhattan. At the same time, we know that Borough has only two values: MANHATTAN and nan, so that column is useless for us. This should be removed.

In [24]:
df.drop(labels=['Borough'],axis=1,inplace=True)

###### NTA
- Since the addresses are associated, the "BLOCK" and "NTA" columns of all data are taken out to remove duplicate values and observe whether there is a corresponding relationship between the two columns

In [25]:
unique_combinations = df[['BLOCK', 'NTA']].drop_duplicates()
unique_combinations.to_csv('programing data/unique_combinations.csv', index=False)

In [26]:
pd.DataFrame(unique_combinations)

Unnamed: 0,BLOCK,NTA
0,16,
1,73,
4,36,
11,209,
18,274,
...,...,...
8911181,2038,
8912822,2031,
8916755,2028,
8918224,2025,


-  It is found that there are three cases, the BLOCK number corresponds to one NTA value, the BLOCK number corresponds to two values: null value and NTA name, and the BLOCK corresponds to three values: null value, NTA name 1 and NTA name 2.

In [27]:
block_counts = unique_combinations['BLOCK'].value_counts()
block_once = block_counts[block_counts == 1].index
block_twice = block_counts[block_counts == 2].index
block_thrice = block_counts[block_counts == 3].index

In [28]:
print("The total number of BLOCK values that appear once：", len(block_once))
print("The total number of BLOCK values that appear twice：", len(block_twice))
print("The total number of BLOCK values that appear Three：", len(block_thrice))

The total number of BLOCK values that appear once： 1367
The total number of BLOCK values that appear twice： 553
The total number of BLOCK values that appear Three： 34


- To fill the missing NTA values with another NTA value corresponding to the same BLOCK value when NTA is empty.

In [29]:
block_nta_mapping = df.groupby('BLOCK')['NTA'].first().to_dict()
df['NTA'] = df['NTA'].fillna(df['BLOCK'].map(block_nta_mapping))

- Re-save and observe the data group of [BLOCK-NTA]

In [30]:
unique_combinations = df[['BLOCK', 'NTA']].drop_duplicates()
unique_combinations.to_csv('programing data/unique_combinations.csv')

- The remaining vacancy values are filled by the NTA corresponding to the BLOCK with the closest value

In [31]:
df = df.sort_values('BLOCK', ascending=False)
df['NTA'] = df['NTA'].fillna(method='bfill')

- The LocationID is obtained according to the NTA

In [32]:
taxi_zone = pd.read_csv('programing data/taxi_zones.csv', usecols=['zone','LocationID'])
merged_df = pd.merge(df, taxi_zone, left_on='NTA', right_on = 'zone', how='inner')

In [33]:
merged_df['LocationID'].unique()

array([243, 244, 116, 152, 166,  74,  75, 148, 107, 249,  79,  45])

- There are 69 neighborhoods in Manhattan, and the data has been processed to show only 12 of them. The data cannot be used,  so this should be removed.

In [34]:
df.drop(labels=['NTA'],axis=1,inplace=True)

###### Latitude and Longitude

- The latitude and longitude cannot be filled with other data.
- Query for data where the latitude and longitude and New Georeferenced Column are empty.

In [35]:
df.shape

(1352026, 32)

In [36]:
filtered_data = df[df['Longitude'].isnull() & df['Latitude'].isnull()& df['NewGeoreferencedColumn'].isnull()]

In [37]:
filtered_data.shape

(10967, 32)

In [38]:
df[df['BBLE']=='1016440042']

Unnamed: 0,BBLE,BLOCK,LOT,BLDGCL,TAXCLASS,LTFRONT,LTDEPTH,EXT,STORIES,FULLVAL,...,EXTOT2,EXCD2,YEAR,Latitude,Longitude,CommunityBoard,CouncilDistrict,CensusTract,BIN,NewGeoreferencedColumn
4572147,1016440042,1644,42,V1,4,25,72,,,323000,...,0.0,,2014/15,40.798896,-73.940188,111.0,8.0,182.0,1000000.0,POINT (-73.940188 40.798896)
6736323,1016440042,1644,42,V1,4,25,72,,,301000,...,0.0,,2012/13,,,,,,,
3473558,1016440042,1644,42,V1,4,25,72,,,369000,...,0.0,,2015/16,40.798896,-73.940188,111.0,8.0,182.0,1000000.0,POINT (-73.940188 40.798896)
7835294,1016440042,1644,42,V1,4,25,72,,,296200,...,0.0,,2010/11,,,,,,,
5647665,1016440042,1644,42,V1,4,25,72,,,311217,...,0.0,,2013/14,,,,,,,
2394511,1016440042,1644,42,V1,4,25,72,,,389000,...,0.0,,2016/17,40.798896,-73.940188,111.0,8.0,182.0,1000000.0,POINT (-73.940188 40.798896)
171194,1016440042,1644,42,V1,4,25,72,,,452000,...,0.0,,2018/19,40.798896,-73.940188,111.0,8.0,182.0,1000000.0,POINT (-73.940188 40.798896)
1284215,1016440042,1644,42,V1,4,25,72,,,416000,...,0.0,,2017/18,40.798896,-73.940188,111.0,8.0,182.0,1000000.0,POINT (-73.940188 40.798896)
8896625,1016440042,1644,42,V1,4,25,72,,,296000,...,0.0,,2011/12,,,,,,,


- The data lacks latitude and longitude data. However, some data of the same building have latitude and longitude data and some do not. The same BBLE is used to group data and fill the missing latitude and longitude data.

In [39]:
grouped = df.groupby('BBLE')
for name, group in grouped:
    if group['Latitude'].nunique(dropna = False) != 1:
        valid_lat = group['Latitude'].dropna().drop_duplicates()
        valid_lon = group['Longitude'].dropna().drop_duplicates()
        df.loc[group.index, 'Latitude'] = group['Latitude'].fillna(valid_lat.values[0])
        df.loc[group.index, 'Longitude'] = group['Longitude'].fillna(valid_lon.values[0])

- Using latitude and longitude to determine what zone the real estate belongs to

In [40]:
# Loading the csv file of the zone data
region_data = pd.read_csv('programing data/taxi_zones.csv')
# creating a list of polygon objects for the area
region_polygons = []
region_ids = []  # area id list

for index, row in region_data.iloc[:].iterrows():
    geom_value = row['the_geom']
    cleaned_value = geom_value.lstrip('MULTIPOLYGON ')
    coordinates = cleaned_value.replace('(', '').replace(')', '').split(',')
    coordinates = [tuple(map(float, coord.strip().split())) for coord in coordinates]
    polygon = Polygon(coordinates)
    region_polygons.append(polygon)
    region_ids.append(row['LocationID'])

df['LocationID'] = None  # Creatinga new column and initialize it to None

for index, row in df.iterrows():
    if row['Latitude'] != "" and row['Longitude'] != "":
        house_latitude = row['Latitude']
        house_longitude = row['Longitude']
        house_point = Point(house_longitude, house_latitude)
    
    
        # Checking if the property is in either zone
        for i, polygon in enumerate(region_polygons):
            if house_point.within(polygon):
                df.at[index, 'LocationID'] = region_ids[i]
                break




- See if the building LocationID can be inferred from the same address.

In [41]:
df.loc[df['STADDR'] == '1 AVENUE', "LocationID"].unique()

array([None, 75, 233, 224], dtype=object)

- It is observed that the same address may belong to 2-3 different neighborhoods, and it is impossible to predicted the neighborhood by the address.
- Remove data without latitude and longitude values

In [42]:
df = df[~((df["Latitude"].isna() & df["Longitude"].isna() ))]

###### YEAR

In [43]:
df['YEAR'].unique()

array(['2010/11', '2011/12', '2017/18', '2016/17', '2018/19', '2013/14',
       '2014/15', '2015/16', '2012/13'], dtype=object)

- Save the latest year as dates.

In [44]:
df['YEAR'] = df['YEAR'].str[:4].astype(int) + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['YEAR'] = df['YEAR'].str[:4].astype(int) + 1


- Check %Missing column and %null column

In [45]:
# Prepare %Missing column and %null column
categorical_missing = {'Feature':[], 'Missing%':[], 'Null%':[], 'Total%':[], '0%':[]}
for column in df.columns:
    categorical_missing['Feature'].append(column)
    categorical_missing['Missing%'].append(100*sum(df[column]=='Missing')/df.shape[0])
    categorical_missing['Null%'].append(100*(df[column].isnull().sum())/df.shape[0])
    categorical_missing['Total%'].append((100*sum(df[column]=='Missing')/df.shape[0])+(100*(df[column].isnull().sum())/df.shape[0]))
    categorical_missing['0%'].append(100 * len(df[df[column] == 0]) /df.shape[0])
pd.DataFrame(categorical_missing)

Unnamed: 0,Feature,Missing%,Null%,Total%,0%
0,BBLE,0.0,0.0,0.0,0.0
1,BLOCK,0.0,0.0,0.0,0.0
2,LOT,0.0,0.0,0.0,0.0
3,BLDGCL,0.0,0.0,0.0,0.0
4,TAXCLASS,0.0,0.0,0.0,0.0
5,LTFRONT,0.0,0.0,0.0,57.548422
6,LTDEPTH,0.0,0.0,0.0,57.979131
7,EXT,0.0,93.203289,93.203289,0.0
8,STORIES,0.0,3.02665,3.02665,0.0
9,FULLVAL,0.0,0.0,0.0,0.0


- The null LocationId indicates that the property is not located in the Manhattan area. ,so they should be removed.

In [46]:
df = df[~((df["LocationID"].isna()))]

### Selecting features：
- Look for rows and columns. Consider whether it makes sense to keep them or drop them.


Feature Selection Summary:

- Real Estate:
| Feature                                |    Data Plan                                         |
|----------------------------------------|--------------------------------------|
|     BBLE               |   Inherit BBLE from the original dataset                                          |     
|     BLDGCL             |   Inherit BLDGCL from the original dataset                                  |    
|     TAXCLASS           |   Inherit TAXCLASS from the original dataset                                  |    
|     EXT                |   Inherit EXT from the original dataset                                          |    
|     STORIES            |   Inherit STORIES from the original dataset                                  |    
|     FULLVAL            |   Inherit FULLVAL from the original dataset                                  |     
|     AVLAND             |   Average of AVLAND and AVLAND2 from the original dataset                         |    
|     AVTOT	             |   Average of AVTOT and AVTOT2 from the original dataset                         |    
|     EXLAND             |   Average of EXLAND and EXLAND2 from the original dataset                         |    
|     EXTOT              |   Average of EXTOT and EXTOT2 from the original dataset                         |        
|     YEAR               |   Inherit YEAR from the original dataset, slice the first 4 characters    |     
|     LocationID         |   Inherit LocationID from the original dataset                                  |     


In [47]:
new_df = df[['BBLE', 'BLDGCL', 'TAXCLASS', 'EXT', 'STORIES', 'FULLVAL', 'LocationID']]
new_df['AVLAND'] = (df['AVLAND'] + df['AVLAND2']) / 2
new_df['AVTOT'] = (df['AVTOT'] + df['AVTOT2']) / 2
new_df['EXLAND'] = (df['EXLAND'] + df['EXLAND2']) / 2
new_df['EXTOT'] = (df['EXTOT'] + df['EXTOT2']) / 2
new_df['YEAR'] = df['YEAR']
new_df.to_csv('programing data/Real_Estate.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['AVLAND'] = (df['AVLAND'] + df['AVLAND2']) / 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['AVTOT'] = (df['AVTOT'] + df['AVTOT2']) / 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['EXLAND'] = (df['EXLAND'] + df['EXLAND2']) / 2
A value is trying to be set on a copy of a sli

- NTA real estate
| Feature                                |    Data Plan                                         |
|----------------------------------------|--------------------------------------| 
|     BLDGCL             |   Class of building                                                              |          
|     AMOUNT             |   The total number of buildings                                                  |    
|     FULLVAL            |   The sum of the 'FULLVAL' of all the buildings in the area                    |     
|     AVLAND             |   The sum of the 'AVLAND' of all the buildings in the area                    |    
|     AVTOT	             |   The sum of the 'AVTOT' of all the buildings in the area                    |    
|     EXLAND             |   The sum of the 'EXLAND' of all the buildings in the area                    |    
|     EXTOT              |   The sum of the 'EXTOT' of all the buildings in the area                    |        
|     YEAR               |   Inherit YEAR from the original dataset                                          |     
|     LocationID         |   The NTA where the property is located    

In [48]:
df = new_df
new_df = None

In [49]:
df.head(5)

Unnamed: 0,BBLE,BLDGCL,TAXCLASS,EXT,STORIES,FULLVAL,LocationID,AVLAND,AVTOT,EXLAND,EXTOT,YEAR
7838296,1022552000,Q1,4,,1.0,114000000,128,50085000.0,50310000.0,50085000.0,50310000.0,2011
8922015,1022550001,Q1,4,,1.0,16506000,128,6861870.0,7134750.0,6861870.0,7134750.0,2012
1298437,1022552000,Q1,4,,1.0,131813000,128,51286500.0,57522801.5,51286500.0,57522801.5,2018
2403683,1022552000,Q1,4,,1.0,127974000,128,51286500.0,55882507.0,51286500.0,55882507.0,2017
2418271,1022550001,Q1,4,,1.0,18439000,128,6973290.0,8051805.0,6973290.0,8051805.0,2017


In [50]:
new_df = df.groupby(['YEAR', 'BLDGCL', 'LocationID']).agg({
    'BBLE': 'count',  # Total number of statistics
    'AVLAND': 'sum',  
    'FULLVAL': 'sum',  
    'AVTOT': 'sum',
    'EXLAND':'sum',
    'EXTOT':'sum'  
}).reset_index()
new_df.columns = ['YEAR', 'BLDGCL', 'LocationID', 'AMOUNT', 'AVLAND', 'FULLVAL', 'AVTOT','EXLAND','EXTOT']
new_df.to_csv('programing data/NTA_Real_Estate.csv', index=False)