In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set_theme()

# Fixed seed value; not used in the cells shown here -- presumably consumed by later modelling steps.
rand = 9999

In [None]:
# Read the raw NYC rolling-sales export and preview the first rows.
sales_csv_path = '../input/nyc-property-sales/nyc-rolling-sales.csv'
df = pd.read_csv(sales_csv_path)
df.head()

In [None]:
# 'Unnamed: 0' is described as a plain running counter with no analytical value, so discard it

df = df.drop(columns='Unnamed: 0')

In [None]:
# Number of rows that are exact copies of an earlier row
int(df.duplicated().sum())

In [None]:
# Compare rows across every column (the default subset) and keep the last occurrence of each duplicate group
df = df.drop_duplicates(keep='last')

In [None]:
# the dataset appears to use '-' where a 0 is meant; swap every cell that is exactly '-' for 0

df = df.replace('-', 0)

In [None]:
# cells that hold only a single space are blanks; mark them as missing values

df = df.replace(' ', np.nan)

In [None]:
# Summarise column dtypes and non-null counts after the '-' and blank replacements above
df.info()

In [None]:
# 'EASE-MENT' carries no values (see the info summary), so remove the column

df = df.drop(columns='EASE-MENT')

In [None]:
# Print each column name next to its number of distinct (non-null) values
for column_name, distinct_count in df.nunique().items():
    print(column_name, distinct_count)

In [None]:
# Frequency tables for the borough and tax-class columns
code_columns = ('BOROUGH', 'TAX CLASS AT PRESENT', 'TAX CLASS AT TIME OF SALE')
for code_column in code_columns:
    frequency_table = df[code_column].value_counts()
    print(frequency_table)

In [None]:
# Columns to be treated as categorical labels rather than as quantities
categoricals = [
    'BOROUGH',
    'NEIGHBORHOOD',
    'BUILDING CLASS CATEGORY',
    'TAX CLASS AT PRESENT',
    'ZIP CODE',
    'YEAR BUILT',
    'TAX CLASS AT TIME OF SALE',
    'BUILDING CLASS AT TIME OF SALE',
    'BUILDING CLASS AT PRESENT',
]

# Cast all of them to the pandas 'category' dtype in a single pass
df = df.astype({name: 'category' for name in categoricals})

In [None]:
# Columns that should hold numbers; anything unparseable is coerced to NaN
floats = ['SALE PRICE', 'LAND SQUARE FEET', 'GROSS SQUARE FEET']

numeric_versions = {name: pd.to_numeric(df[name], errors='coerce') for name in floats}
df = df.assign(**numeric_versions)

In [None]:
# Re-check dtypes and non-null counts now that the category and numeric casts have been applied
df.info()

In [None]:
# 'ADDRESS' and 'APARTMENT NUMBER' are not used in this analysis, so remove both

unused_columns = ['ADDRESS', 'APARTMENT NUMBER']
df = df.drop(columns=unused_columns)

In [None]:
# Break 'SALE DATE' into categorical month and year columns, then discard the raw date

sale_dates = pd.to_datetime(df['SALE DATE'], errors='coerce')  # unparseable dates become NaT
df['SALE MONTH'] = sale_dates.dt.month.astype("category")
df['SALE YEAR'] = sale_dates.dt.year.astype("category")
df = df.drop(columns='SALE DATE')

In [None]:
# Fraction of missing values per column, keeping only the columns that have any
null_pct = df.isna().sum().div(len(df))
null_pct = null_pct.loc[null_pct > 0]
null_pct

In [None]:
# Map of missing cells: one heatmap row per record, with null cells in the contrasting colour
fig, ax = plt.subplots(figsize=(10, 10))
missing_mask = df.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis', ax=ax)
plt.show()

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# Correlation is only defined for numeric data, and pandas 2.0+ raises on
# non-numeric columns instead of silently skipping them. The frame holds
# category columns at this point, so select the numeric dtypes explicitly.
numeric_columns = df.select_dtypes(include='number')

plt.figure(figsize=(10,10))
sns.heatmap(numeric_columns.corr(), annot=True)
plt.show()

In [None]:
# Distribution of the raw sale price.
# displot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind.
# The 12x5 inch size is requested through height/aspect instead.
sns.displot(df['SALE PRICE'], bins=40, rug=True, height=5, aspect=12 / 5)
plt.show()

There is a very large number of tiny ($0) sale prices here; these observations need to be removed because they are not useful for analysis.

In [None]:
# Keep only sales priced strictly between $10,000 and $10,000,000.
# .copy() makes df2 an independent frame, so adding columns to it later does
# not trigger SettingWithCopyWarning or touch df.
price_in_range = (df['SALE PRICE'] > 10000) & (df['SALE PRICE'] < 10000000)
df2 = df[price_in_range].copy()

# displot builds its own figure, so the 12x5 inch size is set through
# height/aspect rather than a separate (otherwise empty) plt.figure call.
sns.displot(df2['SALE PRICE'], bins=40, rug=True, height=5, aspect=12 / 5)
plt.show()

This is better, but the distribution is still right-skewed rather than normal. Applying a log transformation will bring our target variable closer to a normal distribution and assist our future models.

In [None]:
# Add the natural log of the sale price as the modelling target.
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# by setting a column directly on a filtered slice of df.
df2 = df2.assign(LOG_PRICE=np.log(df2['SALE PRICE']))
sns.displot(df2['LOG_PRICE'], bins=100)

In [None]:
# Skewness of the log-transformed price
log_price_skew = df2['LOG_PRICE'].skew()
print(log_price_skew)

Now our target variable is much closer to a normal distribution.

Let's take a look at the independent features...

In [None]:
# Histogram of gross square footage before any outlier filtering
gross_sqft = df2['GROSS SQUARE FEET']
sns.displot(gross_sqft, bins=80)

In [None]:
# Histogram of land square footage before any outlier filtering
land_sqft = df2['LAND SQUARE FEET']
sns.displot(land_sqft, bins=100)

We are seeing a similar issue with these two independent features: there are many extreme observations, which can be removed to bring the distributions closer to normal.

In [None]:
# Keep rows where both area measurements are below 8000 square feet
# (rows with a missing area fail the comparison and are dropped as well)
gross_below_cap = df2['GROSS SQUARE FEET'].lt(8000)
land_below_cap = df2['LAND SQUARE FEET'].lt(8000)
df3 = df2[gross_below_cap & land_below_cap]

In [None]:
# Gross square footage after the 8000 sq ft cap
gross_sqft_capped = df3['GROSS SQUARE FEET']
sns.displot(gross_sqft_capped, bins=100)

In [None]:
# Land square footage after the 8000 sq ft cap
land_sqft_capped = df3['LAND SQUARE FEET']
sns.displot(land_sqft_capped, bins=100)

Next we look at 'YEAR BUILT'

In [None]:
# How often each construction year occurs
year_built_counts = df3['YEAR BUILT'].value_counts()
year_built_counts

Values of 'YEAR BUILT' that are 0 do not make sense and should be removed

In [None]:
# Discard records whose construction year is recorded as 0
has_real_year = df3['YEAR BUILT'] != 0
df4 = df3[has_real_year]

In [None]:
# Distribution of construction years.
# sns.distplot is deprecated; use displot like the rest of the notebook.
# kde=True and stat='density' reproduce distplot's default density histogram
# with a KDE overlay. 'YEAR BUILT' was cast to category earlier, so convert it
# back to numbers first, otherwise the years would be binned as labels.
year_built_numeric = df4['YEAR BUILT'].astype(float)
sns.displot(year_built_numeric, bins=100, kde=True, stat='density')

Next, 'TOTAL UNITS'

In [None]:
# Histogram of total units per sale before filtering
total_units = df4['TOTAL UNITS']
sns.displot(total_units, bins=100)

Here, there are many values of '0', which again do not make sense, as well as a suspicious cluster at the extreme value '2261'. Given how abnormally often '2261' appears, we can assume it is a placeholder value and remove those rows alongside the '0' values.

In [None]:
# Keep rows with a positive unit count, excluding the suspected placeholder value 2261
positive_units = df4['TOTAL UNITS'].gt(0)
not_placeholder = df4['TOTAL UNITS'].ne(2261)
df5 = df4[positive_units & not_placeholder]

In [None]:
# Histogram of total units after removing the 0 and 2261 entries
total_units_filtered = df5['TOTAL UNITS']
sns.displot(total_units_filtered, bins=100)

# Data Exploration / EDA

In [None]:
# Sale price spread for each residential-unit count
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df5, x='RESIDENTIAL UNITS', y='SALE PRICE', ax=ax)
ax.set_title('RESIDENTIAL UNITS vs SALE PRICE')
plt.show()

Sale price increases roughly linearly with the number of residential units up to about 10 units, beyond which the sale price distributions vary greatly. This is due to a lack of data for properties with more than 10 units.

In [None]:
# Sale price spread for each borough.
# The title previously read 'RESIDENTIAL UNITS vs SALE PRICE', copied from the
# preceding cell; it now names the variable that is actually on the x-axis.
plt.figure(figsize=(10,6))
sns.boxplot(x='BOROUGH', y='SALE PRICE', data=df5)
plt.title('BOROUGH vs SALE PRICE')
plt.show()

From this figure, borough 1 shows a much wider distribution of sale prices than the other four boroughs. As in the sale price vs residential units figure, there is not much data for borough 1.