# DO UPVOTE AND COMMENT

# Context
**This dataset is a record of every building or building unit (apartment, etc.) sold in the New York City property market over a 12-month period.**

# Content
**This dataset contains the location, address, type, sale price, and sale date of building units sold. A reference on the trickier fields:**

1. BOROUGH: A digit code for the borough the property is located in; in order these are Manhattan (1), Bronx (2), Brooklyn (3), Queens (4), and Staten Island (5).

2. BLOCK; LOT: The combination of borough, block, and lot forms a unique key for property in New York City. Commonly called a BBL.

3. BUILDING CLASS AT PRESENT and BUILDING CLASS AT TIME OF SALE: The type of building at various points in time. 

**Note that because this is a financial transaction dataset, there are some points that need to be kept in mind:**

Many sales occur with a nonsensically small dollar amount: $0 most commonly. These sales are actually transfers of deeds between parties: for example, parents transferring ownership of their home to a child after moving out for retirement.

# ***THINGS YOU HAVE TO FOLLOW WHILE WALKING THROUGH THE WHOLE REPORT***

1. If you are familiar with Python, you can simply follow the code.
2. The simple explanations about any visual or graph will be there.
3. There will be a brief conclusion of the report.
4. Each explanation is presented below the output of the code cell it refers to.

***Enjoy***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
!pip install pywaffle --quiet
from pywaffle import Waffle
from wordcloud import WordCloud

In [None]:
# Load the rolling-sales CSV into the DataFrame that every later cell works on
df = pd.read_csv(
    "../input/nyc-property-sales/nyc-rolling-sales.csv"
)

In [None]:
# Show the DataFrame as this cell's output
df

In [None]:
# Preview the first 8 rows of the first 10 columns
df.iloc[:8,:10]

In [None]:
# Preview the first 8 rows of columns 10 through 19
df.iloc[:8,10:20]

In [None]:
# Print the column dtypes and non-null counts to spot columns read with the wrong type
df.info()

In [None]:
# SALE PRICE and both SQUARE FEET columns are read as object but should be
# numeric; values that cannot be parsed become NaN
for numeric_col in ('SALE PRICE', 'LAND SQUARE FEET', 'GROSS SQUARE FEET'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# SALE DATE is read as object but should be datetime; unparseable dates become NaT
df['SALE DATE'] = pd.to_datetime(df['SALE DATE'], errors='coerce')

# Both TAX CLASS attributes are labels, so store them as categoricals
for tax_col in ('TAX CLASS AT TIME OF SALE', 'TAX CLASS AT PRESENT'):
    df[tax_col] = df[tax_col].astype('category')

# DATA INSPECTION 

In [None]:
# Open a 15x6 figure for the plot
plt.figure(figsize=(15, 6))

# Draw the box plot and configure it through the Axes that seaborn returns
price_axes = sns.boxplot(data=df, x='SALE PRICE')
price_axes.ticklabel_format(style='plain', axis='x')  # plain numbers instead of scientific notation
price_axes.set_title('Boxplot of SALE PRICE in USD')
plt.show()

# THERE'S ONLY ONE THING I HATE IN DATA ==> OUTLIERS

In [None]:
# Discard rows that have no SALE PRICE, then report how many rows are left
df = df.dropna(subset=['SALE PRICE'])
len(df)

In [None]:
# Discard rows where LAND SQUARE FEET or GROSS SQUARE FEET is missing
df = df.dropna(subset=['LAND SQUARE FEET', 'GROSS SQUARE FEET'])

In [None]:
# Open a 15x6 figure and grab its Axes
plt.figure(figsize=(15, 6))
cdf_axes = plt.gca()

# Sort the prices and give each row its rank as a percentage of all rows
x = df[['SALE PRICE']].sort_values(by='SALE PRICE').reset_index()
n_sales = len(x)
x['PROPERTY PROPORTION'] = 100 * np.arange(1, n_sales + 1) / n_sales

# Markers only (no connecting line): price against cumulative share of properties
cdf_axes.plot(x['PROPERTY PROPORTION'], x['SALE PRICE'], linestyle='None', marker='o')
cdf_axes.set_title('Cumulative Distribution of Properties according to Price')
cdf_axes.set_xlabel('Percentage of Properties in ascending order of Price')
cdf_axes.set_ylabel('Sale Price')
cdf_axes.ticklabel_format(style='plain', axis='y')  # plain numbers instead of scientific notation
plt.show()

# NO OUTLIERS == DATA BETWEEN 100,000 USD - 5,000,000 USD

In [None]:
# Keep only sales strictly between 100,000 and 5,000,000 USD
above_floor = df['SALE PRICE'] > 100000
below_ceiling = df['SALE PRICE'] < 5000000
df = df[above_floor & below_ceiling]

In [None]:
# Open a 15x6 figure and grab its Axes
plt.figure(figsize=(15, 6))
cdf_axes = plt.gca()

# Sort the (now capped) prices and give each row its rank as a percentage of all rows
x = df[['SALE PRICE']].sort_values(by='SALE PRICE').reset_index()
n_sales = len(x)
x['PROPERTY PROPORTION'] = 100 * np.arange(1, n_sales + 1) / n_sales

# Markers only (no connecting line): price against cumulative share of properties
cdf_axes.plot(x['PROPERTY PROPORTION'], x['SALE PRICE'], linestyle='None', marker='o')
cdf_axes.set_title('Cumulative Distribution of Properties according to Price')
cdf_axes.set_xlabel('Percentage of Properties in ascending order of Price')
cdf_axes.set_ylabel('Sale Price')
cdf_axes.ticklabel_format(style='plain', axis='y')  # plain numbers instead of scientific notation
plt.show()

# MISSION ACCOMPLISHED == CURVE CHANGED

In [None]:
# Open a 15x6 figure for the plot
plt.figure(figsize=(15, 6))

# Redraw the SALE PRICE box plot on the filtered data, configured through the returned Axes
price_axes = sns.boxplot(data=df, x='SALE PRICE')
price_axes.ticklabel_format(style='plain', axis='x')  # plain numbers instead of scientific notation
price_axes.set_title('Boxplot of SALE PRICE in USD')
plt.show()

# MISSION ACCOMPLISHED == BOXPLOT CHANGED

In [None]:
# Open a 15x6 figure for the plot
plt.figure(figsize=(15, 6))

# Draw the SALE PRICE distribution and label it through the returned Axes
density_axes = sns.distplot(df['SALE PRICE'])
density_axes.set_title('Histogram of SALE PRICE in USD')
density_axes.set_ylabel('Normed Frequency')
plt.show()

# DATA SKEWNESS == UGLY

In [None]:
# Natural log of the sale price; print its skewness and plot its distribution
sales = np.log(df['SALE PRICE'])
log_price_skew = sales.skew()
print(log_price_skew)
sns.distplot(sales)

# DATA NORMALIZED == BEAUTIFUL

In [None]:
# Scatter of gross square feet against sale price (no regression line, translucent markers)
plt.figure(figsize=(10, 6))
gross_axes = sns.regplot(
    data=df,
    x='GROSS SQUARE FEET',
    y='SALE PRICE',
    fit_reg=False,
    scatter_kws={'alpha': 0.3},
)
gross_axes.set_title('Gross Square Feet vs Sale Price')
plt.show()

# UGH, OUTLIERS

In [None]:
# Scatter of land square feet against sale price (no regression line, translucent markers)
plt.figure(figsize=(10, 6))
land_axes = sns.regplot(
    data=df,
    x='LAND SQUARE FEET',
    y='SALE PRICE',
    fit_reg=False,
    scatter_kws={'alpha': 0.3},
)
land_axes.set_title('Land Square Feet vs Sale Price')
plt.show()

In [None]:
# Keep properties under 20,000 square feet (roughly 1,860 square metres)
# in both gross and land area
small_gross = df['GROSS SQUARE FEET'] < 20000
small_land = df['LAND SQUARE FEET'] < 20000
df = df[small_gross & small_land]

In [None]:
# Same gross-area scatter, redrawn on the filtered data
plt.figure(figsize=(10, 6))
gross_axes = sns.regplot(
    data=df,
    x='GROSS SQUARE FEET',
    y='SALE PRICE',
    fit_reg=False,
    scatter_kws={'alpha': 0.3},
)
gross_axes.set_title('Gross Square Feet vs Sale Price')
plt.show()

# PROBLEM SOLVED

In [None]:
# Same land-area scatter, redrawn on the filtered data
plt.figure(figsize=(10, 6))
land_axes = sns.regplot(
    data=df,
    x='LAND SQUARE FEET',
    y='SALE PRICE',
    fit_reg=False,
    scatter_kws={'alpha': 0.3},
)
land_axes.set_title('Land Square Feet vs Sale Price')
plt.show()

In [None]:
# EASE-MENT is empty and 'Unnamed: 0' looks like a leftover row counter, so drop both
df = df.drop(columns=['EASE-MENT', 'Unnamed: 0'])

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
# Series.sum() adds the boolean flags in one vectorised step instead of
# iterating over the Series with the builtin sum().
df.duplicated().sum()

In [None]:
# Drop fully duplicated rows, keeping the last occurrence of each; comparing
# all columns is the default, so no subset needs to be passed.
df = df.drop_duplicates(keep='last')

# Re-count the duplicates to confirm that none remain
df.duplicated().sum()

In [None]:
# Keep properties with more than 0 and fewer than 50 total units: this removes
# the handful of remaining 0-unit rows as well as anything with 50 or more units
has_units = df['TOTAL UNITS'] > 0
under_fifty_units = df['TOTAL UNITS'] < 50
df = df[has_units & under_fifty_units]

In [None]:
# Keep only rows whose unit counts add up:
# TOTAL UNITS must equal COMMERCIAL UNITS + RESIDENTIAL UNITS
summed_units = df['COMMERCIAL UNITS'] + df['RESIDENTIAL UNITS']
df = df[df['TOTAL UNITS'] == summed_units]

In [None]:
# Number of sales recorded for each TOTAL UNITS value, most frequent first.
# The table is printed explicitly: a bare expression that is not the last
# statement of a notebook cell is evaluated and then silently discarded.
units_frequency = (
    df[["TOTAL UNITS", "SALE PRICE"]]
    .groupby(['TOTAL UNITS'], as_index=False)
    .count()
    .sort_values(by='SALE PRICE', ascending=False)
)
print(units_frequency)

# Keep rows with at least one unit and drop any row whose TOTAL UNITS is 2261
df = df[(df['TOTAL UNITS'] > 0) & (df['TOTAL UNITS'] != 2261)]

# DATA VISUALIZATION

In [None]:
# Sale price distribution for each commercial-unit count
plt.figure(figsize=(10, 6))
commercial_axes = sns.boxplot(data=df, x='COMMERCIAL UNITS', y='SALE PRICE')
commercial_axes.set_title('Commercial Units vs Sale Price')
plt.show()

In [None]:
# Sale price distribution for each residential-unit count
plt.figure(figsize=(10, 6))
residential_axes = sns.boxplot(data=df, x='RESIDENTIAL UNITS', y='SALE PRICE')
residential_axes.set_title('Residential Units vs Sale Price')
plt.show()

In [None]:
# Sale price distribution for each total-unit count
plt.figure(figsize=(10, 6))
total_units_axes = sns.boxplot(data=df, x='TOTAL UNITS', y='SALE PRICE')
total_units_axes.set_title('Total Units vs Sale Price')
plt.show()

In [None]:
# Keep only rows with a positive YEAR BUILT
built_year_known = df['YEAR BUILT'] > 0
df = df.loc[built_year_known]

In [None]:
# List the columns that still contain at least one missing value
has_missing = df.isnull().any()
df.columns[has_missing]

In [None]:
# Compute the pairwise correlation matrix of the numeric columns
d= df[['SALE PRICE', 'TOTAL UNITS','GROSS SQUARE FEET',  'LAND SQUARE FEET', 'RESIDENTIAL UNITS', 
         'COMMERCIAL UNITS', 'BOROUGH', 'BLOCK', 'LOT', 'ZIP CODE', 'YEAR BUILT',]]
corr = d.corr()

# Generate a mask for the upper triangle (diagonal included) so that each
# pair is drawn only once. The builtin bool is used because the np.bool alias
# was removed in NumPy 1.24 and raises AttributeError there.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, 
            square=True, linewidths=.5, annot=True, cmap=cmap)
plt.yticks(rotation=0)  # keep the row labels horizontal
plt.title('Correlation Matrix of all Numerical Variables')
plt.show()

# CORRELATION MATRIX

In [None]:
# Box plot of gross square feet; showfliers=False leaves the outlier points undrawn
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df,
    x='GROSS SQUARE FEET',
    showfliers=False,
)

# NO OUTLIERS SHOWN IN GROSS AND LAND SQUARE FEET (FLIERS ARE HIDDEN WITH showfliers=False)

In [None]:
# Box plot of land square feet; showfliers=False leaves the outlier points undrawn
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df,
    x='LAND SQUARE FEET',
    showfliers=False,
)


In [None]:
# Median SALE PRICE for each tax class at time of sale
pivot = df.pivot_table(
    values='SALE PRICE',
    index='TAX CLASS AT TIME OF SALE',
    aggfunc=np.median,
)
pivot

In [None]:
# Mean SALE PRICE per tax class at time of sale, highest first
cat = (
    df[["TAX CLASS AT TIME OF SALE", "SALE PRICE"]]
    .groupby(['TAX CLASS AT TIME OF SALE'], as_index=False)
    .mean()
    .sort_values(by='SALE PRICE', ascending=False)
)

plt.figure(figsize=(20, 10))
sns.barplot(data=cat, x='TAX CLASS AT TIME OF SALE', y='SALE PRICE')

In [None]:
# Mean SALE PRICE per building class category, highest first
cat = (
    df[["BUILDING CLASS CATEGORY", "SALE PRICE"]]
    .groupby(['BUILDING CLASS CATEGORY'], as_index=False)
    .mean()
    .sort_values(by='SALE PRICE', ascending=False)
)

# Horizontal bars, one per building class category
plt.figure(figsize=(20, 10))
sns.barplot(data=cat, x='SALE PRICE', y='BUILDING CLASS CATEGORY', orient='h')

# PRICEY BUILDING CLASS ==> 08 RENTALS-ELEVATOR APARTMENTS

In [None]:
# Make sure SALE DATE is a datetime column, then split out the year and month
# of each sale. One conversion is enough: the repeated to_datetime call, the
# discarded .dtype lookup and the commented-out delete were dead code.
sale_dates = pd.to_datetime(df['SALE DATE'])
df['SALE DATE'] = sale_dates
df['YEAR SOLD'] = sale_dates.dt.year
df['MONTH SOLD'] = sale_dates.dt.month

In [None]:
# Bar chart of SALE PRICE by year sold, one bar per borough, without error bars
plt.subplots(figsize=(20, 8))
yearly_axes = sns.barplot(
    data=df,
    x='YEAR SOLD',
    y='SALE PRICE',
    hue='BOROUGH',
    palette='rainbow',
    ci=None,
)
yearly_axes.set_title('Sales per Borough from 2016-2017')

# MANHATTAN WAS THE EXPENSIVE BOROUGH IN BOTH 2016 & 2017.
**HOWEVER, THERE WAS A DIP IN 2017. {WHY??}**
# WHEREAS BRONX & BROOKLYN SHOWED AN INCREASE.

# BOROUGH: Manhattan (1), Bronx (2), Brooklyn (3), Queens (4), and Staten Island (5).

In [None]:
# Sale price distribution within each borough
plt.subplots(figsize=(20, 8))
borough_axes = sns.boxplot(data=df, x='BOROUGH', y='SALE PRICE')
borough_axes.set_title('Sale Price Distribution by Borough')
plt.show()

In [None]:
# Number of sales recorded in each borough.
# The column is passed as x=...: seaborn 0.12+ accepts only `data`
# positionally, so a positional column name raises TypeError there.
plt.subplots(figsize=(20,8))
sns.countplot(x='BOROUGH', data=df, palette='Set2')
plt.title('Sales per Borough')

In [None]:
# Bar chart of RESIDENTIAL UNITS per borough, without error bars
plt.subplots(figsize=(20, 8))
residential_axes = sns.barplot(
    data=df,
    x='BOROUGH',
    y='RESIDENTIAL UNITS',
    palette='coolwarm',
    ci=None,
)
residential_axes.set_title('Sales per borough_Residential')

In [None]:
# Bar chart of COMMERCIAL UNITS per borough, without error bars
plt.subplots(figsize=(20, 8))
commercial_axes = sns.barplot(
    data=df,
    x='BOROUGH',
    y='COMMERCIAL UNITS',
    palette='coolwarm',
    ci=None,
)
commercial_axes.set_title('Sales per borough_Commercial')

In [None]:
# Bar chart of SALE PRICE by month sold, one bar per borough, without error bars.
# barplot aggregates with the mean by default, so the title names the average
# price per month; the previous title was copied from the yearly chart.
plt.figure(figsize=(20,8))
sns.barplot(x='MONTH SOLD', y='SALE PRICE', hue='BOROUGH', data=df, palette='rainbow', ci=None)
plt.title('Average Sale Price per Borough by Month Sold')
plt.legend(loc='right')

In [None]:
# Number of sales per month, split by the year of sale.
# The column is passed as x=...: seaborn 0.12+ accepts only `data`
# positionally, so a positional column name raises TypeError there.
plt.figure(figsize=(20,5))
sns.countplot(x='MONTH SOLD', hue='YEAR SOLD', data=df, palette='Purples_r')

# PROPERTY SALES{2017} ==> January-August, PROPERTY SALES{2016} ==> September-December.

In [None]:
# Swap spaces for underscores so the columns can be reached as attributes (df.SALE_PRICE)
df.columns = df.columns.map(lambda col_name: col_name.replace(' ', '_'))

In [None]:
from collections import Counter

# The 20 neighborhoods with the most sales, most frequent first
NEIGHBORHOOD = [name for name, _count in Counter(df.NEIGHBORHOOD).most_common(20)]

# Mean sale price of each of those neighborhoods, in the same order
avg_sale_prices = [
    np.mean(df.SALE_PRICE[df.NEIGHBORHOOD == name])
    for name in NEIGHBORHOOD
]

In [None]:
# Horizontal bars: average sale price for each of the top 20 neighborhoods
plt.figure(figsize=(20, 8))
top20_axes = sns.barplot(y=NEIGHBORHOOD, x=avg_sale_prices, ci=None)
top20_axes.set_title('Average House Price in the top 20 neighborhoods')

# BEDFORD WAS WAY TOO EXPENSIVE

# THE END

In [None]:
# Word cloud built from every row's neighborhood name, saved to cast.png and shown
plt.subplots(figsize=(25, 15))

# One long space-separated string of neighborhood names feeds the cloud
neighborhood_text = " ".join(df.NEIGHBORHOOD)
cloud_settings = dict(background_color='Black', width=1920, height=1080)
wordcloud = WordCloud(**cloud_settings).generate(neighborhood_text)

plt.imshow(wordcloud)
plt.axis('off')  # hide the axes around the image
plt.savefig('cast.png')
plt.show()