In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the New York housing dataset through kagglehub and keep the returned value.
# NOTE(review): by its name this is presumably the local download path -- confirm.
nelgiriyewithana_new_york_housing_market_path = kagglehub.dataset_download('nelgiriyewithana/new-york-housing-market')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ------------Import the necessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
# Silences every warning for the rest of the session, deprecation notices included.
warnings.filterwarnings('ignore')

In [None]:
# Prefer a copy of the archive sitting next to the notebook; otherwise fall back
# to the directory produced by the kagglehub download cell, so that a fresh
# "Restart & Run All" does not depend on a manually uploaded file.
local_archive = 'NY-House-Dataset.csv.zip'
if os.path.exists(local_archive):
    dataset_file = local_archive
else:
    # NOTE(review): assumes the Kaggle dataset ships the CSV under this name -- confirm.
    dataset_file = os.path.join(nelgiriyewithana_new_york_housing_market_path, 'NY-House-Dataset.csv')
data = pd.read_csv(dataset_file)

data

In [None]:
# Summary statistics of the raw dataset (pandas `describe` defaults).
data.describe()

In [None]:
# Number of missing values in each column of the raw dataset.
data.isna().sum()

In [None]:
# First working subset: listing type, size, location fields and price.
new_data = data[[
    'TYPE',
    'BEDS',
    'BATH',
    'PROPERTYSQFT',
    'ADMINISTRATIVE_AREA_LEVEL_2',
    'LOCALITY',
    'SUBLOCALITY',
    'LATITUDE',
    'LONGITUDE',
    'PRICE',
]]
new_data.head()

**I modified the preexisting data frame to exclude columns that I thought would have little impact on future analysis, such as Administrative Area Level 2.**

In [None]:
# Working subset used from here on: the street address is kept and the
# administrative-area column is left out.
new_data = data[[
    'TYPE',
    'BEDS',
    'BATH',
    'PROPERTYSQFT',
    'ADDRESS',
    'LOCALITY',
    'SUBLOCALITY',
    'LATITUDE',
    'LONGITUDE',
    'PRICE',
]]
new_data.head()

**I then removed certain values from the TYPE column of the new_data dataframe for relevance and clarity; the values that were removed were Pending, Coming Soon, Contingent, and For Sale.**

In [None]:
# Rows whose TYPE is one of the four listing-status values below.
# NOTE(review): the match is case-sensitive -- confirm these spellings against
# the values returned by new_data['TYPE'].unique().
filter_data = new_data[new_data['TYPE'].isin(['Pending', 'Coming Soon', 'Contingent', 'For Sale'])]

**I began to clean up the dataframe by removing any rows that contain null values, and adding the result to a new dataframe to make graphing much easier for me in the future.**

In [None]:
#Shauna Roberts
# Remove the rows selected in `filter_data`, matching them by index label.
new_data2 = new_data.drop(index=filter_data.index)

**Ensured that the unwanted values within the TYPE column were removed.**

In [None]:
# Distinct TYPE values that remain after the status rows were dropped.
new_data2['TYPE'].unique()

In [None]:
# Preview the first rows of the filtered frame.
new_data2.head()

**Identified the overlaps among the localities, since many of them refer to the same location under different, unnecessary names.**

In [None]:
# Distinct LOCALITY values before the names are normalised.
new_data2['LOCALITY'].unique()

In [None]:
# Richmond County = Staten Island
# The Bronx and Bronx County are the same
# Flatbush is a neighborhood within Kings County, and Brooklyn and Kings County are the same
# New York County = Manhattan
# Queens and Queens County are the same
# That leaves only United States and New York

**Replaced the names of certain localities to narrow down the grouping in the future and to make the locations more accurate. Also looked into the different sublocalities that were included within the dataframe.**

In [None]:

# Map county names and alternative spellings onto one canonical borough name.
locality_aliases = {
    'Richmond County': 'Staten Island',
    'The Bronx': 'Bronx',
    'Bronx County': 'Bronx',
    'Flatbush': 'Brooklyn',
    'New York County': 'Manhattan',
    'Queens County': 'Queens',
    'Kings County': 'Brooklyn',
}
# Assign the result back to the column rather than calling
# `replace(..., inplace=True)` on `new_data2['LOCALITY']`: that chained in-place
# form acts on an intermediate Series and leaves the DataFrame unchanged under
# pandas Copy-on-Write. No alias target is itself an alias key, so one mapping
# gives the same result as seven sequential replacements.
new_data2['LOCALITY'] = new_data2['LOCALITY'].replace(locality_aliases)

In [None]:
# Distinct LOCALITY values after the replacements above.
new_data2['LOCALITY'].unique()

In [None]:
# Distinct SUBLOCALITY values present in the filtered frame.
new_data2['SUBLOCALITY'].unique()

In [None]:
# -------Install folium for mapping
!pip install folium

In [None]:
import folium

# Earlier version of the map, left commented out: it centred on the mean
# latitude/longitude of `new_data` with zoom_start=4 and added one marker per row.

#Create a map centered at an average location
#avg_lat = new_data['LATITUDE'].mean()
#avg_lon = new_data['LONGITUDE'].mean()
#my_map = folium.Map(location=[avg_lat, avg_lon], zoom_start=4)

# Add markers for each location
#for index, row in new_data.iterrows():
 #   folium.Marker(location=[row['LATITUDE'], row['LONGITUDE']], popup=row['LOCALITY']).add_to(my_map)

#display(my_map)

**Edited the default view of the map to show all the available housing units within Brooklyn, since that is my area of interest, rather than showing the whole map of the USA; the view is more zoomed in compared to the original map.**

In [None]:
#Shauna Roberts
import folium

# Open the map on Brooklyn (fixed centre, zoom level 12) rather than on the whole United States.
my_map = folium.Map(location=[40.6782, -73.9442], zoom_start=12)

# One marker per listing in the filtered frame, with its locality as the popup text.
for lat, lon, locality in zip(new_data2['LATITUDE'], new_data2['LONGITUDE'], new_data2['LOCALITY']):
    folium.Marker(location=[lat, lon], popup=locality).add_to(my_map)

display(my_map)

In [None]:
# ----A bar plot of the average price by locality
#Shauna Roberts edited (avg_price_loc = new_data2.groupby) ...
# Mean PRICE per LOCALITY, with LOCALITY kept as a regular column for plotting.
avg_price_loc = new_data2.groupby('LOCALITY', as_index=False)['PRICE'].mean()
fig = px.bar(
    avg_price_loc,
    x="LOCALITY",
    y="PRICE",
    color="PRICE",
    title="Average Price per Locality",
)
fig.show()

**Adjustments were made to the visualization of the data. Removed color="PRICE" because it had no impact on the graph, since the Y axis already marks the price points.**

In [None]:
# Same average-price-per-locality chart, drawn without the colour scale.
avg_price_locaility = new_data2.groupby('LOCALITY', as_index=False)['PRICE'].mean()
fig = px.bar(
    avg_price_locaility,
    x="LOCALITY",
    y="PRICE",
    title="Average Price per Locality",
)
fig.show()

In [None]:
# Average Price per Property type
# Computed on `new_data`, i.e. before the listing-status rows are dropped.
prop_price = new_data.groupby('TYPE', as_index=False)['PRICE'].mean()
fig = px.bar(
    prop_price,
    x='TYPE',
    y='PRICE',
    color='PRICE',
    title='Plot of the Property Type By Average Price ',
)
fig.show()

**A similar approach was taken with the graph that shows the average price of the different property types. In addition to removing color="PRICE", we now only have meaningful values on the X axis.**

In [None]:
# Average price per property type on the filtered frame, without the colour scale.
prop_price = new_data2.groupby('TYPE', as_index=False)['PRICE'].mean()
fig = px.bar(
    prop_price,
    x='TYPE',
    y='PRICE',
    title='Plot of the Property Type By Average Price ',
)
fig.show()

In [None]:
# Price and size columns taken from `new_data` (before the status rows are dropped).
nums= new_data[['PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT']]
nums.head()

**Made sure the variable nums was retrieving the attributes from the correct dataframe, then found the mean of each attribute and rounded it to the nearest whole number.**

In [None]:
# Mean of each of the four columns in the filtered frame, rounded to whole numbers.
# NOTE(review): this rebinds `nums` from a DataFrame to a Series of four means.
nums = new_data2[['PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT']].mean().round(0)
nums.head()

In [None]:
# Per-listing price and size columns from the filtered frame.
nums2 = new_data2[['PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT']]
nums2.head()

In [None]:
# Is there a relationship between Property Square feet and Price

# Plot from `nums2`, the per-listing DataFrame. `nums` was rebound earlier to a
# Series of column means, so `nums['PROPERTYSQFT']` is a single number there:
# it has no `.corr` and cannot serve as scatter-plot input.
sqft_price_corr = nums2['PROPERTYSQFT'].corr(nums2['PRICE']).round(2) * 100
fig = px.scatter(data_frame=nums2, x='PROPERTYSQFT', y='PRICE',
                 title=f"Relatonship Btw Property size and Price is {sqft_price_corr}%", color='PRICE')
fig.show()

**In order to remove the outliers from the graph, we first import the necessary libraries. Then we convert the columns in the nums2 table into a numeric data type. We then calculate the z-scores and filter our data based on them, since a z-score of 3 or more is considered to be an outlier. Finally, we calculate the correlation between the property's square footage and its price and plot the relationship.**

In [None]:
from scipy import stats
import plotly.express as px

In [None]:
# Coerce every column to a numeric dtype (unparseable entries become NaN), then
# discard the rows left with any missing value.
nums2 = nums2.apply(pd.to_numeric, errors='coerce').dropna()

In [None]:
# Absolute z-score of every value in `nums2`.
z_scores = np.abs(stats.zscore(nums2))
# Keep only the rows in which every column has an absolute z-score below 3.
filtered_nums2 = nums2[(z_scores <3).all(axis=1)]

In [None]:
# Correlation between square footage and price on the outlier-filtered data,
# rounded to two decimals and scaled by 100 for display as a percentage.
correlation = filtered_nums2['PROPERTYSQFT'].corr(filtered_nums2['PRICE']).round(2) * 100

In [None]:
# Square footage against price on the outlier-filtered data; the title carries
# the correlation computed in the previous cell.
fig = px.scatter(
    data_frame=filtered_nums2,
    x='PROPERTYSQFT',
    y='PRICE',
    title=f"Relatonship Between Property size and Price is {correlation}%",
)
fig.show()

In [None]:
# Is there a relationship between number of beds and Price

# Plot from `nums2`, the per-listing DataFrame. `nums` was rebound earlier to a
# Series of column means, so `nums['BEDS']` is a single number there: it has no
# `.corr` and cannot serve as scatter-plot input.
beds_price_corr = nums2['BEDS'].corr(nums2['PRICE']).round(2) * 100
fig = px.scatter(data_frame=nums2, x='PRICE', y='BEDS',
                 title=f"Relatonship Between Beds and Price is {beds_price_corr}%", color='PRICE')
fig.show()

**Continued to remove the outliers from the following graphs, but decided not to rework the graph of number of beds against property size because it has no significance to my analysis.**

In [None]:
# Correlation between price and number of beds on the outlier-filtered data,
# rounded to two decimals and scaled by 100 for display as a percentage.
correlation2 = filtered_nums2['PRICE'].corr(filtered_nums2['BEDS']).round(2) * 100

In [None]:
# Price against number of beds on the outlier-filtered data. The title uses
# `correlation2` (price vs. beds); `correlation` holds the square-footage vs.
# price figure and would mislabel this chart.
fig = px.scatter(data_frame=filtered_nums2, x='PRICE', y='BEDS',
                 title=f"Relatonship Between PRICE and BEDS is {correlation2}%")
fig.show()

In [None]:
# Is there a relationship between number of beds and Property size?

# Plot from `nums2`, the per-listing DataFrame. `nums` was rebound earlier to a
# Series of column means, so it cannot be used here. The title names the two
# plotted columns so that it agrees with the question above.
sqft_beds_corr = nums2['PROPERTYSQFT'].corr(nums2['BEDS']).round(2) * 100
fig = px.scatter(data_frame=nums2, x='PROPERTYSQFT', y='BEDS',
                 title=f"Relatonship Btw Property size and Beds is {sqft_beds_corr}%", color='BEDS')
fig.show()

In [None]:
# --------using countplots in exploring the categorical columns
# Every non-numeric column of `new_data` (before the status rows are dropped).
cat_col= new_data.select_dtypes(exclude= 'number')
cat_col.head()

In [None]:
# Every non-numeric column of the filtered frame; this rebinds `cat_col`.
cat_col= new_data2.select_dtypes(exclude= 'number')
cat_col.head()

In [None]:
# ----Countplot of the Type of property
# Number of listings per property type.
plt.figure(figsize=(20, 8))
sns.countplot(data=cat_col, x='TYPE', palette='husl')
plt.title('Countplot of Type of Property')

In [None]:
# ----Countplot of the Locality of the property
# Number of listings per locality.
plt.figure(figsize=(20, 8))
sns.countplot(data=cat_col, x='LOCALITY', palette='rainbow')
plt.title('Countplot of Locality of Property')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# ----Countplot of the Sublocality of the property
# Number of listings per sublocality, with the default tick-label orientation.
plt.figure(figsize=(25, 8))
sns.countplot(data=cat_col, x='SUBLOCALITY', palette='BuPu')
plt.title('Countplot of Sublocality of the property')

**For visualization purposes, we rotated the sublocality labels on the x axis so they can be read easily.**

In [None]:
# ----Countplot of the Sublocality of the property
# Number of listings per sublocality, with the x tick labels turned 90 degrees.
plt.figure(figsize=(25, 8))
sns.countplot(data=cat_col, x='SUBLOCALITY', palette='BuPu')
plt.title('Countplot of Sublocality of the property')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Preview the working subset taken before the status rows were dropped.
new_data.head()

In [None]:
#-----Using Line plot to understand the prices and the property size
# Sum the listing prices for each distinct square-footage value of `new_data`.
# NOTE(review): the name `avg_temp_years` is kept for compatibility even though
# it holds price totals by square footage.
avg_temp_years = new_data.groupby('PROPERTYSQFT')['PRICE'].sum().reset_index()
# Plot the aggregate (one point per square-footage value) so the chart matches
# its "Total Prices" title. `color` is omitted because every group has a single
# row, which would otherwise give one single-point trace per value.
fig = px.line(avg_temp_years, x="PROPERTYSQFT", y="PRICE", markers=True,
              title="Total Prices by Property square feet")
fig.show()

In [None]:
#-----Using Line plot to understand the prices and the property size
# Sum the listing prices for each distinct square-footage value of the filtered frame.
# NOTE(review): the name `avg_temp_years` is kept for compatibility even though
# it holds price totals by square footage.
avg_temp_years = new_data2.groupby('PROPERTYSQFT')['PRICE'].sum().reset_index()
# Plot the aggregate (one point per square-footage value) so the chart matches
# its "Total Prices" title. `color` is omitted because every group has a single
# row, which would otherwise give one single-point trace per value.
fig = px.line(avg_temp_years, x="PROPERTYSQFT", y="PRICE", markers=True,
              title="Total Prices by Property square feet")
fig.show()

**After getting a better understanding of the housing market in NYC, I can now make decisions on where I want to live and what type of home I want to live in. Knowing all the specifics, I can then filter my dataframe to output housing options that would better suit me.**

In [None]:
# Brooklyn listings priced under 1,000,000 with at least five bedrooms.
future = new_data2[
    new_data2['LOCALITY'].eq('Brooklyn')
    & new_data2['PRICE'].lt(1000000)
    & new_data2['BEDS'].ge(5)
]

In [None]:
# Show the shortlisted listings.
future