In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [None]:
# Load the housing listings from a CSV expected in the notebook's working directory.
nyc = pd.read_csv('NY-House-Dataset.csv')
# Preview the first rows to sanity-check the parse.
nyc.head()

In [None]:
# Structural overview of the frame (columns, dtypes, non-null counts).
nyc.info()

In [None]:
# Summary statistics as produced by pandas' default `describe`.
nyc.describe()

In [None]:
## Check for missing values: number of nulls in each column
nyc.isnull().sum()

In [None]:
# Exploratory Data Analysis
# Pairwise correlation between the numeric columns.
# `numeric_only=True` is needed because the frame also holds text columns
# (TYPE, ADDRESS, ...): pandas >= 2.0 raises on them instead of silently
# dropping them as older releases did.
nyc.corr(numeric_only=True)

In [None]:
# sns.pairplot(nyc)

## Analyzing The Correlated Features

In [None]:
# Correlation matrix shown again as the reference for the feature plots below.
# Restricted to numeric columns: the frame also holds text columns
# (TYPE, ADDRESS, ...), which make `corr()` raise on pandas >= 2.0.
nyc.corr(numeric_only=True)

In [None]:
# Raw-scale scatter of listing price against property square footage.
plt.scatter(x=nyc['PROPERTYSQFT'], y=nyc['PRICE'])
plt.xlabel(xlabel="Property SQFT")
plt.ylabel(ylabel="Price")

In [None]:
# Scatter of bathroom count against bedroom count.
plt.scatter(x=nyc['BEDS'], y=nyc['BATH'])
plt.xlabel(xlabel="Bedroom")
plt.ylabel(ylabel="Bathroom")

## Scatter Plot - Price vs. Property Square Footage

In [None]:

# Add natural-log versions of square footage and price as new columns on `nyc`.
# The modelling cell further down uses `Log_PRICE` as its target.
for raw_col in ('PROPERTYSQFT', 'PRICE'):
    nyc['Log_' + raw_col] = np.log(nyc[raw_col])

# Log-log scatter, coloured by property type.
plt.figure(figsize=(10, 6))
scatter_plot = sns.scatterplot(x='Log_PROPERTYSQFT', y='Log_PRICE', hue='TYPE', data=nyc)

# Title and axis labels applied in a single call.
scatter_plot.set(
    title='Price vs. Property Square Footage (Logarithmic Scale)',
    xlabel='Log of Property Square Footage',
    ylabel='Log of Price',
)

plt.show()


## Price Distribution by House Type (Logarithmic Scale)

In [None]:

# Box plot of listing price per property type, drawn on a log-scaled price axis.
plt.figure(figsize=(12, 8))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
box_plot = sns.boxplot(x='TYPE', y='PRICE', data=nyc, palette='Set3')

# Title and axis labels applied in a single call.
box_plot.set(
    title='Price Distribution by House Type (Logarithmic Scale)',
    xlabel='House Type',
    ylabel='Price',
)

# Switch the price axis to a logarithmic scale.
box_plot.set_yscale('log')

# Tilt the category labels and right-align them under their ticks.
plt.xticks(rotation=45, ha='right')

# Let matplotlib re-fit the layout around the rotated labels, then render.
plt.tight_layout()
plt.show()



## Distribution of Real Estate

In [None]:
# Map-style scatter: position from the coordinates, colour from the listing price.
plt.figure(figsize=(12, 8))
plt.scatter(x=nyc['LONGITUDE'], y=nyc['LATITUDE'], c=nyc['PRICE'], cmap='viridis')

# Axis labels and title.
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Distribution of Real Estate')

# Colour scale for the price encoding.
plt.colorbar(label='Price')

plt.show()


In [None]:
# One-hot encode the categorical 'TYPE' column: each property type becomes its
# own indicator column, so the frame can be fed to a numeric model.
nyc_encoded = pd.get_dummies(nyc, columns=['TYPE'])

# Preview the encoded frame. A bare `nyc_encoded.head()` in the middle of a
# cell is evaluated and discarded, so the preview is displayed explicitly.
display(nyc_encoded.head())

# Feature matrix: drop the target (raw and log price) together with the
# non-encoded descriptive columns (addresses, broker title, locality names).
X = nyc_encoded.drop(
    columns=[
        'PRICE', 'ADDRESS', 'MAIN_ADDRESS', 'FORMATTED_ADDRESS', 'BROKERTITLE', 'STATE',
        'ADMINISTRATIVE_AREA_LEVEL_2', 'LOCALITY', 'SUBLOCALITY', 'STREET_NAME',
        'LONG_NAME', 'Log_PRICE',
    ]
)
# Target: the logarithm of the price (column created in the log-scatter cell above).
y = nyc_encoded['Log_PRICE']





In [None]:
# Inspect the feature matrix.
X

In [None]:
# Inspect the target (log of price).
y

In [None]:
## Train/test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Inspect the training features (still unscaled at this point).
X_train

In [None]:
# Inspect the held-out features (still unscaled at this point).
X_test

In [None]:
## Standardize the dataset
from sklearn.preprocessing import StandardScaler
# Created unfitted here; it is fitted on the training features in the next cell.
scaler=StandardScaler()

In [None]:
# Fit the scaler on the training features only, then replace X_train with the scaled values.
# NOTE(review): X_train is overwritten, so re-running this cell re-fits the scaler on already-scaled data.
X_train=scaler.fit_transform(X_train)

In [None]:
# Scale the test features with the already-fitted scaler (no re-fitting on test data).
# NOTE(review): X_test is overwritten, so re-running this cell scales it a second time.
X_test=scaler.transform(X_test)

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with default settings (unfitted until the next cell).
regression=LinearRegression()

In [None]:
# Fit on the scaled training features against the log-price target.
regression.fit(X_train,y_train)

In [None]:
## Print the coefficients and the intercept
# One coefficient per (scaled) feature column, followed by the fitted intercept.
print(regression.coef_)
print(regression.intercept_)

In [None]:
## Settings (constructor parameters) the estimator was created with
regression.get_params()

In [None]:
### Prediction With Test Data
# Predictions are on the log-price scale, because the model was fitted on `Log_PRICE`.
reg_pred=regression.predict(X_test)

In [None]:
# Inspect the predicted values.
reg_pred

## Assumptions

In [None]:
## Scatter plot of predicted against actual values on the test set.
## Both axes are on the log-price scale; the labels make clear which is which.
plt.scatter(y_test,reg_pred)
plt.xlabel("Actual log price")
plt.ylabel("Predicted log price")

In [None]:
## Residuals: actual minus predicted (log-price scale)
residuals=y_test-reg_pred

In [None]:
# Inspect the residuals.
residuals

In [None]:
## Plot the distribution of the residuals as a kernel density estimate

sns.displot(residuals,kind="kde")

In [None]:
## Scatter plot of residuals against predictions.
## Intended check: the points should be spread evenly, with no visible pattern.
plt.scatter(reg_pred,residuals)
plt.xlabel("Predicted log price")
plt.ylabel("Residual (actual - predicted)")

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# MAE, MSE and RMSE on the held-out set, printed in that order.
# All three are on the log-price scale, since that is the target.
mae = mean_absolute_error(y_test, reg_pred)
mse = mean_squared_error(y_test, reg_pred)
for metric_value in (mae, mse, np.sqrt(mse)):
    print(metric_value)

## R-squared and adjusted R-squared

In [None]:
# R^2 (coefficient of determination) = 1 - SSR/SST, where
# SSR = sum of squares of residuals and SST = total sum of squares.
from sklearn.metrics import r2_score
score=r2_score(y_test,reg_pred)
print(score)

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
#   n = number of test observations, k = number of feature columns
n_obs = len(y_test)
n_features = X_test.shape[1]
1 - (1 - score) * (n_obs - 1) / (n_obs - n_features - 1)

## New Data Prediction

In [None]:
# Columns of the source frame (including the log columns added earlier).
nyc.columns

In [None]:
# Columns the model was trained on; a new sample must supply them in this order.
X.columns

In [None]:


# Take the feature values of the row labelled 0, in the same column order as X.
# Casting to float avoids the object-dtype array that a row mixing boolean
# dummy columns with numeric columns would otherwise produce.
sample_features = nyc_encoded.loc[0, X.columns].to_numpy(dtype=float)

# Reshape to a 2D single-row array, the shape the scaler and model are given below.
sample_features_reshaped = sample_features.reshape(1, -1)

sample_features_reshaped


In [None]:
# Apply the fitted scaler to the sample row.
scaler.transform(sample_features_reshaped)

In [None]:
# Scale the sample row with the fitted scaler, then predict its log price.
sample_scaled = scaler.transform(sample_features_reshaped.reshape(1, -1))
regression.predict(sample_scaled)

## Pickling Model File For Deployment

In [None]:
import pickle

In [None]:
# Persist the fitted model. The context manager closes (and flushes) the file
# handle deterministically instead of leaving it to garbage collection.
# NOTE(review): predictions also need the fitted `scaler`, which is not saved
# here — confirm how the deployment side obtains it.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

In [None]:
# Reload the model written by the previous cell, closing the file handle afterwards.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open('regmodel.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Predict for the same scaled sample row with the reloaded model,
# for comparison with the prediction from `regression` above.
pickled_model.predict(
    scaler.transform(sample_features_reshaped.reshape(1, -1))
)