## Import the libraries

In [184]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's default theme to the matplotlib figures drawn in this notebook.
# `sns.set()` is only an alias of `set_theme()`, which seaborn documents as the
# preferred interface (the alias may be removed in a future release).
sns.set_theme()

## Import the dataset

In [185]:
# Location of the CSV file. Set the REALESTATE_CALIFORNIA_CSV environment variable
# to point at your own copy; otherwise the file is looked up on the current user's
# Desktop, so the notebook no longer embeds one specific user's home directory.
dataset_filepath = os.environ.get(
    'REALESTATE_CALIFORNIA_CSV',
    os.path.join(os.path.expanduser('~'), 'Desktop', 'RealEstate_California.csv'),
)
raw_dataset = pd.read_csv(dataset_filepath)

# Parse the posting date strings (expected as YYYY-MM-DD) into a datetime column.
raw_dataset['date_posted'] = pd.to_datetime(
    raw_dataset['datePostedString'],
    format='%Y-%m-%d',
)

# Display the dtype of every column as a quick sanity check of the load.
raw_dataset.dtypes

Unnamed: 0                     int64
id                            object
stateId                        int64
countyId                       int64
cityId                         int64
country                       object
datePostedString              object
is_bankOwned                   int64
is_forAuction                  int64
event                         object
time                         float64
price                        float64
pricePerSquareFoot           float64
city                          object
state                         object
yearBuilt                      int64
streetAddress                 object
zipcode                      float64
longitude                    float64
latitude                     float64
hasBadGeocode                  int64
description                   object
currency                      object
livingArea                   float64
livingAreaValue              float64
lotAreaUnits                  object
bathrooms                    float64
b

## Dealing with missing values

In [186]:
# Columns that are excluded from the analysis.
columns_to_drop = [
    'Unnamed: 0', 'cityId', 'id', 'stateId', 'currency', 'countyId', 'state',
    'country', 'time', 'description', 'datePostedString', 'hasBadGeocode',
    'is_bankOwned', 'is_forAuction',
]

# Remove those columns, then discard every row that still contains a missing value.
clean_dataset = raw_dataset.drop(columns=columns_to_drop).dropna()

# TODO: the isNewConstruction and hasPetsAllowed columns are not balanced --
# decide whether they should be dropped.
clean_dataset['hasPetsAllowed'].value_counts()

hasPetsAllowed
0    34440
1      632
Name: count, dtype: int64

In [187]:
# Two rows count as duplicates when they agree on every column except 'event'.
non_event_columns = list(clean_dataset.columns.drop('event'))

# keep=False marks every member of a duplicate group, not only the repeats.
duplicates_bool = clean_dataset.duplicated(subset=non_event_columns, keep=False)
duplicates = clean_dataset[duplicates_bool]

# Sort on the same columns so the rows of one duplicate group end up adjacent.
duplicates_sorted = duplicates.sort_values(by=non_event_columns)

# Nothing is displayed by this cell; evaluate `duplicates_sorted` to inspect the groups.

# TODO: discuss with the group the duplicate rows that come from the 'event' column.

## Data analysis

### Ordinal encoding categorical columns

In [188]:
from sklearn.preprocessing import OrdinalEncoder

# Replace every object-dtype column of clean_dataset with its ordinal codes, in place.
# NOTE(review): after this runs the encoded columns are presumably no longer object
# dtype, so re-running the cell would select no columns -- confirm it is run only once.
ordinal_encoder = OrdinalEncoder()
cat_columns = clean_dataset.select_dtypes(include='object').columns
encoded_values = ordinal_encoder.fit_transform(clean_dataset[cat_columns])
clean_dataset[cat_columns] = encoded_values

# Value counts of the 'pool' column.
clean_dataset['pool'].value_counts()

pool
0    31066
1     4006
Name: count, dtype: int64

In [189]:
# Pairwise correlation between all columns, rounded to three decimals.
correlations = clean_dataset.corr().round(3)

# Show only the first 13 columns of the matrix.
correlations.iloc[:, :13]

Unnamed: 0,event,price,pricePerSquareFoot,city,yearBuilt,streetAddress,zipcode,longitude,latitude,livingArea,livingAreaValue,lotAreaUnits,bathrooms
event,1.0,-0.016,-0.007,-0.014,-0.04,-0.025,-0.053,0.074,-0.057,-0.007,-0.007,-0.048,-0.002
price,-0.016,1.0,0.01,0.008,0.044,-0.003,-0.076,-0.036,-0.048,0.014,0.014,-0.104,0.351
pricePerSquareFoot,-0.007,0.01,1.0,-0.011,0.024,0.003,-0.008,0.003,-0.007,-0.001,-0.001,0.001,-0.01
city,-0.014,0.008,-0.011,1.0,0.055,-0.004,0.041,-0.059,0.014,-0.01,-0.01,0.06,0.023
yearBuilt,-0.04,0.044,0.024,0.055,1.0,0.107,-0.169,0.06,-0.171,-0.027,-0.027,0.363,0.509
streetAddress,-0.025,-0.003,0.003,-0.004,0.107,1.0,0.011,-0.018,0.019,0.01,0.01,0.05,0.053
zipcode,-0.053,-0.076,-0.008,0.041,-0.169,0.011,1.0,-0.784,0.885,0.014,0.014,-0.159,-0.086
longitude,0.074,-0.036,0.003,-0.059,0.06,-0.018,-0.784,1.0,-0.899,-0.006,-0.006,0.062,0.032
latitude,-0.057,-0.048,-0.007,0.014,-0.171,0.019,0.885,-0.899,1.0,0.01,0.01,-0.175,-0.115
livingArea,-0.007,0.014,-0.001,-0.01,-0.027,0.01,0.014,-0.006,0.01,1.0,1.0,-0.022,-0.004


In [190]:
# Show the columns of the correlation matrix from position 13 onward.
correlations.iloc[:, 13:]

Unnamed: 0,bedrooms,buildingArea,parking,garageSpaces,hasGarage,levels,pool,spa,isNewConstruction,hasPetsAllowed,homeType,county,date_posted
event,-0.021,-0.004,-0.009,0.023,0.015,0.021,-0.014,0.007,0.025,-0.008,-0.024,-0.007,-0.135
price,0.179,0.022,-0.016,0.111,0.041,0.027,0.151,0.106,0.037,0.012,0.073,0.018,-0.024
pricePerSquareFoot,-0.014,-0.0,0.005,0.022,0.007,-0.002,0.003,0.0,-0.001,-0.001,0.001,0.003,-0.002
city,0.026,-0.008,0.013,-0.029,-0.017,-0.028,-0.002,0.02,0.008,0.043,0.004,0.208,0.039
yearBuilt,0.546,-0.009,0.584,0.304,0.423,0.363,0.16,0.201,0.036,0.05,0.502,0.0,0.291
streetAddress,0.066,0.009,0.08,0.015,0.038,0.019,0.025,0.023,0.042,0.007,0.058,0.001,0.021
zipcode,-0.036,0.002,-0.134,-0.303,-0.389,-0.309,-0.033,-0.161,-0.003,0.044,-0.008,0.179,-0.147
longitude,0.015,-0.007,0.141,0.285,0.315,0.292,0.047,0.239,-0.012,-0.097,0.005,-0.032,0.101
latitude,-0.075,0.0,-0.183,-0.338,-0.412,-0.366,-0.046,-0.267,-0.001,0.043,-0.024,0.013,-0.163
livingArea,-0.005,0.578,-0.016,-0.005,-0.011,-0.008,-0.0,-0.003,-0.001,-0.002,-0.007,0.019,-0.016
