# Importing dataset directly from Kaggle into Colab

In [None]:
# Code to import dataset directly from Kaggle into Colab
# Need to have Kaggle user name and API token key (which may expire so new API token key might need to be generated)

!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/datasets/yellowj4acket/real-estate-california")


Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: syedimtiazmir
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/yellowj4acket/real-estate-california
Downloading real-estate-california.zip to ./real-estate-california


100%|██████████| 11.4M/11.4M [00:00<00:00, 26.0MB/s]





In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
# od.download() places the files in ./real-estate-california relative to the
# current working directory, so the CSV is read through the matching relative
# path rather than the Colab-only absolute '/content/...' location.
DATA_PATH = 'real-estate-california/RealEstate_California.csv'
data = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the file was parsed as expected
data.head()



In [None]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple
data.shape

(35389, 39)

In [None]:
# Column-by-column summary: non-null count and dtype of every column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35389 entries, 0 to 35388
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          35389 non-null  int64  
 1   id                  35389 non-null  object 
 2   stateId             35389 non-null  int64  
 3   countyId            35389 non-null  int64  
 4   cityId              35389 non-null  int64  
 5   country             35389 non-null  object 
 6   datePostedString    35386 non-null  object 
 7   is_bankOwned        35389 non-null  int64  
 8   is_forAuction       35389 non-null  int64  
 9   event               35100 non-null  object 
 10  time                35100 non-null  float64
 11  price               35389 non-null  float64
 12  pricePerSquareFoot  35389 non-null  float64
 13  city                35389 non-null  object 
 14  state               35389 non-null  object 
 15  yearBuilt           35389 non-null  int64  
 16  stre

# Identifying columns to be dropped
1. Columns whose data has no relevance to price prediction, e.g. the serial number (Unnamed: 0) and id
2. Columns which hold duplicate information, e.g. countyId, cityId
3. Columns which have only one value in the dataset
4. Columns in which one value dominates and the remaining values occur only a handful of times, e.g. is_bankOwned

In [72]:
# Build the running list of columns to drop, one category at a time.

# Category 1: columns irrelevant to price prediction
irrelevant_cols = ['Unnamed: 0', 'id']

# Category 2: columns carrying duplicate information
duplicate_cols = ['countyId', 'cityId']

# Category 3: columns holding exactly one unique value across the dataset.
# Each entry of one_unique_col is a (column name, unique-value count) pair.
distinct_counts = {name: data[name].nunique() for name in data.columns}
one_unique_col = [(name, total) for name, total in distinct_counts.items() if total == 1]

# Same order as before: irrelevant, duplicate, then single-valued columns
columns_to_drop = irrelevant_cols + duplicate_cols + [entry[0] for entry in one_unique_col]

# Report the single-valued columns together with their unique-value count
print("Columns to drop (having only one unique value):")
for name, total in one_unique_col:
    print(f"Column: {name}, Unique Values: {total}")


print("\nRunnnig List of columns to drop:")
print(columns_to_drop)
print("\n")

Columns to drop (having only one unique value):
Column: stateId, Unique Values: 1
Column: country, Unique Values: 1
Column: state, Unique Values: 1
Column: currency, Unique Values: 1

Runnnig List of columns to drop:
['Unnamed: 0', 'id', 'countyId', 'cityId', 'stateId', 'country', 'state', 'currency']




In [73]:
# Identify low-cardinality columns and, among them, the columns that contain
# rarely occurring values.
LIMITED_UNIQUE = 10    # a column with fewer unique values than this is reported
COUNT_INSTANCES = 50   # a value occurring fewer times than this counts as rare

limited_unique_col = []          # (column name, number of unique values) pairs
limited_unique_col_counts = {}   # column name -> value_counts() Series of that column

for column in data.columns:
    n_unique = data[column].nunique()   # computed once per column
    # Columns with exactly one unique value are excluded by the lower bound
    if 1 < n_unique < LIMITED_UNIQUE:
        limited_unique_col.append((column, n_unique))
        limited_unique_col_counts[column] = data[column].value_counts()


print("="*50)

# Print columns with fewer than LIMITED_UNIQUE values and the count of unique values
print(f"\nColumns with fewer than {LIMITED_UNIQUE} unique values:")
for column, unique_count in limited_unique_col:
    print(f"Column: {column}, Unique Values: {unique_count}")

print("\n" + "="*50 + "\n\n")

# Among those columns, show the value counts of every column in which at least
# one value occurs fewer than COUNT_INSTANCES times. The heading now describes
# this filter; it previously claimed the filter was on the number of unique values.
print(f"Value counts for columns where at least one value occurs fewer than {COUNT_INSTANCES} times:\n")
for column, unique_values_counts in limited_unique_col_counts.items():
    if (unique_values_counts < COUNT_INSTANCES).any():
        print(f"Column: {column}")
        print(unique_values_counts)
        print("="*50)



Columns with fewer than 10 unique values:
Column: is_bankOwned, Unique Values: 2
Column: is_forAuction, Unique Values: 2
Column: event, Unique Values: 6
Column: hasBadGeocode, Unique Values: 2
Column: lotAreaUnits, Unique Values: 2
Column: parking, Unique Values: 2
Column: hasGarage, Unique Values: 2
Column: pool, Unique Values: 2
Column: spa, Unique Values: 2
Column: isNewConstruction, Unique Values: 2
Column: hasPetsAllowed, Unique Values: 2
Column: homeType, Unique Values: 6



Unique values and their counts for columns with fewer than 50 unique values:

Column: is_bankOwned
is_bankOwned
0    35386
1        3
Name: count, dtype: int64
Column: is_forAuction
is_forAuction
0    35363
1       26
Name: count, dtype: int64
Column: event
event
Listed for sale    24622
Price change        4893
Listing removed     4311
Sold                1268
Listed for rent        5
Pending sale           1
Name: count, dtype: int64
Column: hasBadGeocode
hasBadGeocode
0    35386
1        3
Name: count, dt

In [74]:
# Using the value counts printed above, also drop the flag columns in which one
# value occurs only a handful of times: is_bankOwned, is_forAuction, hasBadGeocode
rare_value_cols = ['is_bankOwned', 'is_forAuction', 'hasBadGeocode']

# Add only the names that are not queued yet, so that re-running this cell does
# not append duplicate entries to columns_to_drop.
columns_to_drop.extend([col for col in rare_value_cols if col not in columns_to_drop])
print("\nRunnnig List of columns to drop:")
print(columns_to_drop)
print("\n")



Runnnig List of columns to drop:
['Unnamed: 0', 'id', 'countyId', 'cityId', 'stateId', 'country', 'state', 'currency', 'is_bankOwned', 'is_forAuction', 'hasBadGeocode']




In [None]:
# Drop those columns from the dataset
# NOTE(review): the drop is currently commented out, so no data_cleaned frame is
# created here and `data` still contains every column listed in columns_to_drop.
# Confirm whether this is intended before relying on the cleaned dataset.
# data_cleaned = data.drop(columns=columns_to_drop)

# Show the cleaned dataset with the columns dropped
# print("Dataset after dropping columns based an analysis:")
# print(data_cleaned.head())

# Cleaning Rows

After dropping columns, the next step is to clean the rows.

This includes the following:
* Identify rows whose values need to be updated, e.g.
  1. Column "homeType" has one row with the value "APARTMENT" --> this should be changed to "MULTI_FAMILY"
  2. Column "event" has 5 rows with the value "Listed for rent"
* Identify rows where price (the intended model output) is missing --> these rows should be deleted





In [None]:
# Summarise missing data: number of null entries in each column
missing_values = data.isna().sum()

# Keep only the columns that have at least one missing entry
has_missing = missing_values.gt(0)
columns_with_missing_values = missing_values.loc[has_missing]

# Show the header and the per-column counts on separate lines
print(
    "Columns with missing values and counts of missing values:",
    columns_with_missing_values,
    sep="\n",
)

Columns with missing values and counts of missing values:
datePostedString      3
event               289
time                289
streetAddress         1
zipcode              25
description         279
dtype: int64


In [None]:
# Find the rows for which no usable price is listed

# Coerce the 'price' column to numeric; entries that cannot be parsed become NaN
data['price'] = pd.to_numeric(data['price'], errors='coerce')

# A row has no price when the value is either missing or exactly zero
price_is_missing = data['price'].isna()
price_is_zero = data['price'].eq(0)
no_price = data.loc[price_is_missing | price_is_zero]

# Report how many such rows exist
print("Count of rows with no price listed:")
print(no_price.shape[0])



Count of rows with no price listed:
319
