In [6]:
# Troubleshooting script for reading a CSV file

def read_binary(file_path):
    """Print the first 500 raw bytes of a file.

    Args:
        file_path: Path of the file to open in binary mode.

    Returns:
        None. The bytes object is written to stdout; any exception raised
        while opening, reading, or printing is caught and reported as a
        printed message instead of propagating.
    """
    try:
        with open(file_path, mode='rb') as raw_stream:
            # Only a 500-byte prefix is read, so large files are cheap to inspect.
            print(raw_stream.read(500))
    except Exception as error:
        print(f"An error occurred while reading the file in binary mode: {error}")

def read_text(file_path, num_lines=10):
    """Print the first ``num_lines`` lines of a file decoded as ISO-8859-1.

    Args:
        file_path: Path of the file to open in text mode.
        num_lines: Maximum number of lines to print (default 10).

    Returns:
        None. Lines are written to stdout; any exception raised while
        opening, reading, or printing is caught and reported as a printed
        message instead of propagating.
    """
    # Local import so the function works without a separate import cell.
    from itertools import islice

    try:
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            # readlines(10) would treat 10 as a size hint in characters, not a
            # line count, and stop after the first line that exceeds it.
            # islice over the file object yields at most num_lines lines.
            for line in islice(file, num_lines):
                # Drop the line's own newline; print() adds one already.
                print(line.rstrip('\n'))
    except Exception as e:
        print(f"An error occurred while reading the file in text mode: {e}")

# Location of the Zillow rent data CSV file to inspect; edit to match your machine
file_path = '/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/zillow_rent_data/zillow_rent_data.csv'

# Run both helpers on the same file: the binary-mode one first, then the text-mode one
for heading, dump in (
    ("Reading in binary mode:", read_binary),
    ("\nReading in text mode:", read_text),
):
    print(heading)
    dump(file_path)


Reading in binary mode:
b'RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,2015-06-30,2015-07-31,2015-08-31,2015-09-30,2015-10-31,2015-11-30,2015-12-31,2016-01-31,2016-02-29,2016-03-31,2016-04-30,2016-05-31,2016-06-30,2016-07-31,2016-08-31,2016-09-30,2016-10-31,2016-11-30,2016-12-31,2017-01-31,2017-02-28,2017-03-31,2017-04-30,2017-05-31,2017-06-30,2017-07-31,2017-08-31,2017-09-30,2017-10-31,2017-11-30,2017-12-31,2018-01-31,2018-02-28,2018'

Reading in text mode:
RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,2015-06-30,2015-07-31,2015-08-31,2015-09-30,2015-10-31,2015-11-30,2015-12-31,2016-01-31,2016-02-29,2016-03-31,2016-04-30,2016-05-31,2016-06-30,2016-07-31,2016-08-31,2016-09-30,2016-10-31,2016-11-30,2016-12-31,2017-01-31,2017-02-28,2017-03-31,2017-04-30,2017-05-31,2017-06-30,2017-07-31,2017-08-31,2017-09-30,2017-10-

In [8]:
import pandas as pd

# Location of the raw Zillow rent data CSV file
zillow_rent_data_path = '/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/zillow_rent_data/zillow_rent_data.csv'

# Parse the CSV into a DataFrame
zillow_data = pd.read_csv(filepath_or_buffer=zillow_rent_data_path)

# Preview the leading rows to see how the table is laid out
print(zillow_data.head(n=5))


   RegionID  SizeRank  RegionName RegionType StateName State      City  \
0     91982         1       77494        zip        TX    TX      Katy   
1     91940         3       77449        zip        TX    TX      Katy   
2     91733         5       77084        zip        TX    TX   Houston   
3     93144         6       79936        zip        TX    TX   El Paso   
4     62093         7       11385        zip        NY    NY  New York   

                                   Metro        CountyName   2015-01-31  ...  \
0   Houston-The Woodlands-Sugar Land, TX  Fort Bend County  1606.206406  ...   
1   Houston-The Woodlands-Sugar Land, TX     Harris County  1257.814660  ...   
2   Houston-The Woodlands-Sugar Land, TX     Harris County          NaN  ...   
3                            El Paso, TX    El Paso County          NaN  ...   
4  New York-Newark-Jersey City, NY-NJ-PA     Queens County          NaN  ...   

    2022-12-31   2023-01-31   2023-02-28   2023-03-31   2023-04-30  \
0  1

In [11]:
import pandas as pd

# Load the Zillow rent data
# NOTE(review): RegionName is left to pandas' type inference; if it is parsed as
# an integer, ZIP codes with leading zeros would lose them. Consider
# dtype={'RegionName': str} once downstream joins are confirmed.
zillow_rent_data_path = '/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/zillow_rent_data/zillow_rent_data.csv'
zillow_data = pd.read_csv(zillow_rent_data_path)

# Cleaning steps:

# The monthly rent columns are the date-named ones (e.g. '2015-01-31'); none of
# the metadata column names contains a '-'.
monthly_rent_columns = [col for col in zillow_data.columns if '-' in col]

# Step 1: Remove unnecessary columns (keep the location fields plus the monthly rents)
columns_to_keep = ['RegionName', 'State', 'City', 'Metro', 'CountyName'] + monthly_rent_columns
zillow_data_cleaned = zillow_data[columns_to_keep]

# Step 2: Remove rows with more than 50% missing values in the monthly rent columns.
# `subset` restricts the non-null count to the rent columns; without it the
# metadata columns count toward `thresh` and sparse rows slip through.
# The threshold is rounded up so it is an integer count of required values.
min_rent_values = (len(monthly_rent_columns) + 1) // 2
zillow_data_cleaned = zillow_data_cleaned.dropna(
    subset=monthly_rent_columns,
    thresh=min_rent_values,
    axis=0,
)

# Step 3: Normalize column names
zillow_data_cleaned = zillow_data_cleaned.rename(columns={'RegionName': 'ZipCode'})

# Display the cleaned dataframe
print(zillow_data_cleaned.head())

# Optionally, save the cleaned data to a new CSV file
zillow_data_cleaned.to_csv('/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/zillow_rent_data/zillow_rent_data_cleaned.csv', index=False)


   ZipCode State          City                                  Metro  \
0    77494    TX          Katy   Houston-The Woodlands-Sugar Land, TX   
1    77449    TX          Katy   Houston-The Woodlands-Sugar Land, TX   
2    77084    TX       Houston   Houston-The Woodlands-Sugar Land, TX   
4    11385    NY      New York  New York-Newark-Jersey City, NY-NJ-PA   
5    78660    TX  Pflugerville       Austin-Round Rock-Georgetown, TX   

         CountyName   2015-01-31   2015-02-28   2015-03-31   2015-04-30  \
0  Fort Bend County  1606.206406  1612.779844  1622.201575  1630.392427   
1     Harris County  1257.814660  1255.268025  1262.170452  1274.955754   
2     Harris County          NaN          NaN          NaN          NaN   
4     Queens County          NaN  2087.527084          NaN  2149.924252   
5     Travis County  1399.372678  1411.391149  1396.562265  1390.741122   

    2015-05-31  ...   2022-12-31   2023-01-31   2023-02-28   2023-03-31  \
0  1632.411500  ...  1994.653463  2

In [13]:
pip install geopandas

Defaulting to user installation because normal site-packages is not writeable
Collecting geopandas
  Downloading geopandas-0.14.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 8.1 MB/s eta 0:00:01
Collecting shapely>=1.8.0
  Downloading shapely-2.0.2-cp39-cp39-macosx_11_0_arm64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.4 MB/s eta 0:00:01
Collecting fiona>=1.8.21
  Downloading fiona-1.9.5-cp39-cp39-macosx_11_0_arm64.whl (14.0 MB)
[K     |████████████████████████████████| 14.0 MB 4.8 MB/s eta 0:00:01
[?25hCollecting pyproj>=3.3.0
  Downloading pyproj-3.6.1-cp39-cp39-macosx_11_0_arm64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.0 MB/s eta 0:00:01
[?25hCollecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting attrs>=19.2.0
  Downloading attrs-23.1.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 6.1 MB/s eta 0:00:011
Collecting click-plugins>=1.0
  Downloading 

In [14]:
import geopandas as gpd

# Component files of the NYC zip code shapefile.
# Only the .shp path is passed to read_file below; the other three are defined
# here but not used in this cell.
zip_code_shp_path = '/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/nyc_zipcodes/nyc_zipcodes.shp'
zip_code_dbf_path = '/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/nyc_zipcodes/nyc_zipcodes.dbf'
zip_code_prj_path = '/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/nyc_zipcodes/nyc_zipcodes.prj'
zip_code_shx_path = '/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/nyc_zipcodes/nyc_zipcodes.shx'

# Read the shapefile into a GeoDataFrame
zip_codes_gdf = gpd.read_file(zip_code_shp_path)

# Preview the leading rows
print(zip_codes_gdf.head(n=5))

# Optional follow-up steps, left disabled as templates:
#
# Drop columns that are not needed (fill in the actual column names), e.g. to
# keep only the zip code and geometry:
# zip_codes_gdf = zip_codes_gdf.drop(columns=['UnneededColumn1', 'UnneededColumn2'])
#
# Reproject to the SRID you want to use:
# zip_codes_gdf = zip_codes_gdf.to_crs(epsg=your_srid)
#
# Write the cleaned GeoDataFrame back out as a shapefile:
# zip_codes_gdf.to_file('path_to_your_shapefile/nyc_zipcodes_cleaned.shp')


  ZIPCODE BLDGZIP   PO_NAME  POPULATION          AREA STATE  COUNTY ST_FIPS  \
0   11436       0   Jamaica     18681.0  2.269930e+07    NY  Queens      36   
1   11213       0  Brooklyn     62426.0  2.963100e+07    NY   Kings      36   
2   11212       0  Brooklyn     83866.0  4.197210e+07    NY   Kings      36   
3   11225       0  Brooklyn     56527.0  2.369863e+07    NY   Kings      36   
4   11218       0  Brooklyn     72280.0  3.686880e+07    NY   Kings      36   

  CTY_FIPS                   URL  SHAPE_AREA  SHAPE_LEN  \
0      081  http://www.usps.com/         0.0        0.0   
1      047  http://www.usps.com/         0.0        0.0   
2      047  http://www.usps.com/         0.0        0.0   
3      047  http://www.usps.com/         0.0        0.0   
4      047  http://www.usps.com/         0.0        0.0   

                                            geometry  
0  POLYGON ((1038098.252 188138.380, 1038141.936 ...  
1  POLYGON ((1001613.713 186926.440, 1002314.243 ...  
2  PO

In [16]:
import geopandas as gpd

# Load the shapefile
zip_codes_gdf = gpd.read_file('/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/nyc_zipcodes/nyc_zipcodes.shp')

# Cleaning steps:

# Step 1: Remove unnecessary columns
columns_to_drop = ['BLDGZIP', 'STATE', 'ST_FIPS', 'CTY_FIPS', 'URL', 'SHAPE_AREA', 'SHAPE_LEN']
zip_codes_gdf_cleaned = zip_codes_gdf.drop(columns=columns_to_drop)

# Step 2: Remove invalid data points (if any)
# This step depends on your specific criteria for invalid data points

# Step 3: Normalize column names
# Example: renaming 'PO_NAME' to 'City' for clarity
zip_codes_gdf_cleaned = zip_codes_gdf_cleaned.rename(columns={'PO_NAME': 'City'})

# Step 4: Normalize the SRID to EPSG:2263
# to_crs() reprojects from an existing CRS and raises when none is set, so a
# GeoDataFrame without a CRS is labelled with set_crs() instead.
# NOTE(review): the set_crs() branch assumes the stored coordinates are already
# in EPSG:2263 -- confirm against the source data before relying on it.
target_epsg = 2263
if zip_codes_gdf_cleaned.crs is None:
    zip_codes_gdf_cleaned = zip_codes_gdf_cleaned.set_crs(epsg=target_epsg)
else:
    zip_codes_gdf_cleaned = zip_codes_gdf_cleaned.to_crs(epsg=target_epsg)

# Save the cleaned GeoDataFrame back to a shapefile if needed
zip_codes_gdf_cleaned.to_file('/Users/lawrence/Documents/GitHub/IEOR4501-Project/Data/nyc_zipcodes/nyc_zipcodes_cleaned.shp')

# Display the cleaned GeoDataFrame
print(zip_codes_gdf_cleaned.head())


  ZIPCODE      City  POPULATION          AREA  COUNTY  \
0   11436   Jamaica     18681.0  2.269930e+07  Queens   
1   11213  Brooklyn     62426.0  2.963100e+07   Kings   
2   11212  Brooklyn     83866.0  4.197210e+07   Kings   
3   11225  Brooklyn     56527.0  2.369863e+07   Kings   
4   11218  Brooklyn     72280.0  3.686880e+07   Kings   

                                            geometry  
0  POLYGON ((1038098.252 188138.380, 1038141.936 ...  
1  POLYGON ((1001613.713 186926.440, 1002314.243 ...  
2  POLYGON ((1011174.276 183696.338, 1011373.584 ...  
3  POLYGON ((995908.365 183617.613, 996522.848 18...  
4  POLYGON ((991997.113 176307.496, 992042.798 17...  
