In [1]:
import re

import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup as bs
import json
import csv
import warnings
import os
#import makeRequest

warnings.filterwarnings('ignore')
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

# Dallas Housing dataset 

## Data collection

   In this project, the house price in Dallas will be predicted using supervised machine learning technique- linear regression. The dataset will be collected from 3 sources - 
1. [Zillow](https://www.zillow.com) - is a renowned marketplace for real-estates.
2. [Walk Score](https://www.walkscore.com)
3. [Bestplaces](https://www.bestplaces.net)

    From Zillow, we will collect the data using the **API from RapidAPI** with a **personal API key**. In each call, we can collect 40 dataset in a page. So we call the API for 20 times( maximum limit for free use of the API). Similarly, the **walkscore API with a private key** is used to collect the walk,bike and transit score from the address obtained from the zillow data. Finally, some new features like- crime score, cost of living, student teacher ratio, air quality etc are added by web scrapping using **beautiful soup**. 




In [2]:
# Private API keys are stored in a file and read from it for privacy. 
with open('datacollectionAPI.txt', 'r') as f :
    rapidAPIkey = f.readline().split(' ')[1].strip()
    walkAPIkey = f.readline().split(' ')[1].strip()
    
# print("Rapid API key:", rapidAPI)
# print("Walk API key:", walkAPI)

In [3]:
# datal collection from zillow
# initalize the headers and parameters 

z_url = "https://zillow56.p.rapidapi.com/search" 

querystring = {"location":"dallas, tx","status":"forSale","isLotLand":"false","isNewConstruction":"false"}

headers={}
headers["X-RapidAPI-Key"] = rapidAPIkey
headers["X-RapidAPI-Host"] ="zillow56.p.rapidapi.com"

# headers

In [4]:
'''
This block will be commented because running this will cross the limit of free API call. 
So, it is called once and stored the raw data as json data
'''

'''
total_page =20

for page in range(total_page):
    querystring["page"]= str(page+1)
    response = requests.request("GET", z_url, headers = headers, params = querystring)
    jdata = json.loads(response.text) 
    
    # writing data in a file in json format 
    outfilename = "DallasHousingSepPage" + str(page+1)+ ".json"
    

    with open( outfilename, 'w') as outfile:
        outfile.write(json.dumps(jdata['results']))
'''

'\ntotal_page =20\n\nfor page in range(total_page):\n    querystring["page"]= str(page+1)\n    response = requests.request("GET", z_url, headers = headers, params = querystring)\n    jdata = json.loads(response.text) \n    \n    # writing data in a file in json format \n    outfilename = "DallasHousingSepPage" + str(page+1)+ ".json"\n    \n\n    with open( outfilename, \'w\') as outfile:\n        outfile.write(json.dumps(jdata[\'results\']))\n'

In [5]:
# load data

total_page =19
directory = 'Datasets/'
folder = 'zillow_rawData/'
filepath = directory + folder
json_data = []
# json_data = pd.DataFrame()
for page in range(total_page):
    datafile = 'DallasHousingSepPage'+str(page+1)+'.json'
    filename = filepath+datafile
    json_data.append(pd.read_json(filename))

df= pd.concat(json_data, ignore_index=True)


df


Unnamed: 0,bathrooms,bedrooms,city,country,currency,daysOnZillow,homeStatus,homeStatusForHDP,homeType,isFeatured,isNonOwnerOccupied,isPreforeclosureAuction,isPremierBuilder,isUnmappable,isZillowOwned,latitude,listing_sub_type,livingArea,longitude,lotAreaUnit,lotAreaValue,price,priceForHDP,rentZestimate,shouldHighlight,state,streetAddress,unit,zestimate,zipcode,zpid,taxAssessedValue,openHouse,open_house_info,datePriceChanged,priceChange,priceReduction,videoCount
0,1.0,1.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,CONDO,False,True,False,False,False,False,32.965416,{'is_FSBA': True},678,-96.81531,acres,8.766,168500,168500,1150.0,False,TX,5335 Bent Tree Forest Dr APT 253,Apt 253,169400.0,75248,2087749202,,,,,,,
1,3.0,4.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,SINGLE_FAMILY,False,True,False,False,False,False,33.007687,{'is_FSBA': True},2681,-96.80325,sqft,6534.0,575000,575000,,False,TX,6021 Buffridge Trl,,,75252,26631634,417708.0,,,,,,
2,1.0,3.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,SINGLE_FAMILY,False,True,False,False,False,False,32.673035,{'is_FSBA': True},1272,-96.80349,sqft,6795.36,210750,210750,1395.0,False,TX,6117 Singing Hills Dr,,221000.0,75241,26823842,155290.0,,,,,,
3,2.0,4.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,SINGLE_FAMILY,False,True,False,False,False,False,32.638004,{'is_FSBA': True},1796,-96.94526,sqft,6751.8,335000,335000,1990.0,False,TX,7116 Chinaberry Rd,,272300.0,75249,26892234,178710.0,,,,,,
4,2.0,2.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,CONDO,False,True,False,False,False,False,32.8732,{'is_FSBA': True},1164,-96.75794,acres,2.986,149900,149900,2026.0,False,TX,7135 Fair Oaks Ave APT 16,Apt 16,,75231,2126105453,,,,,,,
5,2.0,4.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,SINGLE_FAMILY,False,True,False,False,False,False,32.924534,"{'is_FSBA': True, 'is_openHouse': True}",2178,-96.71153,sqft,0.193,399000,399000,2400.0,False,TX,12909 Jasoncrest Trl,,401300.0,75243,26882902,246700.0,Mon. 2-4pm,{'open_house_showing': [{'open_house_end': 166...,,,,
6,2.0,3.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,SINGLE_FAMILY,False,True,False,False,False,False,32.898525,{'is_FSBA': True},2325,-96.86709,sqft,10203.0,648700,648700,2913.0,False,TX,3328 Townsend Dr,,637515.0,75229,26808606,320460.0,,,,,,
7,2.0,2.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,SINGLE_FAMILY,False,True,False,False,False,False,32.81513,{'is_FSBA': True},1544,-96.76566,sqft,9016.92,799000,799000,2699.0,False,TX,5903 Prospect Ave,,610400.0,75206,26690071,415000.0,,,,,,
8,2.0,4.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,SINGLE_FAMILY,False,True,False,False,False,False,32.895348,{'is_FSBA': True},2516,-96.74079,sqft,8494.2,675000,675000,,False,TX,8511 Flower Meadow Dr,,,75243,26893476,359600.0,,,,,,
9,2.0,3.0,Dallas,USA,USD,-1,FOR_SALE,FOR_SALE,SINGLE_FAMILY,False,True,False,False,False,False,32.745106,{'is_FSBA': True},1400,-96.700485,sqft,7492.32,275000,275000,,False,TX,1924 Nemechek Dr,,,75217,2067047264,,,,,,,


In [6]:
# extracted features from zillow raw data
feature_column =['zpid','streetAddress','city','state','zipcode','latitude','longitude',
                 'bedrooms','bathrooms','livingArea','lotAreaValue',
                 'lotAreaUnit','rentZestimate','taxAssessedValue','price']

feature_column

['zpid',
 'streetAddress',
 'city',
 'state',
 'zipcode',
 'latitude',
 'longitude',
 'bedrooms',
 'bathrooms',
 'livingArea',
 'lotAreaValue',
 'lotAreaUnit',
 'rentZestimate',
 'taxAssessedValue',
 'price']

In [7]:
# raw data extracting feature column
raw_data = df[feature_column]
raw_data

Unnamed: 0,zpid,streetAddress,city,state,zipcode,latitude,longitude,bedrooms,bathrooms,livingArea,lotAreaValue,lotAreaUnit,rentZestimate,taxAssessedValue,price
0,2087749202,5335 Bent Tree Forest Dr APT 253,Dallas,TX,75248,32.965416,-96.81531,1.0,1.0,678,8.766,acres,1150.0,,168500
1,26631634,6021 Buffridge Trl,Dallas,TX,75252,33.007687,-96.80325,4.0,3.0,2681,6534.0,sqft,,417708.0,575000
2,26823842,6117 Singing Hills Dr,Dallas,TX,75241,32.673035,-96.80349,3.0,1.0,1272,6795.36,sqft,1395.0,155290.0,210750
3,26892234,7116 Chinaberry Rd,Dallas,TX,75249,32.638004,-96.94526,4.0,2.0,1796,6751.8,sqft,1990.0,178710.0,335000
4,2126105453,7135 Fair Oaks Ave APT 16,Dallas,TX,75231,32.8732,-96.75794,2.0,2.0,1164,2.986,acres,2026.0,,149900
5,26882902,12909 Jasoncrest Trl,Dallas,TX,75243,32.924534,-96.71153,4.0,2.0,2178,0.193,sqft,2400.0,246700.0,399000
6,26808606,3328 Townsend Dr,Dallas,TX,75229,32.898525,-96.86709,3.0,2.0,2325,10203.0,sqft,2913.0,320460.0,648700
7,26690071,5903 Prospect Ave,Dallas,TX,75206,32.81513,-96.76566,2.0,2.0,1544,9016.92,sqft,2699.0,415000.0,799000
8,26893476,8511 Flower Meadow Dr,Dallas,TX,75243,32.895348,-96.74079,4.0,2.0,2516,8494.2,sqft,,359600.0,675000
9,2067047264,1924 Nemechek Dr,Dallas,TX,75217,32.745106,-96.700485,3.0,2.0,1400,7492.32,sqft,,,275000


## Walk and bike score:

The walk, bike and transit score have been collected based on the location of the property from [Walkscore](https://www.walkscore.com/)  using API and private key. The address contains apartment number (like APT,Unit,#) which gives wrong score with API. So the address are modified excluding the apartment numbers from street address of the properties.



In [8]:
# remove the apartment number

# street address in lower case
raw_data['modified_streetAddress'] = raw_data['streetAddress']
raw_data['modified_streetAddress']= raw_data['modified_streetAddress'].str.lower()


# conditions
conditions = [
    raw_data.modified_streetAddress.str.contains('apt'),
    raw_data.modified_streetAddress.str.contains('unit'),
    raw_data.modified_streetAddress.str.contains('#')
]

# choice based on conditions
choice =[
    raw_data.modified_streetAddress.apply(lambda x: x[:x.find('apt')]),
    raw_data.modified_streetAddress.apply(lambda x: x[:x.find('unit')]),
    raw_data.modified_streetAddress.apply(lambda x: x[:x.find('#')])
]

raw_data['modified_streetAddress'] =np.select(conditions,choice, default=raw_data['modified_streetAddress'])

In [9]:
raw_data['modified_streetAddress'] #print to  check address without apartment number

0         5335 bent tree forest dr 
1                6021 buffridge trl
2             6117 singing hills dr
3                7116 chinaberry rd
4               7135 fair oaks ave 
5              12909 jasoncrest trl
6                  3328 townsend dr
7                 5903 prospect ave
8             8511 flower meadow dr
9                  1924 nemechek dr
10           10724 park village pl 
11          5200 keller springs rd 
12                 2202 lawndale dr
13             12220 brookmeadow ln
14                 4107 bowser ave 
15             6218 liberty hill ln
16                   4715 stokes st
17              9030 meadowknoll dr
18                  9803 walnut st 
19                2535 kathleen ave
20              14228 open range dr
21              10230 china tree dr
22                4607 garrison ave
23              8307 stony creek dr
24               9921 greenfield dr
25                 2740 san jose dr
26               4122 avondale ave 
27                     4062 

In [25]:
# walkscore data collection function

def add_feature_walkscore(data):
    w_url = 'https://api.walkscore.com/score'

    # initial empty list
    walk_val = []
    bike_val = []
    for key,row in data.iterrows():
        param = '?format=json&address='+row['modified_streetAddress']+row['city']+row['state']+str(row['zipcode'])+'&lat='+str(row['latitude'])+'&lon='+str(row['longitude'])+'&transit=0&bike=1&wsapikey='+ walkAPIkey
        final_url = w_url + param
        w_response = requests.get(final_url)

        if w_response.status_code == 200:
            if 'walkscore' in w_response.json():
                # print(w_response.json()['walkscore'])
                walk_val.append( w_response.json()['walkscore'])
            else:
                walk_val.append( float('nan'))
        if 'bike' in w_response.json():
            bike_val.append(w_response.json()['bike']['score'])
        else:
            bike_val.append(float('nan'))

    return walk_val,bike_val


In [26]:
# # walkscore data collection
#
# w_url = 'https://api.walkscore.com/score'
#
# # initialize empty list
# walk_val =[]
# bike_val =[]
# # tran_val =[]
#
# for key,row in raw_data.iterrows() :
#     param = '?format=json&address='+row['modified_streetAddress']+row['city']+row['state']+str(row['zipcode'])+'&lat='+str(row['latitude'])+'&lon='+str(row['longitude'])+'&transit=0&bike=1&wsapikey='+ walkAPIkey
#     final_url = w_url + param
#     w_response = requests.get(final_url)
#     # print(w_response.json())
#     # print('==============================================')
#
#     if w_response.status_code == 200:
#         if 'walkscore' in w_response.json():
#             # print(w_response.json()['walkscore'])
#             walk_val.append( w_response.json()['walkscore'])
#         else:
#             walk_val.append( float('nan'))
#         if 'bike' in w_response.json():
#             bike_val.append(w_response.json()['bike']['score'])
#         else:
#             bike_val.append(float('nan'))
#         # if 'transit' in w_response.json():
#         #     tran_val.append(w_response.json()['transit']['score'])
#         # else:
#         #     tran_val.append(float('nan'))
#
#     # print(w_response.status_code)
#     # print(w_response.text)

# print('walkscore=', walk_val)
# print('bikescore',bike_val)

bike_val,walk_val = add_feature_walkscore(raw_data)

raw_data['walkscore'] = pd.Series(walk_val)#, index=[0,1])
raw_data['bikescore'] = pd.Series(bike_val)#, index=[0,1])
# raw_data['transitscore']=pd.Series(tran_val)

In [27]:
raw_data

Unnamed: 0,zpid,streetAddress,city,state,zipcode,latitude,longitude,bedrooms,bathrooms,livingArea,lotAreaValue,lotAreaUnit,rentZestimate,taxAssessedValue,price,modified_streetAddress,walkscore,bikescore
0,2087749202,5335 Bent Tree Forest Dr APT 253,Dallas,TX,75248,32.965416,-96.81531,1.0,1.0,678,8.766,acres,1150.0,,168500,5335 bent tree forest dr,44.0,43.0
1,26631634,6021 Buffridge Trl,Dallas,TX,75252,33.007687,-96.80325,4.0,3.0,2681,6534.0,sqft,,417708.0,575000,6021 buffridge trl,33.0,24.0
2,26823842,6117 Singing Hills Dr,Dallas,TX,75241,32.673035,-96.80349,3.0,1.0,1272,6795.36,sqft,1395.0,155290.0,210750,6117 singing hills dr,40.0,40.0
3,26892234,7116 Chinaberry Rd,Dallas,TX,75249,32.638004,-96.94526,4.0,2.0,1796,6751.8,sqft,1990.0,178710.0,335000,7116 chinaberry rd,36.0,32.0
4,2126105453,7135 Fair Oaks Ave APT 16,Dallas,TX,75231,32.8732,-96.75794,2.0,2.0,1164,2.986,acres,2026.0,,149900,7135 fair oaks ave,59.0,67.0
5,26882902,12909 Jasoncrest Trl,Dallas,TX,75243,32.924534,-96.71153,4.0,2.0,2178,0.193,sqft,2400.0,246700.0,399000,12909 jasoncrest trl,40.0,18.0
6,26808606,3328 Townsend Dr,Dallas,TX,75229,32.898525,-96.86709,3.0,2.0,2325,10203.0,sqft,2913.0,320460.0,648700,3328 townsend dr,75.0,61.0
7,26690071,5903 Prospect Ave,Dallas,TX,75206,32.81513,-96.76566,2.0,2.0,1544,9016.92,sqft,2699.0,415000.0,799000,5903 prospect ave,67.0,77.0
8,26893476,8511 Flower Meadow Dr,Dallas,TX,75243,32.895348,-96.74079,4.0,2.0,2516,8494.2,sqft,,359600.0,675000,8511 flower meadow dr,49.0,45.0
9,2067047264,1924 Nemechek Dr,Dallas,TX,75217,32.745106,-96.700485,3.0,2.0,1400,7492.32,sqft,,,275000,1924 nemechek dr,44.0,46.0


## Extract feature with zipcode:

Now, some additional features will be added based on the zipcode in the raw data. The features include - **violent  and property crime index**, **cost of living**,**average salary**, **student teacher ratio**, **air quality index** and **number of people per household:** .

In [16]:
# %run makeRequest.py # request make with parameters

In [17]:
# finding out the features using zipcode :
# viloent crime, property crime
# cost of living
# avarage salary
# student teacher ratio
# air quality index
# number of people per household


def add_feature_zipcode(zcode):
    '''
    Funtion to find out the extracted features
    from besplace.net based on the zipcode
    :param zcode
    :return: z_feature[violent crime, property crime, cost of living,avg salary,st_ratio,air quality index, number of people per household]
    '''
    z_website= "https://www.bestplaces.net/"
    location = "/zip-code/texas/dallas/"
    z_features =[]

    # extract feature
    extract_features = ['crime','cost_of_living','jobs','education','health','people']
    # feature matrix columns : violent_crime, property_crime, cost_of_living, avg_salary,st_ratio,air_quality_index,n_perHousehold
    for f in extract_features:
        url = z_website + f +location +str(zcode)
        r = requests.get(url, verify = False) # verify removes SSL certification error
        # r = make_request(url,n_try = 5, b_factor = 0.3)
        soup = bs(r.text,'html.parser')

        if f == 'crime':
            divs = soup.find_all("div", {"class": "col-md-12"})
            # find violent crime index
            soup_split = str(divs[1]).split('violent crime is ')[1]
            z_features.append(re.findall("[0-9]+[\.][0-9]+", soup_split)[0])


            # find property crime index
            soup_split = str(divs[1]).split('property crime is ')[1]
            z_features.append(re.findall("[0-9]+[\.][0-9]+", soup_split)[0])

        elif f == 'cost_of_living':
            divs = soup.find_all("div", {"class": "col-md-12"})
            soup_split = str(divs[1]).split('cost of living is')[1]
            z_features.append(re.findall("[0-9\.]+", soup_split)[0])

        elif f == 'jobs':
            divs = soup.find_all("div", {"class": "col-md-12"})
            soup_split = str(divs[2]).split('average salary in ')[1]
            # print(type(soup_split))
            z_features.append(re.findall("\$[0-9\,]+", soup_split)[0])

        elif f == 'education':
            divs = soup.find_all("div", {"class": "col-md-12"})
            soup_split = str(divs[1]).split("pupils per teacher")[0]
            z_features.append(len(re.findall("[0-9]+", soup_split))-1)

        elif f == 'health':
            divs = soup.find_all("div", {"class": "display-4"})
            z_features.append(re.findall("\>([0-9]+)", str(divs[3]))[0])

        else:
            divs = soup.find_all("div", {"class": "col-md-12"})
            soup_split = str(divs[1]).split("number of people per household ")[1]
            z_features.append(re.findall("[0-9\.]+", soup_split)[1])

    return z_features


In [18]:
# indexes of extracted features using zipcode
z_index = ["SerialNo","violent_crime","property_crime","cost_of_living",
           "avg_sal","st_ratio","air_quality","n_per_household"]

zipcode_features = pd.DataFrame(columns=z_index)
error_key =[]
f =open('error_index.txt','w+')

for key,value in enumerate(raw_data['zipcode']):
    try :
        zip_feature=[]
        zip_feature = add_feature_zipcode(value)
        zip_feature.insert(0,key)
        zipcode_features.loc[len(zipcode_features.index)]=zip_feature
    except:
        print("Error !!! Error index :",key)
        error_key.append(key)
        print(f.write(str(key)))



f.close()
error_key

[]

In [19]:
# the web scrapping is completed for all the zipcodes from the raw_data.
# Sometimes, more calls are made than the calls per second for that websites.
# so it skips some index. So "Serial No" column was added to track the missing index
# As there is no missing index, the "Serial No" column is dropped here.
zip_data =zipcode_features.drop(["SerialNo"], axis=1)
zip_data

Unnamed: 0,violent_crime,property_crime,cost_of_living,avg_sal,st_ratio,air_quality,n_per_household
0,35.2,49.4,129.0,"$68,745",10,49,2.2
1,42.2,56.5,124.0,"$55,237",10,48,2.1
2,70.1,76.0,92.1,"$32,258",10,25,3.0
3,37.9,50.8,100.2,"$53,624",10,59,3.0
4,66.4,74.8,92.5,"$32,011",10,47,2.5
5,63.7,72.7,102.0,"$36,010",10,48,2.5
6,32.4,44.1,131.3,"$74,274",10,47,3.0
7,47.6,61.2,132.5,"$52,889",10,44,1.8
8,63.7,72.7,102.0,"$36,010",10,48,2.5
9,60.6,67.8,92.1,"$35,786",10,50,3.8


In [28]:
# concat the zipcode features with raw data
final_rawData = pd.concat([raw_data.drop(['modified_streetAddress'],axis=1),zip_data],axis=1)
final_rawData

Unnamed: 0,zpid,streetAddress,city,state,zipcode,latitude,longitude,bedrooms,bathrooms,livingArea,lotAreaValue,lotAreaUnit,rentZestimate,taxAssessedValue,price,walkscore,bikescore,violent_crime,property_crime,cost_of_living,avg_sal,st_ratio,air_quality,n_per_household
0,2087749202,5335 Bent Tree Forest Dr APT 253,Dallas,TX,75248,32.965416,-96.81531,1.0,1.0,678,8.766,acres,1150.0,,168500,44.0,43.0,35.2,49.4,129.0,"$68,745",10,49,2.2
1,26631634,6021 Buffridge Trl,Dallas,TX,75252,33.007687,-96.80325,4.0,3.0,2681,6534.0,sqft,,417708.0,575000,33.0,24.0,42.2,56.5,124.0,"$55,237",10,48,2.1
2,26823842,6117 Singing Hills Dr,Dallas,TX,75241,32.673035,-96.80349,3.0,1.0,1272,6795.36,sqft,1395.0,155290.0,210750,40.0,40.0,70.1,76.0,92.1,"$32,258",10,25,3.0
3,26892234,7116 Chinaberry Rd,Dallas,TX,75249,32.638004,-96.94526,4.0,2.0,1796,6751.8,sqft,1990.0,178710.0,335000,36.0,32.0,37.9,50.8,100.2,"$53,624",10,59,3.0
4,2126105453,7135 Fair Oaks Ave APT 16,Dallas,TX,75231,32.8732,-96.75794,2.0,2.0,1164,2.986,acres,2026.0,,149900,59.0,67.0,66.4,74.8,92.5,"$32,011",10,47,2.5
5,26882902,12909 Jasoncrest Trl,Dallas,TX,75243,32.924534,-96.71153,4.0,2.0,2178,0.193,sqft,2400.0,246700.0,399000,40.0,18.0,63.7,72.7,102.0,"$36,010",10,48,2.5
6,26808606,3328 Townsend Dr,Dallas,TX,75229,32.898525,-96.86709,3.0,2.0,2325,10203.0,sqft,2913.0,320460.0,648700,75.0,61.0,32.4,44.1,131.3,"$74,274",10,47,3.0
7,26690071,5903 Prospect Ave,Dallas,TX,75206,32.81513,-96.76566,2.0,2.0,1544,9016.92,sqft,2699.0,415000.0,799000,67.0,77.0,47.6,61.2,132.5,"$52,889",10,44,1.8
8,26893476,8511 Flower Meadow Dr,Dallas,TX,75243,32.895348,-96.74079,4.0,2.0,2516,8494.2,sqft,,359600.0,675000,49.0,45.0,63.7,72.7,102.0,"$36,010",10,48,2.5
9,2067047264,1924 Nemechek Dr,Dallas,TX,75217,32.745106,-96.700485,3.0,2.0,1400,7492.32,sqft,,,275000,44.0,46.0,60.6,67.8,92.1,"$35,786",10,50,3.8


In [29]:
filename  = 'final_rawData.csv'
folder ='final_rawData/'
filepath = directory+folder
os.makedirs(filepath,exist_ok=True)
file = filepath+filename
final_rawData.to_csv(file)

In [None]:
# # sc = SparkContext(master = 'local')
# rdd1 = sc.parallelize([1,2,3])

In [None]:
# rdd1.collect()

In [None]:
# frdd = sc.textFile("shakespeare.txt").collect()

In [None]:
# frdd.collect()

In [None]:
# example = add_feature_zipcode(75243)

In [None]:
# example

In [None]:
# re.findall("(\$)[0-9]+(\,)[0-9]+",example)