### Libraries

In [1]:
import pandas as pd
import re
import requests

**Objective**: Extract and transform the information in a CSV file to obtain structured details about motorhome areas.

In [2]:
# Load the raw listing of motorhome areas and preview the first rows.
# NOTE(review): the captured preview shows mojibake (e.g. "BarraÃ±a"), which hints at
# an encoding mismatch somewhere in the pipeline — confirm the file's real encoding.
df = pd.read_csv("../files/01-areas_ac.csv")
df.head()

Unnamed: 0,WKT,NOMBRE
0,POINT (-0.4098 39.06956 0.0),Area AC Hort de Soriano (Carcaixent) (Gratuit...
1,POINT (-8.89696 42.64159 0.0),Area de ac playa de BarraÃ±a (Boiro) (Gratuit...
2,POINT (0.35921 42.90715 0.0),Arreau Ã¡rea AC municipal (2 eurosAgua:sivaci...
3,POINT (-0.41007 38.39479 0.0),Camper Area Campello Beach (14eur/nocheAgua:...
4,POINT (-8.44443 43.37179 0.0),A CoruÃ±a Gratuito: Si Pernocta: Si WC: Si Ag...


In [3]:
# Count the missing values in each column
df.isna().sum()

WKT       0
NOMBRE    0
dtype: int64

In [4]:
# Inspect the dtype pandas inferred for each column
df.dtypes

WKT       object
NOMBRE    object
dtype: object

In [5]:
# Count the rows that repeat an earlier row in every column
df.duplicated().sum()

0

### EXTRACT COLUMN INFORMATION
---

In [6]:
# Function to get coordinates
def get_latitude(col):
    """Return the latitude held in a WKT point string.

    The text is expected to look like ``POINT (-0.4098 39.06956 0.0)``; the
    second number found in it is taken as the latitude.

    Args:
        col: WKT text describing a single point.

    Returns:
        The latitude exactly as written in the text (a string, not a float).

    Raises:
        IndexError: if the text contains fewer than two numbers.
    """
    # Raw string: "\d" in a normal string is an invalid escape sequence.
    # The fractional part is optional so whole-number coordinates such as "41"
    # are still counted instead of shifting the positions of the other values.
    pattern = r"-?\d+(?:\.\d+)?"
    return re.findall(pattern, col)[1]

def get_longitude(col):
    """Return the longitude held in a WKT point string.

    The text is expected to look like ``POINT (-0.4098 39.06956 0.0)``; the
    first number found in it is taken as the longitude.

    Args:
        col: WKT text describing a single point.

    Returns:
        The longitude exactly as written in the text (a string, not a float).

    Raises:
        IndexError: if the text contains no number at all.
    """
    # Raw string: "\d" in a normal string is an invalid escape sequence.
    # The fractional part is optional so a whole-number longitude such as "2"
    # is not skipped in favour of the next decimal value.
    pattern = r"-?\d+(?:\.\d+)?"
    return re.findall(pattern, col)[0]

In [7]:
# Add the latitude extracted from each WKT point as a new column
df["latitude"] = df["WKT"].map(get_latitude)

In [8]:
# Add the longitude extracted from each WKT point as a new column
df["longitude"] = df["WKT"].map(get_longitude)

In [9]:
# Preview two rows to check the coordinate columns
df.head(2)

Unnamed: 0,WKT,NOMBRE,latitude,longitude
0,POINT (-0.4098 39.06956 0.0),Area AC Hort de Soriano (Carcaixent) (Gratuit...,39.06956,-0.4098
1,POINT (-8.89696 42.64159 0.0),Area de ac playa de BarraÃ±a (Boiro) (Gratuit...,42.64159,-8.89696


In [10]:
# Classify each area as free or paying
def area_price(col):
    """Classify an area description as ``"Free"`` or ``"Paying"``.

    The rules are applied in order and the first one that matches wins:

    1. "gratuito" followed by "no" (optional colons/whitespace in between)
       -> ``"Paying"``.
    2. any other mention of "gratuito" -> ``"Free"``.
    3. any digit in the text (treated as a price) -> ``"Paying"``.
    4. nothing matched -> ``"Free"``.

    Only the first letter of "gratuito" and of "no" may be upper or lower case.

    Args:
        col: free-text description of the area.

    Returns:
        ``"Free"`` or ``"Paying"``.
    """
    # Character classes list their members directly: "[G|g]" would also accept
    # a literal "|", which was never intended.
    not_free = r"[Gg]ratuito:*\s*[Nn]o"
    free = r"[Gg]ratuito"
    # One digit anywhere is enough; a leading ".*" adds nothing to a search.
    has_digit = r"\d"

    if re.search(not_free, col):
        return "Paying"
    if re.search(free, col):
        return "Free"
    if re.search(has_digit, col):
        return "Paying"
    return "Free"

In [11]:
# Label every area as "Free" or "Paying" based on its description
df["price"] = df["NOMBRE"].map(area_price)

In [12]:
# Preview two rows to check the new price column
df.head(2)

Unnamed: 0,WKT,NOMBRE,latitude,longitude,price
0,POINT (-0.4098 39.06956 0.0),Area AC Hort de Soriano (Carcaixent) (Gratuit...,39.06956,-0.4098,Free
1,POINT (-8.89696 42.64159 0.0),Area de ac playa de BarraÃ±a (Boiro) (Gratuit...,42.64159,-8.89696,Free


In [13]:
# Distribution of free vs paying areas
df["price"].value_counts()

Free      552
Paying    361
Name: price, dtype: int64

In [14]:
# Detect whether an area offers a water service
def get_water(col):
    """Report whether a description says the area has water.

    Args:
        col: free-text description of the area.

    Returns:
        ``"no"`` when "agua" is followed (after optional colons/whitespace) by
        "n"/"N" -- the trailing "o" is optional, so "Agua: N" also counts;
        ``"yes"`` for any other mention of "agua";
        ``"unknown"`` when "agua" does not appear at all.
        Only the first letter of "agua" may be upper or lower case.
    """
    # Character classes list their members directly: "[A|a]" would also accept
    # a literal "|", which was never intended.
    pattern_no = r"[Aa]gua:*\s*[Nn]o*"
    # Everything after "agua" is optional here, so a bare mention is enough.
    pattern_yes = r"[Aa]gua:*\s*[Ss]*i*"

    if re.search(pattern_no, col):
        return "no"
    if re.search(pattern_yes, col):
        return "yes"
    return "unknown"

In [15]:
# Flag the water service reported in each description
df["water"] = df["NOMBRE"].map(get_water)

In [16]:
# Preview two rows to check the new water column
df.head(2)

Unnamed: 0,WKT,NOMBRE,latitude,longitude,price,water
0,POINT (-0.4098 39.06956 0.0),Area AC Hort de Soriano (Carcaixent) (Gratuit...,39.06956,-0.4098,Free,yes
1,POINT (-8.89696 42.64159 0.0),Area de ac playa de BarraÃ±a (Boiro) (Gratuit...,42.64159,-8.89696,Free,yes


In [17]:
# Distribution of the water flag (yes / no / unknown)
df["water"].value_counts()

yes        885
no          19
unknown      9
Name: water, dtype: int64

In [18]:
# Detect whether an area has a WC
def get_wc(col):
    """Report whether a description says the area has a WC.

    Args:
        col: free-text description of the area.

    Returns:
        ``"no"`` when "wc" (either letter in any case) is followed, after
        optional colons/whitespace, by "n"/"N" -- the trailing "o" is optional,
        so "WC: N" also counts;
        ``"yes"`` for any other mention of "wc";
        ``"unknown"`` when "wc" does not appear at all.
    """
    # Character classes list their members directly: "[W|w]" would also accept
    # a literal "|", which was never intended.
    pattern_no = r"[Ww][Cc]:*\s*[Nn]o*"
    # Everything after "wc" is optional here, so a bare mention is enough.
    pattern_yes = r"[Ww][Cc]:*\s*[Ss]*i*"

    if re.search(pattern_no, col):
        return "no"
    if re.search(pattern_yes, col):
        return "yes"
    return "unknown"

In [19]:
# Flag the WC availability reported in each description
df["wc"] = df["NOMBRE"].map(get_wc)

In [20]:
# Preview two rows to check the new wc column
df.head(2)

Unnamed: 0,WKT,NOMBRE,latitude,longitude,price,water,wc
0,POINT (-0.4098 39.06956 0.0),Area AC Hort de Soriano (Carcaixent) (Gratuit...,39.06956,-0.4098,Free,yes,yes
1,POINT (-8.89696 42.64159 0.0),Area de ac playa de BarraÃ±a (Boiro) (Gratuit...,42.64159,-8.89696,Free,yes,yes


In [21]:
# Distribution of the wc flag (yes / no / unknown)
df["wc"].value_counts()

no         511
yes        363
unknown     39
Name: wc, dtype: int64

In [22]:
# Detect whether an area offers grey water disposal
def water_disposal(col):
    """Report whether a description says the area has grey water disposal.

    Args:
        col: free-text description of the area.

    Returns:
        ``"no"`` when "vaciado" is followed (after optional colons/whitespace)
        by "n"/"N" -- the trailing "o" is optional, so "Vaciado: N" also counts;
        ``"yes"`` for any other mention of "vaciado";
        ``"unknown"`` when "vaciado" does not appear at all.
        Only the first letter of "vaciado" may be upper or lower case.
    """
    # Character classes list their members directly: "[V|v]" would also accept
    # a literal "|", which was never intended.
    pattern_no = r"[Vv]aciado:*\s*[Nn]o*"
    # Everything after "vaciado" is optional here, so a bare mention is enough.
    pattern_yes = r"[Vv]aciado:*\s*[Ss]*i*"

    if re.search(pattern_no, col):
        return "no"
    if re.search(pattern_yes, col):
        return "yes"
    return "unknown"

In [23]:
# Flag the grey water disposal reported in each description
df["grey_water"] = df["NOMBRE"].map(water_disposal)

In [24]:
# Preview two rows to check the new grey_water column
df.head(2)

Unnamed: 0,WKT,NOMBRE,latitude,longitude,price,water,wc,grey_water
0,POINT (-0.4098 39.06956 0.0),Area AC Hort de Soriano (Carcaixent) (Gratuit...,39.06956,-0.4098,Free,yes,yes,yes
1,POINT (-8.89696 42.64159 0.0),Area de ac playa de BarraÃ±a (Boiro) (Gratuit...,42.64159,-8.89696,Free,yes,yes,unknown


In [25]:
# Distribution of the grey_water flag (yes / no / unknown)
df["grey_water"].value_counts()

yes        629
unknown    278
no           6
Name: grey_water, dtype: int64

In [26]:
# Drop the raw source columns now that every derived column has been extracted.
# `columns=` already names the axis, so `axis=1` is redundant; reassigning the
# result avoids mutating the frame in place.
df = df.drop(columns=["WKT", "NOMBRE"])

In [27]:
# Persist the cleaned table for the next step.
# NOTE(review): with the default settings the row index is written as an unnamed
# first column; pass index=False if downstream readers do not expect it.
df.to_csv("../files/02-areas_clean.csv")