In [1]:
# coding: utf-8

"""
This file contains the text processing functions such as Tokenizing, StopWords, Stemming ..
"""
from __future__ import unicode_literals

import os
import codecs
import pandas as pd
import re
import numpy as np
# Project root: the parent of the directory the notebook is launched from.
PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [2]:
# Display the resolved project root (the parent of the current working directory).
PATH

'/home/sidoine/OneDrive/projet'

In [3]:
# All dataset files live under <PATH>/bikes_clustering/datasets.
_datasets_dir = os.path.join(PATH, "bikes_clustering", "datasets")

# INPUT: the Brisbane CityBike JSON file read by the loading cell.
input_data_path = os.path.join(_datasets_dir, "input", "Brisbane_CityBike.json")

# OUTPUT: target file, named for the location predictions
# (presumably written by a later step -- confirm against the rest of the notebook).
output_data_path = os.path.join(
    _datasets_dir, "output", "prediction_location_Brisbane_CityBike.json"
)

In [315]:
# Read the station records from the input JSON file into a DataFrame,
# then preview the first ten rows.
df = pd.read_json(input_data_path)
df.head(n=10)

Unnamed: 0,address,coordinates,id,latitude,longitude,name,position
0,Lower River Tce / Ellis St,,122,-27.4823,153.029,122 - LOWER RIVER TCE / ELLIS ST,
1,Main St / Darragh St,,91,-27.4706,153.036,91 - MAIN ST / DARRAGH ST,
2,,,88,,153.043,88 - SYDNEY ST FERRY TERMINAL / PARK,
3,Browne St / James St,,75,-27.4619,153.047,75 - BROWNE ST / JAMES ST,
4,Kurilpa Point / Montague Rd,,99,-27.4697,153.017,98 - KURILPA POINT / MONTAGUE RD,
5,Montague Rd / Skinner St,,109,-27.4817,153.004,109 - MONTAGUE RD / SKINNER ST,
6,Macquarie St / Guyatt Park,,149,-27.4936,153.001,,
7,Bi-centennial Bike Way / Lang Pde,,139,-27.4761,153.002,139 - BI-CENTENNIAL BIKE WAY / LANG PDE,
8,Sir William McGregor Dr / Sir Fred Schonnell,,24,-27.494,153.012,24 - SIR WILLIAM MCGREGOR DR / SIR FRED SCHONNELL,
9,Vulture St / Tribune St,,117,-27.4822,153.021,117 - VULTURE ST / TRIBUNE ST,


In [316]:
# Size of the raw frame as (rows, columns); the recorded run shows (150, 7).
df.shape

(150, 7)

In [317]:
# Cleaning, step 1: drop rows that carry no usable location at all.
# A row is kept when it has a `coordinates` value, or when both
# `latitude` and `longitude` are present.
has_coordinates = df["coordinates"].notnull()
has_lat_and_lon = df["longitude"].notnull() & df["latitude"].notnull()
df_1 = df[has_coordinates | has_lat_and_lon]
df_1.shape

(148, 7)

In [319]:
# Cleaning, step 2: keep only rows whose `longitude` and `latitude` are numbers.
# Inspecting the data showed the noise value 'not relevant' in those columns
# (2 rows); such rows are dropped here. NaN is a float, so rows that merely lack
# a value pass this filter and can still be filled from `coordinates` afterwards.
# numpy (`np`) is already imported in the first cell, so it is not re-imported here.
_NUMERIC_TYPES = (int, np.int64, float, np.float64)


def _is_numeric(value):
    """Return True when `value` is exactly one of the accepted numeric types."""
    return type(value) in _NUMERIC_TYPES


# One combined mask replaces two successive filters; the surviving rows are the same.
_numeric_lon = df_1.longitude.apply(_is_numeric)
_numeric_lat = df_1.latitude.apply(_is_numeric)
df_1 = df_1[_numeric_lon & _numeric_lat]
df_1.shape

(146, 7)

In [322]:
# Fill missing `latitude` / `longitude` from the `coordinates` dict when one exists.
# tmp_df holds the rows that have `coordinates` but lack latitude or longitude.
tmp_df = df_1[df_1.coordinates.notnull() & (df_1.longitude.isnull() | df_1.latitude.isnull())]

# fillna aligns the replacement Series on the index, so only tmp_df's rows can be filled.
# The filled columns are written back with `assign` instead of calling
# `fillna(inplace=True)` on `df_1.latitude`: an in-place fill on a column taken from a
# filtered frame is not guaranteed to update df_1 itself (SettingWithCopy /
# copy-on-write semantics), and `assign` returns a fresh frame either way.
df_1 = df_1.assign(
    latitude=df_1.latitude.fillna(tmp_df.coordinates.apply(lambda c: c['latitude'])),
    longitude=df_1.longitude.fillna(tmp_df.coordinates.apply(lambda c: c['longitude'])),
)
df_1.head()

Unnamed: 0,address,coordinates,id,latitude,longitude,name,position
0,Lower River Tce / Ellis St,,122,-27.482279,153.028723,122 - LOWER RIVER TCE / ELLIS ST,
1,Main St / Darragh St,,91,-27.47059,153.036046,91 - MAIN ST / DARRAGH ST,
3,Browne St / James St,,75,-27.461881,153.046986,75 - BROWNE ST / JAMES ST,
4,Kurilpa Point / Montague Rd,,99,-27.469658,153.016696,98 - KURILPA POINT / MONTAGUE RD,
5,Montague Rd / Skinner St,,109,-27.48172,153.00436,109 - MONTAGUE RD / SKINNER ST,


In [324]:
## Fill missing `coordinates` with a dict built from the row's 'latitude' and 'longitude'.
coordinates_from_lat_lon = df_1.apply(
    lambda row: {'latitude': row['latitude'], 'longitude': row['longitude']}, axis=1
)
## Write the filled column back with `assign` rather than `fillna(inplace=True)` on
## `df_1.coordinates`: the in-place form mutates a column pulled from a filtered frame
## and is not guaranteed to update df_1 (SettingWithCopy / copy-on-write semantics).
df_1 = df_1.assign(coordinates=df_1.coordinates.fillna(coordinates_from_lat_lon))
df_1.head()

Unnamed: 0,address,coordinates,id,latitude,longitude,name,position
0,Lower River Tce / Ellis St,"{'latitude': -27.482279, 'longitude': 153.028723}",122,-27.482279,153.028723,122 - LOWER RIVER TCE / ELLIS ST,
1,Main St / Darragh St,"{'latitude': -27.47059, 'longitude': 153.036046}",91,-27.47059,153.036046,91 - MAIN ST / DARRAGH ST,
3,Browne St / James St,"{'latitude': -27.461880999999998, 'longitude':...",75,-27.461881,153.046986,75 - BROWNE ST / JAMES ST,
4,Kurilpa Point / Montague Rd,"{'latitude': -27.469658, 'longitude': 153.016696}",99,-27.469658,153.016696,98 - KURILPA POINT / MONTAGUE RD,
5,Montague Rd / Skinner St,"{'latitude': -27.48172, 'longitude': 153.00436}",109,-27.48172,153.00436,109 - MONTAGUE RD / SKINNER ST,


In [325]:
# Size of the frame after the fill steps; the recorded run shows (146, 7).
df_1.shape

(146, 7)

In [327]:
## Keep only rows whose `coordinates` dict holds numeric values for both
## 'latitude' and 'longitude'. This step may be skipped if `coordinates` is not
## needed for the training steps, in which case df_1 is sufficient.
## Both checks go into a single mask over df_1: filtering df_1 twice and assigning
## each result to df_2 would let the second filter overwrite the first, so the
## latitude check would be lost.
_coord_types = [int, np.int64, float, np.float64]
_valid_coordinates = df_1.coordinates.apply(
    lambda c: type(c['latitude']) in _coord_types and type(c['longitude']) in _coord_types
)
df_2 = df_1[_valid_coordinates]

In [328]:
# Size of the frame after the coordinates filter; the recorded run shows (146, 7).
df_2.shape

(146, 7)

In [330]:
# Print the column dtypes and non-null counts of df_2.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146 entries, 0 to 149
Data columns (total 7 columns):
address        146 non-null object
coordinates    146 non-null object
id             146 non-null int64
latitude       146 non-null float64
longitude      146 non-null float64
name           145 non-null object
position       4 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 9.1+ KB
