# Impute Missing Station Name with DataWig

## Import Packages

In [19]:
# Import necessary packages
from google.cloud import bigquery
import os
import pandas as pd
import datawig

## Import Data from Bigquery

In [2]:
# Set environment variables
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'acoustic-portal-322707-496a5c838490.json'
# initiates BQ service
bigquery_client = bigquery.Client()
# Write Query on BQ
QUERY = """
SELECT
    ride_id,
    start_station_name,
    start_lat,
    start_lng,
    end_station_name,
    end_lat,
    end_lng
FROM
  `bike_dataset.bike_df_missing_station`
;
  """
# Run the query and write result to a pandas data frame
Query_Results = bigquery_client.query(QUERY)
bike_df = Query_Results.to_dataframe()

## Checking Null values

In [3]:
# check for null values
print(bike_df.isna().sum())

ride_id                    0
start_station_name    801888
start_lat                  0
start_lng                  0
end_station_name      844332
end_lat                    0
end_lng                    0
dtype: int64


## Impute Missing Values of Start Station Name using DataWig based on latitude and longitude

In [4]:
# Split dataframe into train and test subsets
df_train, df_test = datawig.utils.random_split(bike_df)

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['start_lat', 'start_lng'], # column(s) containing information about the column we want to impute
    output_column= 'start_station_name', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
)

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=10)

#Impute missing values and return original dataframe with predictions
imputed_start_station = imputer.predict(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
2023-01-31 00:05:54,826 [INFO]  
2023-01-31 00:07:16,961 [INFO]  Epoch[0] Batch [0-49955]	Speed: 9732.03 samples/sec	cross-entropy=0.162354	start_station_name-accuracy=0.969474
2023-01-31 00:08:43,166 [INFO]  Epoch[0] Train-cross-entropy=0.122119
2023-01-31 00:08:43,168 [INFO]  Epoch[0] Train-start_station_name-accuracy=0.981645
2023-01-31 00:08:43,168 [INFO]  Epoch[0] Time cost=168.339
2023-01-31 00:08:43,173 [INFO]  Saved checkpoint to "imputer_model/model-0000.params"
2023-01-31 00:08:50,699 [INFO]  Epoch[0] Validation-cross-entropy=0.056008
2023-01-31 00:08:50,700 [INFO]  Epoch[0] Validation-start_station_name-accuracy=0.999058
2023-01-31 00:10:21,006 [INFO]  Epoch[1] Batch [0-49955]	Speed: 8851.33 sa

In [5]:
# Check for null values
print(imputed_start_station.isna().sum())

ride_id                                  0
start_station_name                  160485
start_lat                                0
start_lng                                0
end_station_name                    168928
end_lat                                  0
end_lng                                  0
start_station_name_imputed               0
start_station_name_imputed_proba         0
dtype: int64


In [6]:
# Check for new generated columns
imputed_start_station[['start_station_name_imputed', 'start_station_name_imputed_proba']].head(20)

Unnamed: 0,start_station_name_imputed,start_station_name_imputed_proba
4008280,Halsted St & Polk St,1.0
705745,Millennium Park,1.0
1354033,Sheridan Rd & Irving Park Rd,1.0
2808282,Orleans St & Merchandise Mart Plaza,1.0
683942,Wabash Ave & Roosevelt Rd,1.0
1894742,Loomis St & Lexington St,1.0
1950797,Wells St & Hubbard St,1.0
4791471,DuSable Lake Shore Dr & Diversey Pkwy,1.0
1338903,Halsted St & Polk St,0.999987
3178517,Racine Ave & Fullerton Av,0.999999


## Impute Missing Values of End Station Name using DataWig based on latitude and longitude

In [7]:
# Split dataframe into train and test subsets
df_train, df_test = datawig.utils.random_split(bike_df)

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['end_lat', 'end_lng'], # column(s) containing information about the column we want to impute
    output_column= 'end_station_name', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
)

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=10)

#Impute missing values and return original dataframe with predictions
imputed_end_station = imputer.predict(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
  return np.log(probas)


In [8]:
# Check for null values
print(imputed_end_station.isna().sum())

ride_id                                0
start_station_name                160485
start_lat                              0
start_lng                              0
end_station_name                  168928
end_lat                                0
end_lng                                0
end_station_name_imputed               0
end_station_name_imputed_proba         0
dtype: int64


In [9]:
# Check for new generated columns
imputed_end_station[['end_station_name_imputed', 'end_station_name_imputed_proba']].head(20)

Unnamed: 0,end_station_name_imputed,end_station_name_imputed_proba
4008280,Ellis Ave & 55th St,0.59464
705745,Indiana Ave & Roosevelt Rd,0.561914
1354033,Clark St & Wellington Av,0.997078
2808282,Larrabee St & Kingsbury St,0.999998
683942,Halsted St & Polk St,1.0
1894742,Ellis Ave & 55th St,0.996378
1950797,Orleans St & Merchandise Mart Plaza,0.988646
4791471,DuSable Lake Shore Dr & Belmont Av,0.999998
1338903,Clinton St & Madison St,0.528859
3178517,Sheffield Ave & Fullerton Av,0.999988


## Joining imputed station name into one dataframe

In [10]:
# Create new dataframe that contains imputed station name
clean_station_name = imputed_start_station[['ride_id', 'start_station_name_imputed', 'start_lat', 'start_lng']]

In [12]:
imputed_end_station = imputed_end_station[['ride_id', 'end_station_name_imputed', 'end_lat', 'end_lng']]

In [13]:
# Join two dataframe that have new imputed station name into one
clean_station_name = clean_station_name.merge(imputed_end_station, on='ride_id')

## Checking null values

In [15]:
# Check for null values for new dataframe
print(clean_station_name.isnull().sum())

ride_id                       0
start_station_name_imputed    0
start_lat                     0
start_lng                     0
end_station_name_imputed      0
end_lat                       0
end_lng                       0
dtype: int64


## Checking duplicate based on ride_id

In [16]:
# Checking for duplicate values
duplicates = clean_station_name.duplicated(subset=['ride_id'])
clean_station_name[duplicates]

Unnamed: 0,ride_id,start_station_name_imputed,start_lat,start_lng,end_station_name_imputed,end_lat,end_lng


## Rename column name

In [17]:
# Rename the column names
clean_station_name = clean_station_name.rename(columns={'start_station_name_imputed':'start_station_name',
                                                        'end_station_name_imputed':'end_station_name'})

In [18]:
# Final check before upload to bigquery
clean_station_name.sample(20)

Unnamed: 0,ride_id,start_station_name,start_lat,start_lng,end_station_name,end_lat,end_lng
552866,91AC2F46B44A1663,Pine Grove Ave & Irving Park Rd,41.95,-87.77,Kedzie Ave & Milwaukee Av,41.95,-87.71
1094241,B312F7918EC5DBF2,Sheridan Rd & Irving Park Rd,41.958494,-87.654966,Sheridan Rd & Irving Park Rd,41.958494,-87.654966
381821,E95134932647D4B0,Wells St & Hubbard St,41.891044,-87.635443,Wabash Ave & Roosevelt Rd,41.870769,-87.625734
306452,DE4D057D3D2BC3FB,Streeter Dr & Grand Av,41.89,-87.61,Clark St & Wrightwood Av,41.92,-87.64
78579,4EE4ACAE43CAA377,Orleans St & Merchandise Mart Plaza,41.888303,-87.636475,Loomis St & Lexington St,41.87295,-87.66913
957888,C11AF4DE4C25AB25,State St & Randolph St,41.884621,-87.627834,LaSalle St & Illinois St,41.894877,-87.632326
70269,D96CA14BD2B9B34F,Fairbanks Ct & Grand Av,41.89,-87.62,Bissell St & Armitage Av,41.913688,-87.652855
426497,DD75F451B9F62AFB,Indiana Ave & Roosevelt Rd,41.8349,-87.61793,Clinton St & Madison St,41.841707,-87.626938
219737,9CE7D2CEB6DD3E63,Sheridan Rd & Irving Park Rd,41.98,-87.67,Sheridan Rd & Irving Park Rd,41.97,-87.66
15162,AA72E073D05346A1,Southport Ave & Roscoe St,41.954688,-87.673838,Sheffield Ave & Waveland Av,41.949399,-87.654529


## Upload new imputed dataframe back to Bigquery

In [20]:
# Load client
client = bigquery.Client(project='acoustic-portal-322707')

# Define table name, in format dataset.table_name
table = 'bike_dataset.clean_station_name'

# Load data to BQ
job = client.load_table_from_dataframe(clean_station_name, table)