# Cleaning Data with Python

In [1]:
# import library
import numpy as np
import pandas as pd

## Why clean data?
To make sure every variable has the correct data type for further analysis.

In [2]:
# Load the ride-sharing CSV.
# - The index is selected by column name ('bike_id') rather than by position,
#   so the load does not silently pick a different column if the CSV's column
#   order changes.
# - user_birth_year is parsed as a date, so each year becomes a datetime64
#   value (rendered as e.g. 1959-01-01).
path = 'data/csv/ride_sharing_new.csv'
ride_df = pd.read_csv(
    path,
    index_col='bike_id',
    parse_dates=['user_birth_year'],
)

In [3]:
# Drop the 'Unnamed: 0' column (presumably a row index written out when the
# CSV was saved -- confirm against the raw file).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError.
ride_df = ride_df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Preview the first rows to sanity-check the load and the column drop.
ride_df.head()

Unnamed: 0_level_0,duration,station_A_id,station_A_name,station_B_id,station_B_name,user_type,user_birth_year,user_gender
bike_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5480,12 minutes,81,Berry St at 4th St,323,Broadway at Kearny,2,1959-01-01,Male
5193,24 minutes,3,Powell St BART Station (Market St at 4th St),118,Eureka Valley Recreation Center,2,1965-01-01,Male
3652,8 minutes,67,San Francisco Caltrain Station 2 (Townsend St...,23,The Embarcadero at Steuart St,3,1993-01-01,Male
1883,4 minutes,16,Steuart St at Market St,28,The Embarcadero at Bryant St,1,1979-01-01,Male
4626,11 minutes,22,Howard St at Beale St,350,8th St at Brannan St,2,1994-01-01,Male


In [5]:
# Inspect dtypes and non-null counts before converting any column.
ride_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25760 entries, 5480 to 1705
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   duration         25760 non-null  object        
 1   station_A_id     25760 non-null  int64         
 2   station_A_name   25760 non-null  object        
 3   station_B_id     25760 non-null  int64         
 4   station_B_name   25760 non-null  object        
 5   user_type        25760 non-null  int64         
 6   user_birth_year  25760 non-null  datetime64[ns]
 7   user_gender      25760 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 1.4+ MB


In [6]:
# Summary statistics for the numeric columns. user_type is still int64 at this
# point, so it is summarised numerically (mean, std, ...).
ride_df.describe()

Unnamed: 0,station_A_id,station_B_id,user_type
count,25760.0,25760.0,25760.0
mean,31.023602,89.558579,2.008385
std,26.409263,105.144103,0.704541
min,3.0,3.0,1.0
25%,15.0,21.0,2.0
50%,21.0,58.0,2.0
75%,67.0,93.0,3.0
max,81.0,383.0,3.0


In [7]:
# user_type takes only a few distinct integer values; keep a categorical copy
# of it in a new column.
ride_df['user_type_cat'] = (
    ride_df['user_type']
    .astype('category')
)

In [8]:
# For a categorical column describe() reports count / unique / top / freq
# instead of numeric statistics.
ride_df['user_type_cat'].describe()

count     25760
unique        3
top           2
freq      12972
Name: user_type_cat, dtype: int64

In [9]:
# Remove the literal "minutes" unit and any surrounding whitespace, leaving
# only the digits. str.strip('minutes') is avoided on purpose: its argument is
# a *set of characters* trimmed from both ends, not a suffix, so it leaves the
# trailing space behind and would also trim any value that happens to start or
# end with one of the letters m, i, n, u, t, e, s.
ride_df['duration_time'] = (
    ride_df['duration']
    .str.replace('minutes', '', regex=False)
    .str.strip()
)

In [10]:
# Convert the cleaned duration strings to integers.
ride_df['duration_time'] = (
    ride_df['duration_time']
    .astype('int')
)

In [11]:
# List the columns to confirm the derived columns (user_type_cat,
# duration_time) were added.
ride_df.columns

Index(['duration', 'station_A_id', 'station_A_name', 'station_B_id',
       'station_B_name', 'user_type', 'user_birth_year', 'user_gender',
       'user_type_cat', 'duration_time'],
      dtype='object')

In [12]:
# The raw columns are superseded by duration_time and user_type_cat, so remove
# them from the frame.
ride_df.drop(columns=['duration', 'user_type'], inplace=True)

In [13]:
# Re-check dtypes after the conversions and after dropping the raw columns.
ride_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25760 entries, 5480 to 1705
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   station_A_id     25760 non-null  int64         
 1   station_A_name   25760 non-null  object        
 2   station_B_id     25760 non-null  int64         
 3   station_B_name   25760 non-null  object        
 4   user_birth_year  25760 non-null  datetime64[ns]
 5   user_gender      25760 non-null  object        
 6   user_type_cat    25760 non-null  category      
 7   duration_time    25760 non-null  int32         
dtypes: category(1), datetime64[ns](1), int32(1), int64(2), object(3)
memory usage: 1.2+ MB


## Out-of-range values
These usually come from miscounted ratings or out-of-range dates.

For example, look for dates later than today (`dt.date.today()`) after converting the column to datetime with `df['col'] = pd.to_datetime(df['col'])`.

### How to deal with them?
- Drop the values
- Set a custom minimum and maximum
- Replace with the average value
- Fill forward (ffill) or backward (bfill)


### 

In [15]:
# Keep only the year component of the parsed birth date, stored in a new
# column.
ride_df['user_year_date'] = (
    ride_df['user_birth_year']
    .dt.year
)

In [22]:
# The datetime column is no longer needed once the year has been extracted
# into user_year_date.
ride_df.drop(columns='user_birth_year', inplace=True)

In [23]:
# Final dtype check after replacing the datetime column with the year column.
ride_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25760 entries, 5480 to 1705
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   station_A_id    25760 non-null  int64   
 1   station_A_name  25760 non-null  object  
 2   station_B_id    25760 non-null  int64   
 3   station_B_name  25760 non-null  object  
 4   user_gender     25760 non-null  object  
 5   user_type_cat   25760 non-null  category
 6   duration_time   25760 non-null  int32   
 7   user_year_date  25760 non-null  int64   
dtypes: category(1), int32(1), int64(3), object(3)
memory usage: 1.7+ MB


## Duplicate values
Usually come from bugs.
### How to deal with them?
- Find duplicates: `df.duplicated(subset=list_col, keep='first' | 'last' | False)`
- Drop duplicates: `.drop_duplicates(subset=list_col, keep='first' | 'last' | False, inplace=bool)`

# Text and categorical data problems

## Membership constraint
- Drop data
- Remapping categories
- Inferring Categories

## Common categorical problems
- Inconsistency (uppercase, lowercase, leading/trailing whitespace): .str.upper(), .str.lower(), .str.strip()
- Mapping categories: pd.cut, .replace()

## Cleaning Text data
- Replace text with .str.replace()
- Replace malformed text with a null value (np.nan)
- regular expression


# Advanced data problems
## Uniformity
- Plot with a scatter plot to check uniformity
- Multiple datetime formats: unify them with pd.to_datetime(df['col'], infer_datetime_format=True, errors='coerce')

## Cross field validation
- Check consistency between two or more related columns

## Completeness
- drop missing data
- ffill or bfill
- with statistical measures
- algorithmic approach
- machine learning model

# Record linkage

## Comparing strings
- Minimum edit distance algorithm (fuzzywuzzy)
- record linkage
- linking data frame