In [1]:
def load_raw_data():
    """Load restaurant reviews from 2017 by joining the business and review CSVs.

    Reads ``file_path + file_name_business`` (columns ``business_id`` and
    ``categories``) and ``file_path + file_name_review`` (columns
    ``review_id``, ``business_id``, ``date``, ``stars``, ``text``), keeps the
    businesses whose ``categories`` contains ``'Restaurants'`` and the reviews
    dated in 2017, inner-joins them on ``business_id`` and adds a ``text_len``
    column holding the character length of each review text.

    A one-line summary (rows, columns, shallow memory usage) is printed.

    Returns:
        The merged ``DataFrame``.
    """
    # Read business file and keep only restaurant businesses.
    business_cols = ['business_id', 'categories']
    df_business = pd.read_csv(file_path + file_name_business, usecols=business_cols)
    # na=False: a missing `categories` value counts as "not a restaurant".
    # Without it the mask contains NaN and boolean indexing raises ValueError.
    is_restaurant = df_business['categories'].str.contains('Restaurants', na=False)
    # Select rows and the single remaining column in one step instead of
    # dropping a column in place on a filtered slice.
    df_business = df_business.loc[is_restaurant, ['business_id']]

    # Load reviews data and keep only those written in 2017.
    review_cols = ['review_id', 'business_id', 'date', 'stars', 'text']
    df_reviews = pd.read_csv(file_path + file_name_review, usecols=review_cols)
    df_reviews['date'] = pd.to_datetime(df_reviews['date'])
    df_reviews = df_reviews[df_reviews['date'].dt.year.isin([2017])]

    # Combine the two datasets; the inner join drops reviews of non-restaurants.
    df = pd.merge(df_business, df_reviews, how='inner', on='business_id')
    df['text_len'] = df['text'].str.len()
    print("Total Rows:", df.shape[0], ", Total Columns:", df.shape[1],
          ", Total Memory Usage (Bytes):", df.memory_usage().sum())

    return df


def clean_outlier_text(df, min_len=50, max_len=200):
    """Keep only the rows whose ``text`` length lies strictly between two bounds.

    The default bounds (50 and 200 characters) are fixed values chosen to
    roughly match the interquartile "box" of the text-length distribution;
    they are not computed from ``df``.

    Args:
        df: DataFrame with a string column named ``text``.
        min_len: Exclusive lower bound on the text length.
        max_len: Exclusive upper bound on the text length.

    Returns:
        The rows of ``df`` with ``min_len < len(text) < max_len``. Rows whose
        ``text`` is missing are dropped, because their length is NaN and both
        comparisons evaluate to False.
    """
    # Compute the lengths once and reuse them for both comparisons.
    lengths = df['text'].str.len()
    mask = (lengths > min_len) & (lengths < max_len)
    return df.loc[mask]


def save_preprocessed_data(df, file_name=file_name_preprocess):
    """Pickle ``df`` to the file at ``file_path + file_name``.

    Args:
        df: Object to serialize (the preprocessed DataFrame).
        file_name: Name of the target file, appended to the module-level
            ``file_path``. Defaults to ``file_name_preprocess``.
    """
    target = file_path + file_name
    with open(target, 'wb') as out_file:
        pickle.dump(df, out_file)

In [1]:
def load_pkl_file(file_name):
    """Unpickle and return the object stored at ``file_path + file_name``.

    Only use this on files you trust: unpickling can execute arbitrary code.

    Args:
        file_name: Name of the pickle file, appended to the module-level
            ``file_path``.

    Returns:
        The object read from the file.
    """
    source = file_path + file_name
    with open(source, 'rb') as in_file:
        return pickle.load(in_file)

In [None]:
def load_and_preprocess():
    """Build the preprocessed dataset from the raw files and save it.

    Loads the raw data with ``load_raw_data``, filters it by text length with
    ``clean_outlier_text``, and writes the result to ``file_name_preprocess``
    via ``save_preprocessed_data``.

    Returns:
        The filtered DataFrame that was saved.
    """
    # Load the raw data and drop the text-length outliers in one step.
    trimmed = clean_outlier_text(load_raw_data())

    # Persist the result so later runs can skip the raw load.
    save_preprocessed_data(trimmed, file_name_preprocess)
    return trimmed

In [None]:
def load_processed_data():
    """Return the preprocessed dataset, rebuilding it if the cache is unusable.

    Tries to unpickle ``file_name_preprocess`` through ``load_pkl_file``. If
    the file cannot be opened or read, or its contents are truncated or not a
    valid pickle, the data is rebuilt with ``load_and_preprocess`` (which also
    saves a fresh pickle). A short summary of the result is printed.

    Returns:
        The preprocessed DataFrame.
    """
    try:
        # Reuse the shared loader instead of duplicating the open/unpickle code.
        df = load_pkl_file(file_name_preprocess)
    except (OSError, EOFError, pickle.UnpicklingError):
        # OSError covers a missing or unreadable file (IOError is an alias of
        # it). EOFError and UnpicklingError cover an empty, truncated or
        # corrupt cache. In every case, regenerate from the raw data.
        df = load_and_preprocess()

    print("After removing outliers")
    print("Total Rows:", df.shape[0], ", Total Columns:", df.shape[1],
          ", Total Memory Usage (Bytes):", df.memory_usage().sum())

    return df