In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import math
import warnings
# Suppress every warning category from this point on.
# NOTE(review): this also hides warnings emitted by pandas while the cleaning
# functions below run, so real problems can go unnoticed; consider narrowing the filter.
warnings.filterwarnings("ignore")

### Part-I
##### What to expect:
   * In part-I, we will *__clean the data__*, and will write down each step in detail.
        * Read CSV file
        * Remove columns which are not required in the analysis.
            * ['address', 'phone']
        * Rename columns to some intuitive names
        * Handle the null values [drop it]
            * Imputation introduces a risk of bias if the imputed values are not representative of the true missing values, so we will instead remove rows with missing values, as we have enough data left to work with.
        * Drop duplicates
        * Remove the irrelevant text from each column, if any
            * For example in rating column extract 4.1 from '4.1/5' and convert it to float type.
            * In the 'online_order' column we only want "Yes" or "No", so we will remove records which have 
            values other than "Yes" or "No".
            * Convert ['votes', 'approx_cost'] columns to integer.
        * Check uniqueness of the data in each column 
             * If any NaN value is created during the conversion from string to float or int, remove the records containing those NaN values.
        * Remove non-ASCII characters from the text columns (for example the name column)



*__Note: First handle the missing values and extract the required substrings; only after that convert to a numeric type, because missing or NaN values might raise an exception during the conversion.__*

In [2]:
def read_data_from_csv(path=None):
    '''
    Load the raw restaurants csv file into a dataframe.

    parameters
    ----------
    path : str or path-like, optional
        Location of the csv file. When omitted, the user is asked for it
        interactively (the original behaviour), so existing zero-argument
        calls keep working. Passing it explicitly lets the notebook run
        without waiting for keyboard input.

    returns
    -------
    The dataframe exactly as parsed by pd.read_csv
    '''
    if path is None:
        # Interactive fallback; strip whitespace that is easily pasted or
        # typed along with the path and would make the lookup fail.
        path = input('Enter the path to csv file:\t').strip()

    restaurants = pd.read_csv(path)
    return restaurants


def remove_unwanted_columns():
    '''
    Load the raw data and discard the columns the analysis never uses.

    returns
    -------
    The dataframe from read_data_from_csv() without the 'address' and
    'phone' columns
    '''
    # columns that are not required in the analysis
    unused = ['address', 'phone']

    raw_restaurants = read_data_from_csv()
    return raw_restaurants.drop(columns=unused)


def rename_columns():
    '''
    Give three of the remaining columns shorter, more intuitive names.

    returns
    -------
    The dataframe from remove_unwanted_columns() with 'rate',
    'approx_cost(for two people)' and 'listed_in(type)' renamed to
    'rating', 'approx_cost' and 'type'
    '''
    # old column name -> new column name
    intuitive_names = {
        'rate': 'rating',
        'approx_cost(for two people)': 'approx_cost',
        'listed_in(type)': 'type',
    }

    trimmed = remove_unwanted_columns()
    return trimmed.rename(columns=intuitive_names)


#handle  null values of each column
def null_value_check(restaurants=None):
    '''
    Drop every row that contains a null value and verify that none remain.

    parameters
    ----------
    restaurants : dataframe, optional
        Frame to clean. When omitted, the frame returned by
        rename_columns() is used (the original behaviour).

    returns
    -------
    A new dataframe without rows containing null values; a frame passed in
    by the caller is not modified
    '''
    if restaurants is None:
        restaurants = rename_columns()

    # dropna() returns a new frame instead of mutating the input in place
    restaurants = restaurants.dropna()

    # Sanity check on the result. isnull().sum() counts the null entries
    # themselves; the previous Series.count() call only counts non-null
    # values, so it always produced 0 and the report below could never fire.
    remaining = restaurants.isnull().sum()

    for col, cnt in remaining[remaining != 0].items():
        print(f'Oops :(, Null values in {col} column')
        print(col, cnt)

    return restaurants


#find duplicates in the dataset
def remove_duplicates():
    '''
    Remove repeated rows from the null-free dataframe.

    returns
    -------
    The dataframe from null_value_check() in which rows that are identical
    in every column are reduced to their first occurrence
    '''
    without_nulls = null_value_check()

    # keep='first' is the pandas default; spelled out to make the choice visible
    deduplicated = without_nulls.drop_duplicates(keep='first')
    return deduplicated


#removing irrelevant text from all the columns
def removing_irrelevant_text(restaurants=None):
    '''
    Strip the irrelevant text from the columns and convert them to numbers.

    * keeps only the rows whose 'online_order' is exactly 'Yes' or 'No'
    * 'rating'      : '4.1/5' -> 4.1 (float); text that does not start with
                      a digit.digit pattern becomes NaN
    * 'votes'       : converted to int
    * 'approx_cost' : thousands separators removed ('1,200' -> 1200), int

    parameters
    ----------
    restaurants : dataframe, optional
        Frame to transform. When omitted, the frame returned by
        remove_duplicates() is used (the original behaviour).

    returns
    -------
    A new dataframe with the converted columns; a frame passed in by the
    caller is not modified
    '''
    if restaurants is None:
        restaurants = remove_duplicates()

    # Remove rows which contain a value other than Yes or No. The explicit
    # copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the input (chained assignment).
    valid_order = restaurants['online_order'].isin(['Yes', 'No'])
    restaurants = restaurants.loc[valid_order].copy()

    # Extract ratings 4.1 from 4.1/5; expand=False returns a Series rather
    # than a one-column dataframe.
    rating_text = restaurants['rating'].str.extract(r'^(\d\.\d)', expand=False)
    restaurants['rating'] = rating_text.astype(float)

    # Convert to integer
    restaurants['votes'] = restaurants['votes'].astype(int)

    # regex=False: the comma is a literal character, not a pattern
    cost_text = restaurants['approx_cost'].str.replace(',', '', regex=False)
    restaurants['approx_cost'] = cost_text.astype(int)

    return restaurants


#check for unique values in each column and handle the irrelevant values
def check_for_unique_values(restaurants=None):
    '''
    Remove the rows in which a numeric column holds NaN.

    Such values can arise during the extraction of the ratings or during
    the conversion to a numeric type. Non-numeric columns are not inspected.

    parameters
    ----------
    restaurants : dataframe, optional
        Frame to filter. When omitted, the frame returned by
        removing_irrelevant_text() is used (the original behaviour).

    returns
    -------
    A new dataframe containing only the rows without NaN in any numeric
    column; a frame passed in by the caller is not modified
    '''
    if restaurants is None:
        restaurants = removing_irrelevant_text()

    # Only numeric columns can hold NaN produced by the conversions. Selecting
    # them by kind (instead of "anything that is not object") keeps other
    # dtypes away from the NaN test, which is only defined for numbers.
    numeric_cols = restaurants.select_dtypes(include='number').columns

    # One vectorised pass replaces the per-value Python loop; with no numeric
    # columns the mask is all False and every row is kept.
    has_nan = restaurants[numeric_cols].isna().any(axis=1)

    return restaurants.loc[~has_nan].copy()


#remove the non-ASCII characters from the text columns of the dataset
def remove_the_unknown_character(dataframe=None):
    '''
    Remove the non-ASCII characters from every text column.

    Every run of characters outside the 7-bit ASCII range (0x00-0x7F) is
    deleted from the string cells; ASCII control characters are kept.
    Numeric columns and non-string cells are left as they are. The result
    is returned only, nothing is written to disk.

    parameters
    ----------
    dataframe : dataframe, optional
        Frame to clean. When omitted, the frame returned by
        check_for_unique_values() is used (the original behaviour).

    returns
    -------
    A new dataframe with the cleaned text columns; a frame passed in by the
    caller is not modified
    '''
    if dataframe is None:
        dataframe = check_for_unique_values()

    # Work on a copy so that neither the caller's frame nor a slice of an
    # upstream frame is written to.
    dataframe = dataframe.copy()

    # Anything that is NOT in the ASCII range; compiled once for all columns.
    non_ascii = re.compile(r'[^\x00-\x7F]+')

    def _strip_non_ascii(value):
        # re.sub raises TypeError on non-string cells (e.g. NaN), so those
        # are passed through unchanged.
        if isinstance(value, str):
            return non_ascii.sub('', value)
        return value

    for col in dataframe.columns:
        # Checked on the dtype, so both object columns and dedicated string
        # dtypes are treated as text.
        if pd.api.types.is_string_dtype(dataframe[col].dtype):
            dataframe[col] = dataframe[col].map(_strip_non_ascii)

    return dataframe




def cleaned_data():
    '''
    Entry point of the cleaning pipeline.

    Delegates to remove_the_unknown_character(), the last stage of the
    chain, which in turn calls every earlier stage. The function itself
    takes no arguments: the path to the original csv file ('zomato.csv')
    is requested when the file is read.

    returns
    -------
    The dataframe returned by remove_the_unknown_character()
    '''
    cleaned = remove_the_unknown_character()
    return cleaned


In [3]:
# Run the whole cleaning pipeline; this prompts for the path to the csv file.
df = cleaned_data()

Enter the path to csv file:	zomato.csv


In [4]:
# Preview the first row of the cleaned dataframe.
df.head(1)

Unnamed: 0,name,online_order,book_table,rating,votes,location,rest_type,dish_liked,cuisines,approx_cost,type
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,Buffet
