Load & Explore Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw daily dataset into a dataframe
df = pd.read_csv('/Users/shadieftekhari/Desktop/UTS/adv_dsi_lab_2/adv_dsi_lab_2/adv_dsi_lab_2/data/raw/day-1.csv')

# A notebook cell only renders its last expression automatically, so the
# intermediate views are displayed explicitly; df.info() prints by itself.
display(df.head())
display(df.shape)
df.info()
df.describe()

Data Prep


In [None]:
# Work on a separate dataframe without the 'instant' column;
# drop() returns a new object, so the raw `df` is left untouched
df_cleaned = df.drop(columns=['instant'])

Create a function to convert columns to datetime

In [None]:
def convert_to_date(df, cols:list):
    """Cast the requested columns of a dataframe to datetime.

    Column names that are not present in the dataframe are skipped.
    The conversion is written back into ``df`` itself, and that same
    object is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe holding the columns to convert (modified in place)
    cols : list
        Names of the columns to convert

    Returns
    -------
    pd.DataFrame
        The input dataframe, with the matching columns converted
    """
    import pandas as pd

    # Keep only the requested names that actually exist in the dataframe
    present = [name for name in cols if name in df.columns]
    for name in present:
        df[name] = pd.to_datetime(df[name])

    return df

Call the function to convert dates, then engineer and encode the features

In [None]:
df_cleaned = convert_to_date(df_cleaned, ['dteday'])
# Extract the year
df_cleaned['yr'] = df_cleaned['dteday'].dt.year
# Extract the month name
df_cleaned['mnth'] = df_cleaned['dteday'].dt.month_name()
# Extract the name of the day of the week
df_cleaned['weekday'] = df_cleaned['dteday'].dt.day_name()

## Season Dictionary Mapping
season_mapping = {
    1: 'winter',
    2: 'spring',
    3: 'summer',
    4: 'autumn',
}
df_cleaned['season'] = df_cleaned['season'].map(season_mapping)

## Weather Dictionary Mapping
weather_mapping = {
    1: 'clear',
    2: 'cloudy',
    3: 'rain',
    4: 'heavy'
}
df_cleaned['weathersit'] = df_cleaned['weathersit'].map(weather_mapping)

# Flag 2011-01-01 as a holiday
df_cleaned.loc[df_cleaned['dteday'] == '2011-01-01', 'holiday'] = 1

# Create an empty datetime column called holidaydate
df_cleaned['holidaydate'] = np.nan
df_cleaned = convert_to_date(df_cleaned, ['holidaydate'])
holiday_mask = df_cleaned['holiday'] == 1

# Set holidaydate to dteday for every observation flagged as a holiday (use holiday_mask)
df_cleaned.loc[holiday_mask, 'holidaydate'] = df_cleaned.loc[holiday_mask, 'dteday']

# New column called last_holiday: holidaydate with forward filling for missing values.
# It is only written to the interim CSV below; it is dropped before the split.
df_cleaned['last_holiday'] = df_cleaned['holidaydate'].ffill()

# New column called next_holiday: holidaydate with back filling for missing values.
# Rows after the final holiday have nothing to back-fill from, so they fall back
# to the timestamp '2013-01-01'. The result is assigned rather than filled in place
# on a column selection, which does not reliably update the parent dataframe.
df_cleaned['next_holiday'] = (
    df_cleaned['holidaydate']
    .bfill()
    .fillna(pd.Timestamp('2013-01-01'))
)

# Number of days between each observation and the next holiday, saved in days_next_holiday
df_cleaned['days_next_holiday'] = (df_cleaned['next_holiday'] - df_cleaned['dteday']).dt.days

# Create a variable called cat_cols that contains the names of the categorical columns
cat_cols = ['season','mnth','holiday','weekday','workingday','weathersit']

# Perform One-Hot encoding on the categorical features
df_cleaned = pd.get_dummies(df_cleaned, columns=cat_cols)

# Save the dataframe in the /data/interim folder (the date columns are still present at this point)
df_cleaned.to_csv('/Users/shadieftekhari/Desktop/UTS/adv_dsi_lab_2/adv_dsi_lab_2/adv_dsi_lab_2/data/interim/day.csv', index=False)

# Remove the following columns: 'dteday', 'holidaydate', 'last_holiday', 'next_holiday'
df_cleaned = df_cleaned.drop(columns=['dteday', 'holidaydate', 'last_holiday', 'next_holiday'])




Split Dataset

In [None]:
# Create a subset function
def subset_x_y(target, features, start_index:int, end_index:int):
    """Slice the features and the target with the same index window

    Parameters
    ----------
    target : pd.DataFrame
        Object containing the target
    features : pd.DataFrame
        Object containing all features
    start_index : int
        Index of the first observation to keep
    end_index : int
        Index at which the slice stops (excluded)

    Returns
    -------
    pd.DataFrame
        Slice of the features
    pd.DataFrame
        Slice of the target
    """
    # One slice object guarantees both outputs cover exactly the same rows
    window = slice(start_index, end_index)
    sliced_features = features[window]
    sliced_target = target[window]
    return sliced_features, sliced_target

# Create a function to split data by time
def split_sets_by_time(df, target_col, test_ratio=0.2):
    """Split an ordered dataframe into training, validation and testing sets

    The rows are kept in their original order: the testing set is the last
    ``test_ratio`` share of the rows, the validation set is the block of the
    same size just before it, and the training set is everything earlier.
    The input dataframe is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, already sorted chronologically
    target_col : str
        Name of the target column
    test_ratio : float
        Share of rows used for EACH of the validation and testing sets
        (default: 0.2). Must satisfy 0 <= test_ratio < 0.5.

    Returns
    -------
    pd.DataFrame
        Features for the training set
    pd.Series
        Target for the training set
    pd.DataFrame
        Features for the validation set
    pd.Series
        Target for the validation set
    pd.DataFrame
        Features for the testing set
    pd.Series
        Target for the testing set

    Raises
    ------
    ValueError
        If test_ratio is outside the range [0, 0.5)
    """
    if not 0 <= test_ratio < 0.5:
        raise ValueError(f"test_ratio must be in [0, 0.5), got {test_ratio}")

    df_copy = df.copy()
    target = df_copy.pop(target_col)

    n_rows = len(target)
    cutoff = int(n_rows * test_ratio)

    # Positive boundaries are used on purpose: with negative indexes a cutoff
    # of 0 turns "-cutoff" into 0, which empties the training set and puts
    # every row in the testing set.
    train_end = n_rows - 2 * cutoff
    val_end = n_rows - cutoff

    X_train, y_train = df_copy.iloc[:train_end], target.iloc[:train_end]
    X_val, y_val     = df_copy.iloc[train_end:val_end], target.iloc[train_end:val_end]
    X_test, y_test   = df_copy.iloc[val_end:], target.iloc[val_end:]

    return X_train, y_train, X_val, y_val, X_test, y_test

# Split the cleaned data chronologically into training, validation and testing sets
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = split_sets_by_time(df=df_cleaned, target_col='cnt', test_ratio=0.2)

# Create a function to save sets
def save_sets(X_train=None, y_train=None, X_val=None, y_val=None, X_test=None, y_test=None, path='/Users/shadieftekhari/Desktop/UTS/adv_dsi_lab_2/adv_dsi_lab_2/adv_dsi_lab_2/data/processed/'):
    """Save the different sets locally

    Each set that is not None is written with ``np.save`` to
    ``f'{path}<set name>'`` (NumPy appends the ``.npy`` extension).
    ``path`` is used as a plain prefix, so it must end with a path
    separator to designate a folder. The destination folder is created
    when it does not exist yet.

    Parameters
    ----------
    X_train: Numpy Array
        Features for the training set
    y_train: Numpy Array
        Target for the training set
    X_val: Numpy Array
        Features for the validation set
    y_val: Numpy Array
        Target for the validation set
    X_test: Numpy Array
        Features for the testing set
    y_test: Numpy Array
        Target for the testing set
    path : str
        Path to the folder where the sets will be saved (default: '/Users/shadieftekhari/Desktop/UTS/adv_dsi_lab_2/adv_dsi_lab_2/adv_dsi_lab_2/data/processed/')

    Returns
    -------
    None
    """
    import os
    import numpy as np

    # Same write order as before: all feature sets first, then all targets
    named_sets = (
        ('X_train', X_train),
        ('X_val', X_val),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_val', y_val),
        ('y_test', y_test),
    )

    for set_name, data in named_sets:
        if data is None:
            continue

        destination = f'{path}{set_name}'

        # np.save does not create missing folders; do it lazily so that a call
        # with nothing to save leaves the filesystem untouched
        folder = os.path.dirname(destination)
        if folder:
            os.makedirs(folder, exist_ok=True)

        np.save(destination, data)

# Create a function to load sets
def load_sets(path='../data/processed/', val=False):
    """Load the sets previously saved as .npy files

    Every set is looked up at ``f'{path}<set name>.npy'``; a set whose
    file does not exist is returned as None.

    Parameters
    ----------
    path : str
        Path to the folder where the sets are saved (default: '../data/processed/')
    val : bool
        Not used by the function body (default: False)

    Returns
    -------
    Numpy Array
        Features for the training set
    Numpy Array
        Target for the training set
    Numpy Array
        Features for the validation set
    Numpy Array
        Target for the validation set
    Numpy Array
        Features for the testing set
    Numpy Array
        Target for the testing set
    """
    import numpy as np
    import os.path

    def _read_if_present(set_name):
        # Return the stored array, or None when the file is absent
        file_path = f'{path}{set_name}.npy'
        if not os.path.isfile(file_path):
            return None
        return np.load(file_path)

    ordered_names = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
    return tuple(_read_if_present(set_name) for set_name in ordered_names)

# Save the six sets into the folder data/processed
save_sets(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
    path='/Users/shadieftekhari/Desktop/UTS/adv_dsi_lab_2/adv_dsi_lab_2/adv_dsi_lab_2/data/processed/',
)

# Baseline prediction: the average of the target variable over the training set
y_mean = y_train.mean()

# Array of dimensions (len(y_train), 1) in which every entry equals y_mean
y_base = np.full(shape=(len(y_train), 1), fill_value=y_mean)

# In src/models/ create a file called performance.py. Inside it you will define a function called print_reg_perf
def print_reg_perf(y_preds, y_actuals, set_name=None):
    """Print the RMSE and MAE for the provided data

    Parameters
    ----------
    y_preds : Numpy Array
        Predicted target
    y_actuals : Numpy Array
        Actual target
    set_name : str
        Name of the set to be printed

    Returns
    -------
    None
    """
    import numpy as np
    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import mean_absolute_error as mae

    # The `squared=False` argument is deprecated in recent scikit-learn releases.
    # Taking the square root of the per-output MSE and averaging gives the same
    # value without relying on that argument.
    rmse = np.sqrt(mse(y_actuals, y_preds, multioutput='raw_values')).mean()

    print(f"RMSE {set_name}: {rmse}")
    print(f"MAE {set_name}: {mae(y_actuals, y_preds)}")

# Display the RMSE and MAE scores of this baseline model
# (the call must sit at top level: indented, it becomes part of the function
# body and the function calls itself without end)
print_reg_perf(y_preds=y_base, y_actuals=y_train, set_name='Training')
