# Chapter 2 Answers

## Data importing

In [None]:
# Load relevant libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

%matplotlib inline

# Path to the bikes CSV, relative to this notebook.
bikes_filepath = '../data/bikes.csv'

# Read the comma-separated file into the raw (uncleaned) data frame.
original_data = pd.read_csv(bikes_filepath, sep=",")

## Exercise 1:

Two methods are used to fill in the missing data, because the first (linear interpolation) is not guaranteed to fill every gap.

In [None]:
# Step 1: fill missing values with linear interpolation.
# Only the numerical columns are interpolated: calling `interpolate` on a frame
# that still holds object (text) columns is deprecated in recent pandas, and
# interpolation leaves such columns unchanged anyway.
numeric_columns = original_data.select_dtypes(include='number').columns
clean_data = original_data.copy()
clean_data[numeric_columns] = clean_data[numeric_columns].interpolate(method='linear')

# Step 2: back-fill whatever is still missing (e.g. gaps that interpolation
# could not reach, or gaps in non-numerical columns).
# `fillna(method="bfill")` is deprecated; `.bfill()` is the direct equivalent.
clean_data = clean_data.bfill()

# Show the number of missing values left in each column.
clean_data.isna().sum()

## Exercise 2:

The correct data types of the examples are:
* Ordinal
* Numerical
* Categorical

## Exercise 3:

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create a new one-column data frame with just the weather_code attribute.
# Double brackets keep it 2-D, which is the shape the encoder expects.
weather_data = clean_data[["weather_code"]].copy()

# Initialise the encoder
one_hot_encoder = OneHotEncoder()

# Fit and transform the categorical data. The encoder's result is converted
# to a dense array with .toarray() so it can be placed in a data frame.
weather_encoded_array = one_hot_encoder.fit_transform(weather_data).toarray()

# Fetch the feature names for use in the column headers.
# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out`
# is its replacement.
column_names = one_hot_encoder.get_feature_names_out(['weather_code'])

# Create a new data frame with the weather encoded data array.
# Re-using clean_data's index keeps the rows aligned with clean_data when
# columns from it are attached below.
weather_encoded_data = pd.DataFrame(
    data=weather_encoded_array,
    columns=column_names,
    index=clean_data.index,
)

# Keep the original (pre-encoding) category values alongside the encoded columns.
weather_encoded_data["original_weather"] = clean_data["weather_code"]

weather_encoded_data.head()

## Exercise 4:

We can use the **`pandas`** function **`.select_dtypes(include='number')`** to return only columns of a certain data type. 

In [None]:
from sklearn.preprocessing import RobustScaler

# Keep the numerical columns only, leaving out the target variable ("count").
numerical_data = clean_data.select_dtypes(include='number').drop("count", axis=1)

# Create the scaler.
rb_scaler = RobustScaler()

# Fit the scaler and transform the features in one step; the result is an array.
scaled_features_array = rb_scaler.fit_transform(numerical_data)

# Wrap the scaled array in a data frame, re-using the feature column names.
scaled_data = pd.DataFrame(
    data=scaled_features_array,
    columns=numerical_data.columns,
)

scaled_data.head()

## Exercise 5:


In [None]:
# Attach the count attribute to the data frame.
# This needs to be done so we can find the correlation between the features and count.
# NOTE: this adds a column to weather_encoded_data in place; the Exercise 6
# cell removes it again before feature selection.
weather_encoded_data["count"] = clean_data["count"]

# Generate the correlation coefficients using the pandas method.
# Restrict to numerical columns first: from pandas 2.0, `.corr()` raises on
# non-numerical columns instead of silently skipping them (relevant if
# original_weather holds text labels).
correlation_matrix = weather_encoded_data.select_dtypes(include='number').corr()

# Plot the correlation matrix, coloured by value and shown to 2 decimal places.
# `Styler.set_precision` was removed in pandas 2.0; `.format` gives the same display.
correlation_matrix.style.background_gradient(cmap='coolwarm').format("{:.2f}")

## Exercise 6:

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Set the number of attributes
K = 4

# Fixed seed so the mutual information scores (which involve randomness)
# are the same on every run.
RANDOM_STATE = 42

# Remove the target and pre-encoded attributes as they are not features.
# errors="ignore" lets this cell be re-run after the columns are already gone.
weather_encoded_data = weather_encoded_data.drop(
    columns=["count", "original_weather"], errors="ignore"
)

# Scoring function for the selector.
# discrete_features=True: the one-hot columns are discrete, but by default a
# dense input is treated as continuous, which the scikit-learn documentation
# warns usually gives incorrect results.
mutual_info_score_func = partial(
    mutual_info_regression,
    discrete_features=True,
    random_state=RANDOM_STATE,
)

# Initialise the selector.
mutual_info_selector = SelectKBest(score_func=mutual_info_score_func, k=K)

# Fit the selector to the encoded weather data.
mutual_info_selector.fit(X=weather_encoded_data, y=clean_data["count"])

# Find the chosen attributes with True/False values.
columns_selected = mutual_info_selector.get_support()

# Select the most important columns from the original frame.
selected_cols = weather_encoded_data.columns[columns_selected]
print("Top {} Columns are:\n\n\t".format(K), selected_cols)

# Create a data frame of just the most important features.
best_four_data = weather_encoded_data[selected_cols]
print(columns_selected)
best_four_data.head()