# BLU02 - Exercises Notebook

In [1]:
import hashlib # for grading

import os
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

## 1 Read the Programs data (graded)

In this first exercise, we aim to create a single dataframe, combining all programs from all seasons.

With a caveat though: **we want to include seasons after 1900**.

In [2]:
def make_programs():
    files = os.listdir('data/programs/')
    # Create a list with the name of all files containing programs from
    # 1900 inclusive and onwards (just the filename, no complete path.)
    # files_after_1900: List[str] = ...
    files_after_1900 = [f for f in files if int(f.split('-')[0])>=1900 ]
    # YOUR CODE HERE
   
    #raise NotImplementedError()
    # Create a list with the name of all .csv files.
    # seasons: List[pd.DataFrame] = .
    seasons = [read_season(f) for f in files_after_1900 if '.csv' in f]
    # YOUR CODE HERE
    #raise NotImplementedError()
    # Use pd.concat to create a single dataframe.
    # programs: pd.DataFrame = ...
    programs = pd.concat(seasons, axis=0, ignore_index=True)
#programs = pd.concat(seasons,axis = 0)
    # YOUR CODE HERE
    #raise NotImplementedError()
    # Drop the column ProgramID.
    programs = programs.drop(columns = 'ProgramID')
    # YOUR CODE HERE
    #raise NotImplementedError()
    # Set the index to be the column GUID, and sort the dataframe by the index 
    #( use the DataFrame.sort_index() function).
    programs = programs.set_index('GUID').sort_index()
    # Feel free to use method chaining if you want.
    # YOUR CODE HERE
    #raise NotImplementedError()
    return programs


def read_season(file):
    path = os.path.join('data', 'programs', file)
    return pd.read_csv(path)


programs = make_programs()

In [3]:
assert programs['Season'].min() == '1900-01'

shape = str(programs.shape)
expected_hash = '16278afb4c2032bcddc35b915f5439ef586333e2723c2ba6cfb9cc1b58eca0e1'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

Let's preview the `programs` dataframe.

In [4]:
programs.head()

Unnamed: 0_level_0,Orchestra,Season
GUID,Unnamed: 1_level_1,Unnamed: 2_level_1
0002718f-a7a0-4362-9366-92fabab4ff3c,New York Philharmonic,1928-29
0004749e-19e2-4c85-a51e-76a2b0987e4e,New York Philharmonic,1922-23
0008995b-f0ce-4bdb-b2f8-2fc9827430fe,New York Symphony,1925-26
0008fd59-7b87-4e87-8b42-ab5b0f8505cf,New York Philharmonic,1942-43
000c0467-d7bf-4599-8e37-c856bc13a389,New York Philharmonic,1991-92


## 2 Read the Concerts data (graded)

Read the concerts data.

Although we list all transformations step-by-step for the sake of clarity, we expect you to use method chaining.

In [5]:
def make_concerts(): 
    # Read concerts data and drop the ProgramID and ConcertID columns.
    # concerts: pd.DataFrame = ...
    concerts = pd.read_csv('./data/concerts.csv').drop(columns = ['ProgramID','ConcertID'])
    # YOUR CODE HERE
    #raise NotImplementedError()
    # Remember to_datetime? We need here. We need to parse the columns Date and 
    # Time. Use pd.to_datetime(...).dt.date for the Date and pd_to_datetime(..., 
    # format=%I:%M%p).dt.time for the Time.
    # YOUR CODE HERE
    concerts['Date'] = pd.to_datetime(concerts['Date']).dt.date
    concerts['Time'] = pd.to_datetime(concerts['Time'],format ='%I:%M%p' ).dt.time
    #raise NotImplementedError()
    return concerts


concerts = make_concerts()

In [6]:
shape = str(concerts.shape)
expected_hash = 'c030586e7370b1f2c34307d5de9b921d96efa28c933e44111b121ed819f339da'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

sample = str(concerts.sample(random_state=0))
expected_hash = '392a3db01753b02d85173c38cde95112fb5cdf06ca5a45d25f828238d56103be'
assert hashlib.sha256(sample.encode()).hexdigest() == expected_hash

In [7]:
concerts.head()

Unnamed: 0,GUID,EventType,Location,Venue,Date,Time
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,Subscription Season,"Manhattan, NY",Apollo Rooms,1842-12-07,20:00:00
1,c7b2b95c-5e0b-431c-a340-5b37fc860b34,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-02-18,20:00:00
2,894e1a52-1ae5-4fa7-aec0-b99997555a37,Special,"Manhattan, NY",Apollo Rooms,1843-04-07,20:00:00
3,34ec2c2b-3297-4716-9831-b538310462b7,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-04-22,20:00:00
4,610a4acc-94e4-4cd6-bdc1-8ad020edc7e9,Subscription Season,"Manhattan, NY",Apollo Rooms,1843-11-18,NaT


## 3 Combine Programs and Concerts data (graded)

Let's combine both dataframes into a single dataset, using an inner join.

In [8]:
# Remember that you want to join on the index of one of the dataframes.
nyp = concerts.join(programs,on='GUID' ,how = 'inner')
# YOUR CODE HERE
#raise NotImplementedError()

In [9]:
shape = str(nyp.shape)
expected_hash = 'a75738e37ac4ccf37a893a1009ba624efce9efaa7721d4319e9e078193fe8de6'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash 

## 4 Read Works and Soloists data (graded)

We will read the two remaining pieces of data. 

Again, albeit the step-by-step description, we encourage you to use method chaining.

In [10]:
def make_works():
    # Read the works data.
    # works: pd.DataFrame = ...
    works = pd.read_csv('./data/works.csv')
    # YOUR CODE HERE
    #raise NotImplementedError()
    # Remove the Intervals (attention to the values in the Interval column).
    works = works.drop(works[works.Interval =='Intermission'].index).drop(works[works.Interval =='Intermission-Short'].index).drop(works[works.Interval =='Intermission-Second'].index).drop(works[works.Interval =='Intermission-Third'].index)
    # works: pd.DataFrame = ...
    # YOUR CODE HERE
    #raise NotImplementedError()
    # Select the columns GUID, ComposerName, WorkTitle, Movement and ConductorName.
    works = works[['GUID', 'ComposerName','WorkTitle','Movement','ConductorName']]
    # YOUR CODE HERE
    #raise NotImplementedError()
    return works


def make_soloists():
    # Read the soloists data and drop ProgramID, WorkID and MovementID.
    soloists = pd.read_csv('./data/soloists.csv')
    soloists = soloists.drop(columns = ['ProgramID','WorkID','MovementID'])
    # YOUR CODE HERE
    #raise NotImplementedError()
    return soloists


works = make_works()
soloists = make_soloists()

In [11]:
shape = str(works.shape)
expected_hash = 'cad58aa6cd33cfa24c08a0f0f846877178ab31278f212c80b16b952d9416f883'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

shape = str(soloists.shape)
expected_hash = 'a7b0d20a45ff1344e0398eebb162af9afb8805082b0dfdcb70e9a4b78f94dd13'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash 

## 5 Combine Works and Soloists (graded)

Like we did for Programs and Concerts, now we combine Works and Soloists.

In [12]:
# Combine both dataframes, again using an inner type of join.
# works_and_soloists : pd.DataFrame = ....
soloists = soloists.set_index('GUID')
works_and_soloists = works.join(soloists,on= 'GUID',how = 'inner')

# YOUR CODE HERE
#raise NotImplementedError()

In [13]:
shape = str(works_and_soloists.shape)
expected_hash = 'c0e73877aac4f3916267cb58f2f122ffef32c79039bde2ecb217fda123270d12'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

## 6 Combine everything (graded)

The final goal here is to create a single dataframe.

In [14]:
nyp.sort_index().head(n=5)

Unnamed: 0,GUID,EventType,Location,Venue,Date,Time,Orchestra,Season
535,06cf12ad-35ce-4ad1-9784-b41d71e444d3,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-11-16,14:00:00,New York Philharmonic,1900-01
536,bc12831d-b37e-41b2-97e6-0c09505c22ed,Young People's Concert,"Manhattan, NY",Carnegie Hall,1900-12-01,14:30:00,New York Symphony,1900-01
537,dccd1848-bc49-46e7-92d5-822b7e31c579,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-12-07,14:00:00,New York Philharmonic,1900-01
538,b97bea0a-373c-461f-b64f-9f7381faef19,Subscription Season,"Manhattan, NY",Carnegie Hall,1900-12-21,14:00:00,New York Philharmonic,1900-01
539,b964d1ce-47b3-499b-b164-d0a40e0aab2a,Young People's Concert,"Manhattan, NY",Carnegie Hall,1900-12-29,14:30:00,New York Symphony,1900-01


In [15]:
works_and_soloists.sort_index().head(n=5)

Unnamed: 0,GUID,ComposerName,WorkTitle,Movement,ConductorName,SoloistName,SoloistInstrument,SoloistRole
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,"Beethoven, Ludwig van","SYMPHONY NO. 5 IN C MINOR, OP.67",,"Hill, Ureli Corelli","Otto, Antoinette",Soprano,S
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,"Beethoven, Ludwig van","SYMPHONY NO. 5 IN C MINOR, OP.67",,"Hill, Ureli Corelli","Scharfenberg, William",Piano,A
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,"Beethoven, Ludwig van","SYMPHONY NO. 5 IN C MINOR, OP.67",,"Hill, Ureli Corelli","Hill, Ureli Corelli",Violin,A
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,"Beethoven, Ludwig van","SYMPHONY NO. 5 IN C MINOR, OP.67",,"Hill, Ureli Corelli","Derwort, G. H.",Viola,A
0,38e072a7-8fc9-4f9a-8eac-3957905c0002,"Beethoven, Ludwig van","SYMPHONY NO. 5 IN C MINOR, OP.67",,"Hill, Ureli Corelli","Boucher, Alfred",Cello,A


In [16]:
# Combine everything into a single dataframe.
works_and_soloists = works_and_soloists.set_index('GUID')
nyp_merged = nyp.join(works_and_soloists,on = 'GUID', how = 'inner')
# YOUR CODE HERE
#raise NotImplementedError()

In [17]:
shape = str(nyp_merged.shape)
expected_hash = '3c25d9867a3c0134a6625087698dac6314f7c225f806e78dd259788bedcfb10b'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

## 7 Final transformations (graded)

Now, we perform the train-test split.

We also perform some final transformations on both datasets:
* Include some date features: Year, Month, Day and Weekday
* Drop Date, Season and GUID
* Change the column name Orchestra to OrchestraName, for consistency with other name columns
* Filter out composers that appear in less than 100 concerts.

In [18]:
def preprocess_data(df):
    # You should follow these exact steps:
    #   1 - add_date_features, ideally using df.pipe
    df = df.copy()
    df = df.pipe(add_date_features).drop(columns = ['Date', 'Season','GUID']).drop_duplicates().rename({'Orchestra':'OrchestraName'},axis = 1).groupby('ComposerName').filter(lambda x: x.shape[0] >= 100)
    #   2 - drop Date, Season and GUID
    ###df = df.drop(columns = ['Date', 'Season','GUID'])
    #   3 - rename Orchestra to OrchestraName
    ###df = df.rename({'Orchestra':'OrchestraName'},axis = 1)
    #   4 - filter out composers with less than 100 concerts (keep the ones with >= 100 rows)
    ###df_groupby_composername = df.groupby('ComposerName')
    ###df = df_groupby_composername.filter(lambda x: x.shape[0] >= 100)
    # YOUR CODE HERE
    #raise NotImplementedError()
    return df

def add_date_features(df):
    df['Date']= pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['Weekday'] = df['Date'].dt.weekday
    # YOUR CODE HERE
    #raise NotImplementedError()
    return df


nyp_ = preprocess_data(nyp_merged)
X_train, X_test = train_test_split(nyp_, random_state=0)

In [19]:
nyp_.head(n=3)

Unnamed: 0,EventType,Location,Venue,Time,OrchestraName,ComposerName,WorkTitle,Movement,ConductorName,SoloistName,SoloistInstrument,SoloistRole,Year,Month,Day,Weekday
535,Subscription Season,"Manhattan, NY",Carnegie Hall,14:00:00,New York Philharmonic,"Brahms, Johannes","ACADEMIC FESTIVAL OVERTURE, OP.80",,"Paur, Emil","Carreño, Teresa",Piano,S,1900,11,16,4
535,Subscription Season,"Manhattan, NY",Carnegie Hall,14:00:00,New York Philharmonic,"Bach, Johann Sebastian","TOCCATA & FUGUE, F MAJOR, BWV 540 (ARR. Esser)",,"Paur, Emil","Carreño, Teresa",Piano,S,1900,11,16,4
535,Subscription Season,"Manhattan, NY",Carnegie Hall,14:00:00,New York Philharmonic,"Tchaikovsky, Pyotr Ilyich","CONCERTO, PIANO, NO. 1, B-FLAT MINOR, OP. 23",,"Paur, Emil","Carreño, Teresa",Piano,S,1900,11,16,4


In [20]:
nyp_.shape

(204775, 16)

In [21]:
shape = str(nyp_merged.shape)
expected_hash = '3c25d9867a3c0134a6625087698dac6314f7c225f806e78dd259788bedcfb10b'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

shape = str(nyp_.shape)
expected_hash = '31fa2b10222342d4743fa75b3a04c69945106f22fcf7473f5d1daeb84bca88b7'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(sorted(nyp_.columns.values))
expected_hash = '097ce79d998a726b3ed60a6cd1d4e4652e10af8285a1e0b4526057805234ee4e'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash 

AssertionError: 

In [22]:
nyp_.groupby('ComposerName').size().nsmallest()

ComposerName
Gubaidulina,  Sofia           100
Strayhorn,  Billy             101
Busoni,  Ferruccio            102
Carpenter,  John  Alden       102
Gottschalk,  Louis  Moreau    102
dtype: int64

And, finally, we would be ready to explore modeling.

For the next part, however, we will be using the famous [Boston House Prices Dataset](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names).

## 8 Scaling features (graded)

About the Boston dataset:

> Each record in the database describes a Boston suburb or town. The data is from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.

The features are all numerical (real, positive):
* **CRIM** - per capita crime rate by town
* **ZN** - proportion of residential land zoned for lots over 25,000 sq.ft.
* **INDUS** - proportion of non-retail business acres per town
* **CHAS** - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
* **NOX** - nitric oxides concentration (parts per 10 million)
* **RM** - average number of rooms per dwelling
* **AGE** - proportion of owner-occupied units built prior to 1940
* **DIS** - weighted distances to five Boston employment centres
* **RAD** - index of accessibility to radial highways
* **TAX** - full-value property-tax rate per \$10,000
* **PTRATIO** - pupil-teacher ratio by town
* **B** - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* **LSTAT** - % lower status of the population
* **MEDV** - Median value of owner-occupied homes in \$1000's.

We want to scale all features to the same range, using `sklearn.preprocessing.MinMaxScaler()`.

In [23]:
boston = load_boston()
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)
y = boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Initialize the MinMaxScaler to a [0, 5] range.
minmaxscaler = MinMaxScaler(feature_range=(0,5))
# YOUR CODE HERE
#raise NotImplementedError()

# Fit on the training set and transform X_train. We expect X_train_
# to be a dataframe **just like** X_train, only scaled. 
# X_train_: pd.DataFrame = ...
minmaxscaler.fit(X_train)
X_train_ = minmaxscaler.transform(X_train)
X_train_ = pd.DataFrame(X_train_,columns = X_train.columns)

# YOUR CODE HERE
#raise NotImplementedError()

# Transform the test set.
# X_test_: pd.DataFrame = ...
minmaxscaler.fit(X_test)
X_test_ = minmaxscaler.transform(X_test)
X_test_ = pd.DataFrame(X_test_,columns = X_test.columns)

# YOUR CODE HERE
#raise NotImplementedError()

In [24]:
shape = str(X_train_.shape)
expected_hash = '6f696c7e30c15aae3f0fa4807b596cf15d28cadaf33602d8d20368f7ac921f26'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(sorted(X_train_.columns.values))
expected_hash = 'e946a91d360d95e04adfdea57023d5242defc1e9cf5f41323bfd55a0022adf4c'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

shape = str(X_test_.shape)
expected_hash = 'aa2b4e3c1e358b4b9f21c2c86bbf1187020582395419f1a02a949d7a6efac9e4'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(sorted(X_test_.columns.values))
expected_hash = 'e946a91d360d95e04adfdea57023d5242defc1e9cf5f41323bfd55a0022adf4c'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

## 9 Build a ColumnSelector transformer (graded)

There's a simple transformer that can be useful, from times to times, when modeling.

What we want is to build a transformer that returns the columns we select beforehand. 

This transformer could be used to determine what features go into modeling.

In [None]:
X_train[['DIS']]

In [32]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    # Implement the __init__ method.
    def __init__(self,columns = 'all'):
        self.columns = columns
    # Our ColumnSelector must be able to receive a parameter columns.
    # The default value for columns must be set to 'all', so we can
    # initialize it without any explicit parameters.
    # YOUR CODE HERE
    #raise NotImplementedError()
        
    # There's no need for a fit method in this case, it does nothing.
    # We should be able to call fit without any explicit parameters.
    # Meaning: we should be able to call ColumnSelector.fit().
   
    def fit(self,X=None, y=None,**fit_params):
        return self
        
        

    # Transform should return all columns if the parameter columns we
    # passed upon initialization is equal to 'all'. If a column or a
    # list of columns are passed, only those should be returned.
    # YOUR CODE HERE
    def transform(self,X=None):
        x_copy = X.copy()
        if self.columns == 'all':
            return x_copy
        else:
            return x_copy[columns]
    
        
        
    # YOUR CODE HERE
    #raise NotImplementedError()        

cols = ['CRIM', 'DIS', 'INDUS', 'RM', 'DIS', 'TAX', 'B']
selector = ColumnSelector(columns=cols)
X_train__ = selector.fit_transform(X_train_)
X_test__ = selector.transform(X_test_)
X_test__

KeyError: "['AGE', 'B', 'CHAS', 'CRIM', 'DIS', 'INDUS', 'LSTAT', 'NOX', 'PTRATIO', 'RAD', 'RM', 'TAX', 'ZN']"

In [None]:
assert(ColumnSelector())
assert(selector.fit())

shape = str(X_train__.shape)
expected_hash = '5d4f688e84beb21ec07f136c16a6cc11318d4f5de7b81bf0232e5282d9834123'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(sorted(X_train__.columns.values))
expected_hash = 'cdcff8ef5982ac9ff530dae2b67e0e3c068a4a50ea6c7dbf54b5d9341748af78'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

shape = str(X_test__.shape)
expected_hash = '0aba1c19151f76aa2ecb00fd75be05c6f73860573972e967f3d1fe1c44ae2629'
assert hashlib.sha256(shape.encode()).hexdigest() == expected_hash

columns = str(sorted(X_test__.columns.values))
expected_hash = 'cdcff8ef5982ac9ff530dae2b67e0e3c068a4a50ea6c7dbf54b5d9341748af78'
assert hashlib.sha256(columns.encode()).hexdigest() == expected_hash

## 10 Building the pipeline (graded)

Finally, we want to use the two transformers together and run a linear regression on top.

In [None]:
# Create a pipeline including:
#   1 - 'selector', ColumSelector(columns=cols)
#   2 - 'min_max', MinMaxScaler() with same range as above
#   3 - 'model', LinearRegression
# YOUR CODE HERE
pipeline = Pipeline([('selector', ColumnSelector(columns=cols)),
                     ('min_max', MinMaxScaler(feature_range=(0,5))),
                     ('model', LinearRegression())])
#raise NotImplementedError()


pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('MSE: {}'.format(mse))
print('MAE: {}'.format(mae))

In [None]:
assert type(pipeline) == Pipeline
assert type(pipeline.named_steps['selector']) == ColumnSelector
assert type(pipeline.named_steps['min_max']) == MinMaxScaler
assert pipeline.named_steps['min_max'].get_params()['feature_range'] == (0,5)
assert type(pipeline.named_steps['model']) == LinearRegression 

Exercises complete, congratulations! You are about to become a certified data wrangler.