# https://deeplearningcourses.com/c/deep-learning-prerequisites-the-numpy-stack-in-python
# https://www.udemy.com/deep-learning-prerequisites-the-numpy-stack-in-python
# YouTube direct link: http://bit.ly/2LENC50

# Get the data from:
# https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise

In [1]:
# just in case we need it
import numpy as np
import pandas as pd

In [3]:
# Read the data set from disk.
# NOTE: ../large_files/ is where data files are usually kept.
# The file is tab-separated and has no header row (header=None).
df = pd.read_csv(
    '../large_files/airfoil_self_noise.dat',
    sep='\t',
    header=None,
)

In [4]:
# Quick sanity check of what was loaded.
df.head()
df.info()

# Columns 0 through 4 are the model inputs, as a NumPy array.
data = df[list(range(5))].values

# Column 5 is the value we want to predict.
target = df[5].values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1503 non-null   int64  
 1   1       1503 non-null   float64
 2   2       1503 non-null   float64
 3   3       1503 non-null   float64
 4   4       1503 non-null   float64
 5   5       1503 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 70.6 KB


In [5]:
# normally we would put all of our imports at the top
# but this lets us tell a story
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 33% of the rows as a test set so we can estimate how the model
# will perform on data it did not see during training.
# No random_state is passed here, so the split is not pinned to a seed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.33,
)

In [6]:
# instantiate a regressor and train it
from sklearn.linear_model import LinearRegression

In [9]:
# Create a linear regression model with default settings and fit it to the
# training split. The bare fit() call is the last expression in the cell,
# so the notebook displays the fitted estimator.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# evaluate the model's performance:
# first line is the score on the training set, second line on the test set
print(
    model.score(X_train, y_train),
    model.score(X_test, y_test),
    sep='\n',
)

0.5178174085574289
0.5093302963242687


In [11]:
# how you can make predictions: pass the held-out inputs to predict()
predictions = model.predict(X_test)

# what did we get?
# (a bare expression on the last line of a cell is displayed by the notebook)
predictions

array([133.09103598, 128.09463829, 114.23583955, 130.71720728,
       126.87377129, 127.80461105, 114.64524473, 122.20951344,
       122.15713823, 132.54714837, 122.78963455, 130.64543105,
       130.41579885, 121.01610784, 123.36819134, 122.21911935,
       129.3668704 , 133.21593625, 127.6987309 , 125.77511823,
       128.34871566, 124.22591775, 114.73113592, 126.23404937,
       126.14876802, 131.91809122, 122.01934171, 127.65090425,
       129.7628116 , 128.29778733, 118.26317918, 123.26405122,
       129.52756453, 131.32563954, 122.4765458 , 124.67222433,
       131.06754192, 126.75148746, 127.90954938, 120.57014736,
       119.81027742, 125.50628252, 125.82774719, 129.26535996,
       119.2573937 , 133.12921143, 118.50082589, 123.38691543,
       125.63707539, 120.34808467, 127.63975341, 125.51029424,
       128.06546884, 117.77263211, 116.79691774, 126.1258583 ,
       129.60193423, 124.93801031, 124.52513098, 125.34133076,
       120.14249289, 123.98722809, 131.00224314, 118.23

In [12]:
# we can even use random forest to solve the same problem!
from sklearn.ensemble import RandomForestRegressor

In [13]:
# Same workflow as the linear model: build a random forest regressor with
# default settings and fit it on the same (unscaled) training split.
model2 = RandomForestRegressor()
model2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [14]:
# evaluate the model's performance:
# first line is the score on the training set, second line on the test set
print(
    model2.score(X_train, y_train),
    model2.score(X_test, y_test),
    sep='\n',
)


0.98918106208871
0.919442618951517


In [15]:
# we can even use deep learning to solve the same problem!
from sklearn.neural_network import MLPRegressor

In [16]:
# you'll learn why scaling is needed in a later course
from sklearn.preprocessing import StandardScaler

In [17]:
# Standardize the inputs: fit the scaler on the training set only, then
# apply that same fitted transformation to the test set.
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.transform(X_test)

# Standardize the targets the same way. The targets are 1-D, so add a
# trailing axis before scaling and flatten back to 1-D afterwards.
scaler2 = StandardScaler()
y_train2 = scaler2.fit_transform(np.expand_dims(y_train, -1)).ravel()
# Use transform (not fit_transform) here: re-fitting on the test targets
# would leak test-set statistics and put y_test2 on a different scale
# than the y_train2 values the model is trained on.
y_test2 = scaler2.transform(np.expand_dims(y_test, -1)).ravel()


In [18]:
# Train a multi-layer perceptron regressor on the standardized inputs and
# targets, with the iteration cap set to 500.
# NOTE(review): this rebinds `model`, replacing the LinearRegression fitted
# earlier; the random forest used a separate name (model2).
model = MLPRegressor(max_iter=500)
model.fit(X_train2, y_train2)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=500,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [19]:
# evaluate the model's performance on the standardized data:
# first line is the score on the training set, second line on the test set
print(
    model.score(X_train2, y_train2),
    model.score(X_test2, y_test2),
    sep='\n',
)
# not as good as a random forest!
# but not as bad as linear regression

0.8458209973424983
0.8067769433655068
