### 1. Getting data Ready

In [1]:

# Import pandas under the abbreviation 'pd'
###

import pandas as pd

# Import NumPy under the abbreviation 'np'
###

import numpy as np

In [2]:
# Import the heart disease dataset and save it to a variable

heart_disease = pd.read_csv("heart-disease.csv")

# Check the first 5 rows of the data

heart_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Create X (all columns except target)
X = heart_disease.drop("target", axis=1)

# Create y (only the target column)
y = heart_disease["target"]

In [4]:
# Import train_test_split from sklearn's model_selection module
from sklearn.model_selection import train_test_split

# Use train_test_split to split X & y into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

#### 2. Preparing a machine learning model

In [5]:
# Import the RandomForestClassifier from sklearn's ensemble module
from sklearn.ensemble import RandomForestClassifier

# Instantiate an instance of RandomForestClassifier as clf
clf = RandomForestClassifier()

#### 4 Fit the model and make predictions

In [6]:
# Fit the RandomForestClassifier to the training data
np.random.seed(42)

clf.fit(X_train, y_train)

In [7]:
# Use the fitted model to make predictions on the test data and
# save the predictions to a variable called y_preds
y_preds = clf.predict(X_test)


#### 4. Evaluating a model's predictions

In [8]:
# Evaluate the fitted model on the training set using the score() function

clf.score(X_train, y_train)

1.0

In [9]:
# Evaluate the fitted model on the test set using the score() function

clf.score(X_test, y_test)

0.8360655737704918

In [10]:
### Let us try a bunch of other ML algorithms offered by Scikit learn!

In [11]:
# Import LinearSVC from sklearn's svm module

from sklearn.svm import LinearSVC

# Import KNeighborsClassifier from sklearn's neighbors module

from sklearn.neighbors import KNeighborsClassifier

# Import SVC from sklearn's svm module

from sklearn.svm import SVC

# Import LogisticRegression from sklearn's linear_model module

from sklearn.linear_model import LogisticRegression

In [12]:
models = {"LinearSVC": LinearSVC,
          "KNN": KNeighborsClassifier,
          "SVC": SVC,
          "LogisticRegression": LogisticRegression,
          "RandomForestClassifier": RandomForestClassifier
         }

# Create an empty dictionary called results
results = {}

In [13]:
# Loop through the models dictionary items, fitting the model on the training data
# and appending the model name and model score on the test data to the results dictionary
np.random.seed(42)

for model_name, model_class in models.items():
    model = model_class()
    model.fit(X_train,y_train)
    results[model_name] = model.score(X_test,y_test)

# View the results
results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'LinearSVC': 0.7704918032786885,
 'KNN': 0.6721311475409836,
 'SVC': 0.7704918032786885,
 'LogisticRegression': 0.7704918032786885,
 'RandomForestClassifier': 0.8360655737704918}

In [14]:
# Create a pandas dataframe with the data as the values of the results dictionary,
# the index as the keys of the results dictionary and a single column called accuracy.
# Be sure to save the dataframe to a variable.
results_df = pd.DataFrame(results.values(), 
                          results.keys(), 
                          columns=["accuracy"])

In [15]:
results_df

Unnamed: 0,accuracy
LinearSVC,0.770492
KNN,0.672131
SVC,0.770492
LogisticRegression,0.770492
RandomForestClassifier,0.836066


#### 5 Hyperparameter Tuning

In [16]:
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

In [17]:
np.random.seed(42)
from sklearn.model_selection import RandomizedSearchCV
# Import RandomizedSearchCV from sklearn's model_selection module


# Setup an instance of RandomizedSearchCV with a LogisticRegression() estimator,

clf = LogisticRegression()

# our log_reg_grid as the param_distributions, a cv of 5 and n_iter of 5.
rs_log_reg = RandomizedSearchCV(estimator=clf,
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=5,
                                verbose=2)

# Fit the instance of RandomizedSearchCV
rs_log_reg.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END .........................C=0.0001, solver=liblinear; total time=   0.0s
[CV] END .........................C=0.0001, solver=liblinear; total time=   0.0s
[CV] END .........................C=0.0001, solver=liblinear; total time=   0.0s
[CV] END .........................C=0.0001, solver=liblinear; total time=   0.0s
[CV] END .........................C=0.0001, solver=liblinear; total time=   0.0s
[CV] END ...............C=1438.44988828766, solver=liblinear; total time=   0.0s
[CV] END ...............C=1438.44988828766, solver=liblinear; total time=   0.0s
[CV] END ...............C=1438.44988828766, solver=liblinear; total time=   0.0s
[CV] END ...............C=1438.44988828766, solver=liblinear; total time=   0.0s
[CV] END ...............C=1438.44988828766, solver=liblinear; total time=   0.0s
[CV] END ...............C=206.913808111479, solver=liblinear; total time=   0.0s
[CV] END ...............C=206.913808111479, solve

In [18]:
# Find the best parameters of the RandomizedSearchCV instance using the best_params_ attribute
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': np.float64(0.23357214690901212)}

In [19]:
rs_log_reg.score(X_test,y_test)

0.7868852459016393

In [20]:
# Instantiate a LogisticRegression classifier using the best hyperparameters from RandomizedSearchCV
clf_new = LogisticRegression(solver = 'liblinear' , C= np.float64(1438.44988828766))
np.random.seed(42)
# Fit the new instance of LogisticRegression with the best hyperparameters on the training data 
clf_new.fit(X_train, y_train)
clf_new.score(X_test, y_test)

0.7868852459016393

#### 6 Model Evaluating

In [21]:
# Import confusion_matrix and classification_report from sklearn's metrics module
from sklearn.metrics import confusion_matrix

# Import precision_score, recall_score and f1_score from sklearn's metrics module
from sklearn.metrics import precision_score, recall_score

In [22]:
y_preds = clf_new.predict(X_test)

In [23]:
y_preds

array([1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1])

In [24]:
precision = precision_score(y_preds, y_test)

recall = recall_score(y_preds, y_test)

score = clf_new.score(X_test,y_test)

In [25]:
precision

np.float64(0.8648648648648649)

In [26]:
score

0.7868852459016393

In [27]:
recall

np.float64(0.8)

In [28]:
conf_matrix = confusion_matrix(y_test, y_preds)

In [29]:
conf_matrix

array([[16,  8],
       [ 5, 32]])

In [30]:
# Import cross_val_score from sklearn's model_selection module
from sklearn.model_selection import cross_val_score

In [31]:
# EXAMPLE: By default cross_val_score returns 5 values (cv=5).
cross_val_score(clf_new, 
                X, 
                y, 
                scoring="accuracy",
                cv=5)

array([0.80327869, 0.8852459 , 0.83606557, 0.86666667, 0.76666667])

In [32]:
# Find the cross-validated precision
cross_val_score(clf_new, 
                X, 
                y, 
                scoring="precision",
                cv=5)

array([0.78378378, 0.90625   , 0.84848485, 0.83783784, 0.74358974])

In [33]:
# Find the cross-validated recall
cross_val_score(clf_new, 
                X, 
                y, 
                scoring="recall",
                cv=5)

array([0.87878788, 0.87878788, 0.84848485, 0.93939394, 0.87878788])

In [34]:
# Import the dump and load functions from the joblib library
from joblib import dump, load
dump(clf_new, filename="heart_disease_CLF")

['heart_disease_CLF']

### DONE!

### Now let us make a regression!

In [36]:
car_sales = pd.read_csv("car-sales-extended-missing-data.csv")

In [39]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [40]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [42]:
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [45]:
car_sales.dropna(subset=["Price"], inplace=True)

In [46]:
car_sales.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [47]:
# Import Pipeline from sklearn's pipeline module
from sklearn.pipeline import Pipeline

# Import ColumnTransformer from sklearn's compose module
from sklearn.compose import ColumnTransformer

# Import SimpleImputer from sklearn's impute module
from sklearn.impute import SimpleImputer

# Import OneHotEncoder from sklearn's preprocessing module
from sklearn.preprocessing import OneHotEncoder

# Import train_test_split from sklearn's model_selection module
from sklearn.model_selection import train_test_split

In [48]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [67]:
# Define different categorical features 
categorical_features = ["Make", "Colour"]

# Create categorical transformer Pipeline
categorical_transformer = Pipeline(steps=[
    # Set SimpleImputer strategy to "constant" and fill value to "missing"
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    # Set OneHotEncoder to ignore the unknowns
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Define Doors features
door_feature = ["Doors"]

# Create Doors transformer Pipeline
door_transformer = Pipeline(steps=[
    # Set SimpleImputer strategy to "constant" and fill value to 4
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))])

# Define numeric features (only the Odometer (KM) column)
numeric_features = ["Odometer (KM)"]

# Crearte numeric transformer Pipeline
numeric_transformer = Pipeline(steps=[
    # Set SimpleImputer strategy to fill missing values with the "Median"
    ("imputer", SimpleImputer(strategy="median"))
])

# Setup preprocessing steps (fill missing values, then convert to numbers)
preprocessor = ColumnTransformer(
    transformers=[
        # Use the categorical_transformer to transform the categorical_features
        ("cat", categorical_transformer, categorical_features),
        # Use the door_transformer to transform the door_feature
        ("door", door_transformer, door_feature),
        # Use the numeric_transformer to transform the numeric_features
        ("num", numeric_transformer, numeric_features)])

In [70]:
# Import Ridge from sklearn's linear_model module
from sklearn.linear_model import Ridge

# Import SVR from sklearn's svm module
from sklearn.svm import SVR

# Import RandomForestRegressor from sklearn's ensemble module
from sklearn.ensemble import RandomForestRegressor

In [87]:
# Create dictionary of model instances, there should be 4 total key, value pairs
# in the form {"model_name": model_instance}.
# Don't forget there's two versions of SVR, one with a "linear" kernel and the
# other with kernel set to "rbf".
regression_models = {"Ridge": Ridge(),
                     "SVR_linear": SVR(kernel="linear"),
                     "SVR_rbf": SVR(kernel="rbf"),
                     "RandomForestRegressor": RandomForestRegressor()}

# Create an empty dictionary for the regression results
regression_results = {}

In [74]:
# Create car sales X data (every column of car_sales except Price)
car_sales_X = car_sales.drop("Price", axis = 1)

# Create car sales y data (the Price column of car_sales)
car_sales_y = car_sales["Price"]

In [85]:
# Use train_test_split to split the car_sales_X and car_sales_y data into 
# training and test sets.
# Give the test set 20% of the data using the test_size parameter.
# For reproducibility set the random_state parameter to 42.
np.random.seed(42)
car_X_train, car_X_test, car_y_train, car_y_test = train_test_split(car_sales_X,
                                                                    car_sales_y,
                                                                    test_size=0.2,
                                                                    random_state=42)

# Check the shapes of the training and test datasets
car_X_train.shape

(760, 4)

In [79]:
car_X_test.shape

(190, 4)

In [88]:
# Loop through the items in the regression_models dictionary
for model_name, model in regression_models.items():
    
    # Create a model pipeline with a preprocessor step and model step
    model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                      ("model", model)])
    
    # Fit the model pipeline to the car sales training data
    print(f"Fitting {model_name}...")
    model_pipeline.fit(car_X_train, car_y_train)
    
    # Score the model pipeline on the test data appending the model_name to the 
    # results dictionary
    print(f"Scoring {model_name}...")
    regression_results[model_name] = model_pipeline.score(car_X_test, 
                                                          car_y_test)

Fitting Ridge...
Scoring Ridge...
Fitting SVR_linear...
Scoring SVR_linear...
Fitting SVR_rbf...
Scoring SVR_rbf...
Fitting RandomForestRegressor...
Scoring RandomForestRegressor...


In [92]:
# Import mean_absolute_error from sklearn's metrics module
from sklearn.metrics import mean_absolute_error

# Import mean_squared_error from sklearn's metrics module
from sklearn.metrics import mean_squared_error

# Import r2_score from sklearn's metrics module
from sklearn.metrics import r2_score

In [93]:
regression_results

{'Ridge': 0.2540261105794389,
 'SVR_linear': -0.489452821008145,
 'SVR_rbf': 0.0018546241516633755,
 'RandomForestRegressor': 0.21788451427725153}

In [100]:
# Create RidgeRegression Pipeline with preprocessor as the "preprocessor" and
# Ridge() as the "model".
ridge_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                 ("model", Ridge())])

# Fit the RidgeRegression Pipeline to the car sales training data
ridge_pipeline.fit(car_X_train, car_y_train)

# Make predictions on the car sales test data using the RidgeRegression Pipeline
car_y_preds = ridge_pipeline.predict(car_X_test)

# View the first 50 predictions
car_y_preds[:50]

array([18514.480873  , 22204.86580267, 11045.72092225,  6891.87687957,
        8793.93970278, 10926.96786577, 15267.76229871, 13834.80312146,
       20207.44377898, 14412.59968701,  6216.01228306, 16543.06707068,
       11783.69374936, 13492.13284255, 14321.17899086, 16425.45131776,
       15999.88230172,  9925.04252351, 11576.64865889, 11580.55130633,
       10647.22402588, 13113.54464064, 17874.90033257, 23447.29246732,
       11797.33019118, 14481.85336505, 18430.527126  , 14680.33044208,
       20585.63901269, 19953.91395769, 18163.1170151 , 22366.43390174,
       12435.69707467, 14265.25495748, 18351.62656383, 19564.58267503,
       12213.05747919, 12479.20772529, 18677.93478691, 11284.89072638,
       15278.54001605, 17378.81425165, 19247.77623181, 17343.90692672,
       15048.12797569, 12724.44477165, 12389.86021752,  8475.63778401,
       15255.49324105, 18602.79970861])

In [101]:
# EXAMPLE: Find the MSE by comparing the car sales test labels to the car sales predictions
mse = mean_squared_error(car_y_test, car_y_preds)
# Return the MSE
mse

np.float64(49950182.63337459)

In [102]:
# Find the MAE by comparing the car sales test labels to the car sales predictions
mae =  mean_absolute_error(car_y_test, car_y_preds)
# Return the MAE
mae

np.float64(5713.821520855156)

In [103]:
# Find the R^2 score by comparing the car sales test labels to the car sales predictions
r2 = r2_score(car_y_test, car_y_preds)
# Return the R^2 score
r2

0.2540261105794389