# scikit-learn

> Simple and efficient tools for predictive data analysis

> Accessible to everybody, and reusable in various contexts

> Built on NumPy, SciPy, and matplotlib

> Classification, Regression, Clustering 

> Dimensionality Reduction, Model Selection, Preprocessing 

# Anatomy of scikit-learn

In [1]:
# https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Scikit_Learn_Cheat_Sheet_Python.pdf
    
# import the packages
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# load the data set
iris = datasets.load_iris()


# Split the data into features and labels
X, y = iris.data[:, :2], iris.target

# split the dataset into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Scale the dataset to deal with outliers or imblance
scaler = preprocessing.StandardScaler().fit(X_train)

# Perform the transformation
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# set the modle with hyper-parameters
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

# fit the model
knn.fit(X_train, y_train)

# perform a prediction 
y_pred = knn.predict(X_test)

# test the accuracy
accuracy_score(y_test, y_pred)

0.631578947368421

# load a dataset

In [2]:

import pandas as pd
import numpy as np

epaMpg = pd.read_csv ("https://raw.githubusercontent.com/sqlshep/SQLShepBlog/master/data/epaMpg.csv")


In [3]:
epaMpg.head()

Unnamed: 0,RowNumber,Represented.Test.Veh.Make,Model,Vehicle.Type,HorsePower,Cylinders,Tested.Transmission.Type.Code,Tested.Transmission.Type,Gears,Drive.System.Code,Weight,AxleRatio,Test.Procedure.Cd,Test.Procedure.Description,Test.Fuel.Type.Cd,Test.Fuel.Type.Description,FuelEcon
0,1,Aston Martin,Rapide S,Car,552,12,SA,Semi-Automatic,8,R,4750,2.73,21,Federal fuel 2-day exhaust (w/can load),61,Tier 2 Cert Gasoline,17.3
1,2,Aston Martin,Vanquish,Car,568,12,SA,Semi-Automatic,8,R,4500,2.73,21,Federal fuel 2-day exhaust (w/can load),61,Tier 2 Cert Gasoline,16.5
2,3,BENTLEY,Continental GT,Car,616,12,SA,Semi-Automatic,8,F,6000,2.85,90,US06,61,Tier 2 Cert Gasoline,17.4
3,4,BENTLEY,Continental GT,Car,616,12,SA,Semi-Automatic,8,F,6000,2.85,11,Cold CO,27,Cold CO Premium (Tier 2),13.6
4,5,BMW,230i Convertible,Car,248,4,SA,Semi-Automatic,8,R,4000,2.81,3,HWFE,61,Tier 2 Cert Gasoline,45.8


In [4]:
epaMpg = epaMpg.drop(epaMpg.columns[[0]], axis=1)

epaMpg.columns = epaMpg.columns.str.replace('.', '_')

In [5]:
y = epaMpg.FuelEcon
y.shape

(1034,)

In [6]:
X = epaMpg.loc[:, epaMpg.columns != 'FuelEcon']
X.shape

(1034, 15)

In [7]:
X.columns 

Index(['Represented_Test_Veh_Make', 'Model', 'Vehicle_Type', 'HorsePower',
       'Cylinders', 'Tested_Transmission_Type_Code',
       'Tested_Transmission_Type', 'Gears', 'Drive_System_Code', 'Weight',
       'AxleRatio', 'Test_Procedure_Cd', 'Test_Procedure_Description',
       'Test_Fuel_Type_Cd', 'Test_Fuel_Type_Description'],
      dtype='object')

In [8]:
X = X[['Represented_Test_Veh_Make', 
'Vehicle_Type', 
'HorsePower',
'Cylinders', 
'Tested_Transmission_Type_Code', 
'Gears', 
'Drive_System_Code', 
'Weight',
'AxleRatio', 
'Test_Procedure_Cd', 
'Test_Fuel_Type_Cd']]


In [9]:
#pd.get_dummies(epaMpg['Represented_Test_Veh_Make'], prefix='Veh_Make')

X = pd.concat([X.drop('Represented_Test_Veh_Make', axis=1), pd.get_dummies(X['Represented_Test_Veh_Make'], prefix='Veh_Make')], axis=1)
X = pd.concat([X.drop('Vehicle_Type', axis=1), pd.get_dummies(X['Vehicle_Type'], prefix='Vehicle_Type')], axis=1)
X = pd.concat([X.drop('Tested_Transmission_Type_Code', axis=1), pd.get_dummies(X['Tested_Transmission_Type_Code'], prefix='Transmission_Type')], axis=1)
X = pd.concat([X.drop('Drive_System_Code', axis=1), pd.get_dummies(X['Drive_System_Code'], prefix='Drive_System_Code')], axis=1)

X

Unnamed: 0,HorsePower,Cylinders,Gears,Weight,AxleRatio,Test_Procedure_Cd,Test_Fuel_Type_Cd,Veh_Make_ACURA,Veh_Make_AUDI,Veh_Make_Alfa Romeo,...,Transmission_Type_AMS,Transmission_Type_CVT,Transmission_Type_M,Transmission_Type_SA,Transmission_Type_SCV,Drive_System_Code_4,Drive_System_Code_A,Drive_System_Code_F,Drive_System_Code_P,Drive_System_Code_R
0,552,12,8,4750,2.73,21,61,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,568,12,8,4500,2.73,21,61,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,616,12,8,6000,2.85,90,61,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,616,12,8,6000,2.85,11,27,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,248,4,8,4000,2.81,3,61,0,0,0,...,0,0,0,1,0,0,0,0,0,1
5,248,4,6,3625,3.91,21,61,0,0,0,...,0,0,1,0,0,0,0,0,0,1
6,248,4,8,3625,2.81,3,61,0,0,0,...,0,0,0,1,0,0,0,0,0,1
7,248,4,8,4000,2.81,31,61,0,0,0,...,0,0,0,1,0,0,0,0,0,1
8,248,4,8,3750,2.81,21,61,0,0,0,...,0,0,0,1,0,0,0,0,0,1
9,181,4,8,3625,3.20,31,61,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [11]:
from sklearn.linear_model import LinearRegression
    
lr = LinearRegression()

In [12]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [13]:
y_pred = lr.predict(X_test)

In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

5.009215188053212
45.78712797711447
0.5855900509013576


In [15]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

In [16]:
rf.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [17]:
y_pred = rf.predict(X_test)

In [18]:
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

2.4461521419378567
20.098427211621974
0.8180932378660207


# Grid search


In [20]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

rfc=RandomForestRegressor()

In [21]:
param_grid = { 
    'n_estimators': [1, 10, 100, 1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [1,2,3,4,5,6,7,8,9,10]
}

In [22]:
grid = RandomizedSearchCV(rfc, param_grid, n_jobs=-1, cv=5)
grid.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [1, 10, 100, 1000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [23]:
print(grid.best_score_ , grid.best_params_)


0.8309235734912586 {'n_estimators': 100, 'max_features': 'auto', 'max_depth': 5}


In [24]:
y_pred = grid.predict(X_test)

In [25]:
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))


2.97565150436003
24.993704404225753
0.7737870832361726
