In [1]:
from sklearn import preprocessing
import pandas as pd
import numpy as np
import random

### Finding Missing Values

In [2]:
# Toy 3x2 frame with one missing entry in each of the first two rows.
X = pd.DataFrame(
    [
        [np.nan, 2],
        [6, np.nan],
        [7, 6],
    ]
)
# Boolean mask of the same shape: True wherever a value is missing.
X.isnull()

Unnamed: 0,0,1
0,True,False
1,False,True
2,False,False


### Filling Missing Values

In [3]:
import numpy as np
from sklearn.impute import SimpleImputer
# Toy matrix with one NaN in each of the first two rows.
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
]
# Imputer that replaces every NaN with the mean of its column.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
X

[[nan, 2], [6, nan], [7, 6]]

In [4]:
# Learn the column means from X, then fill its NaNs with them.
print(imp.fit(X).transform(X))

[[6.5 2. ]
 [6.  4. ]
 [7.  6. ]]


### Standardization using Z-Score

In [5]:
# Draw 20 distinct integers from 0..99 as toy data. A dedicated, seeded
# generator makes the sample identical on every Restart & Run All without
# touching the global `random` state used elsewhere.
X_train = random.Random(42).sample(range(100), 20)
X_train

[95, 21, 15, 86, 30, 33, 31, 12, 40, 2, 43, 72, 61, 54, 50, 7, 71, 0, 39, 75]

In [6]:
# Z-score the sample: centre on the mean, then divide by the standard deviation.
X_scaled = preprocessing.scale(X_train, with_mean=True, with_std=True)
X_scaled

array([ 1.93033259, -0.75724242, -0.97515391,  1.60346536, -0.43037519,
       -0.32141944, -0.39405661, -1.08410965, -0.06718938, -1.44729546,
        0.04176637,  1.09500522,  0.69550083,  0.44127076,  0.29599644,
       -1.26570256,  1.05868664, -1.51993263, -0.10350796,  1.20396097])

### Encoding categorical features

In [7]:
# Learn an integer code for each distinct city name. The encoder is created
# and fitted once; repeated labels in the input collapse to a single class.
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])

# The learned classes; a label's integer code is its position in this list.
list(le.classes_)


['amsterdam', 'paris', 'tokyo']

In [8]:
# Encode new labels with the integer codes learned by the fitted encoder.
le.transform(["tokyo", "tokyo", "paris"])

array([2, 2, 1])

### Binning

In [9]:
from sklearn.preprocessing import KBinsDiscretizer 
# Four samples, four numeric features.
X = [
    [-2, 1, -4, -1],
    [-1, 2, -3, -0.5],
    [0, 3, -2, 0.5],
    [1, 4, -1, 2],
]
# Split each column's range into three equal-width bins and replace every
# value with the ordinal index (0, 1, 2) of the bin it falls into.
est = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3)
Xt = est.fit_transform(X)
Xt

array([[0., 0., 0., 0.],
       [1., 1., 1., 0.],
       [2., 2., 2., 1.],
       [2., 2., 2., 2.]])

### Test Train Split

In [10]:
import seaborn as sns
# Load seaborn's "tips" example dataset and preview its first five rows.
data = sns.load_dataset(name="tips")
data.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [11]:
# Predictors: the bill and the tip. Target: the party size.
features = ['total_bill', 'tip']
X, y = data[features], data['size']
X.shape

(244, 2)

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.33)

In [13]:
# Row/column count of the training partition produced by train_test_split.
X_train.shape

(163, 2)

### Available Models

https://scikit-learn.org/stable/supervised_learning.html

### Train a Model

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Shallow random forest (100 trees, depth <= 2) with a fixed seed so the
# fitted model is the same on every run.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

### Perform a Prediction

In [15]:
# Predict the target for the held-out rows with the fitted forest.
pred = regr.predict(X_test)

### Evaluation

In [16]:
from sklearn.metrics import mean_absolute_error
# Average absolute gap between the true and predicted values on the test split.
mean_absolute_error(y_true=y_test, y_pred=pred)

0.5986181563707077

Other metrics: https://scikit-learn.org/stable/modules/model_evaluation.html

### Parameter Tuning

In [17]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter candidates for the random forest: 2 x 3 x 2 = 12 combinations.
param_grid = {
    'n_estimators': [2, 50],
    # None means "consider every feature at each split", which is what the old
    # 'auto' alias did for regressors. 'auto' was deprecated in scikit-learn 1.1
    # and removed in 1.3, where passing it raises an error.
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [4, 5],
}
# Exhaustive search with 5-fold cross-validation on the training split only.
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
CV_rfc = GridSearchCV(estimator=regr, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)

# Best-scoring parameter combination found by the search.
CV_rfc.best_params_



{'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 2}