In [146]:
import pandas as pd
import numpy as np
from sklearn import datasets

df = pd.DataFrame(datasets.load_boston().data, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 
                                                        'NOX', 'RM', 'AGE', 'DIS', 'RAD',
                                                        'TAX', 'PTRATIO', 'B', 'LSTAT'])

### Handling Outliers

In [147]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [148]:
# Dropping outliers gt\lt than a number
print(f"Before: {df['PTRATIO'].max()}")
df = df[df['PTRATIO'] < 20]
print(f"After: {df['PTRATIO'].max()}")

# keep only the ones that are within +3 to -3 standard deviations in the column 'Data'.
# df[np.abs(df["CRIM"] -df["CRIM"].mean()) <= (3 * df["CRIM"].std())]

# or if you prefer the other way around
# df[~(np.abs(df.Data-df.Data.mean())>(3*df.Data.std()))] 

Before: 22.0
After: 19.7


In [149]:
# Marking Outliers
df['LSTAT Outlier'] = np.where(df['LSTAT'] < 10, 0, 1)
df[4:9]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,LSTAT Outlier
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,0
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,0
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,1
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,1


In [150]:
# Rescaling Outliers
df['Log of LSTAT'] = [np.log(x) for x in df['LSTAT']]
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,LSTAT Outlier,Log of LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,0,1.60543
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,0,2.21266
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,0,1.393766
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,0,1.07841
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,0,1.673351


## Convert Pandas Categorical Data For Scikit-Learn

In [154]:
from sklearn import preprocessing

raw_data = {'patient': [1, 1, 1, 2, 2],
            'obs': [1, 2, 3, 1, 2],
            'treatment': [0, 1, 0, 1, 0],
            'score': ['strong', 'weak', 'normal', 'weak', 'strong'],
            'label': [1, 0, 0, 0, 1]}

df = pd.DataFrame(raw_data, columns=raw_data.keys())
y = df["label"]
X = df.drop(["label"], axis=1)

In [166]:
X = pd.get_dummies(X)
X

Unnamed: 0,patient,obs,treatment,score_normal,score_strong,score_weak
0,1,1,0,0,1,0
1,1,2,1,0,0,1
2,1,3,0,1,0,0
3,2,1,1,0,0,1
4,2,2,0,0,1,0


In [167]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

cross_val_score(RandomForestClassifier(50), X, y, scoring='accuracy')



array([ 1.,  1.,  1.])

In [168]:
clf = RandomForestClassifier(50)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [171]:
test_data = [1, 2, 0, "weak"]

test_data = pd.get_dummies(test_data)
clf.predict(test_data)

ValueError: Number of features of the model must match the input. Model n_features is 6 and input n_features is 4 

In [85]:
# Create a label (category) encoder object
le = preprocessing.LabelEncoder()

# Fit the encoder to the pandas column
le.fit(df['score'])

LabelEncoder()

In [86]:
# View the labels (if you want)
list(le.classes_)

['normal', 'strong', 'weak']

In [88]:
# Apply the fitted encoder to the pandas column
le.transform(df['score']) 

array([1, 2, 0, 2, 1])

In [87]:
# Convert some integers into their category names
list(le.inverse_transform([2, 2, 1]))

['weak', 'weak', 'strong']

### One-Hot Encode Nominal Categorical Features

In [89]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

# Create Data With One Class Label
x = np.array([['Texas'], 
              ['California'], 
              ['Texas'], 
              ['Delaware'], 
              ['Texas']])

In [91]:
# Method 1 - LabelBinzarizer
one_hot = LabelBinarizer()

print(one_hot.classes_)

# One-hot encode data
one_hot.fit_transform(x)

AttributeError: 'LabelBinarizer' object has no attribute 'classes_'

In [97]:
# Method 2 - Pandas Get Dummies
# Dummy feature
dums = pd.get_dummies(x[:,0])
dums.columns

Index(['California', 'Delaware', 'Texas'], dtype='object')

In [100]:
# method 3 - OneHotEncoder
enc = OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [98]:
enc.n_values_

array([2, 3, 4])

In [99]:
enc.feature_indices_

array([0, 2, 5, 9])

In [94]:
enc.transform([[0, 1, 1]]).toarray()

array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])