### Label Encoder

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [2]:
data =pd.read_csv("iris.csv",index_col=0)

In [3]:
data.tail()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
146,6.7,3.0,5.2,2.3,virginica
147,6.3,2.5,5.0,1.9,virginica
148,6.5,3.0,5.2,2.0,virginica
149,6.2,3.4,5.4,2.3,virginica
150,5.9,3.0,5.1,1.8,virginica


In [4]:
#Changes does not affect the "data" dataframe
data1=data.copy()

In [5]:
labelencoder = LabelEncoder() # we want to label the Species column i.e.y variable
data1.iloc[:, -1] = labelencoder.fit_transform(data1.iloc[:,-1]) # -1: consider last column

In [6]:
data1

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,2
147,6.3,2.5,5.0,1.9,2
148,6.5,3.0,5.2,2.0,2
149,6.2,3.4,5.4,2.3,2


### One Hot Encoder

#### Using sklearn

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
#data2=pd.read_csv("iris.csv",index_col=0)
data2=data.copy()

In [16]:
data2

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,virginica
147,6.3,2.5,5.0,1.9,virginica
148,6.5,3.0,5.2,2.0,virginica
149,6.2,3.4,5.4,2.3,virginica


In [17]:
# creating instance of one-hot-encoder
OHE = OneHotEncoder()# Specifies the way unknown categories are handled during transform.

In [18]:
# convert it to array - to columns
enc_df = pd.DataFrame(OHE.fit_transform(data2[['Species']]).toarray())

In [19]:
enc_df

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
145,0.0,0.0,1.0
146,0.0,0.0,1.0
147,0.0,0.0,1.0
148,0.0,0.0,1.0


In [20]:
# merge with main df
data_final = data2.iloc[:,0:4].join(enc_df)
data_final

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,0,1,2
1,5.1,3.5,1.4,0.2,1.0,0.0,0.0
2,4.9,3.0,1.4,0.2,1.0,0.0,0.0
3,4.7,3.2,1.3,0.2,1.0,0.0,0.0
4,4.6,3.1,1.5,0.2,1.0,0.0,0.0
5,5.0,3.6,1.4,0.2,1.0,0.0,0.0
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,0.0,0.0,1.0
147,6.3,2.5,5.0,1.9,0.0,0.0,1.0
148,6.5,3.0,5.2,2.0,0.0,0.0,1.0
149,6.2,3.4,5.4,2.3,0.0,0.0,1.0


#### Using Pandas

In [21]:
import pandas as pd

In [22]:
#data3 =pd.read_csv("iris.csv",index_col=0)
data3=data.copy()

In [23]:
data_encoded=pd.get_dummies(data3)

In [24]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,True,False,False
2,4.9,3.0,1.4,0.2,True,False,False
3,4.7,3.2,1.3,0.2,True,False,False
4,4.6,3.1,1.5,0.2,True,False,False
5,5.0,3.6,1.4,0.2,True,False,False
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,False,False,True
147,6.3,2.5,5.0,1.9,False,False,True
148,6.5,3.0,5.2,2.0,False,False,True
149,6.2,3.4,5.4,2.3,False,False,True


### IsolationForest

In [25]:
# Isolation Forest example - useful for anomaly detection along with outliers
from sklearn.ensemble import IsolationForest
import pandas as pd

In [26]:

df=pd.DataFrame({
      'Marks':[50,80,82,95,800],
      'Time':[1,2,3,1.5,60]
})
df

Unnamed: 0,Marks,Time
0,50,1.0
1,80,2.0
2,82,3.0
3,95,1.5
4,800,60.0


In [27]:
# what is outlier here?
#iso=IsolationForest(random_state=10)# expecting 10 % outlier in dataset
iso=IsolationForest(random_state=10,contamination=0.1)
iso.fit(df)
out=iso.predict(df)



In [28]:
out # -1: outlier - last value is an outlier

array([ 1,  1,  1,  1, -1])

In [29]:
df['Score']=out
df

Unnamed: 0,Marks,Time,Score
0,50,1.0,1
1,80,2.0,1
2,82,3.0,1
3,95,1.5,1
4,800,60.0,-1


In [30]:
data =pd.read_csv("iris.csv",index_col=0)
data_encoded=pd.get_dummies(data)

In [31]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,True,False,False
2,4.9,3.0,1.4,0.2,True,False,False
3,4.7,3.2,1.3,0.2,True,False,False
4,4.6,3.1,1.5,0.2,True,False,False
5,5.0,3.6,1.4,0.2,True,False,False
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,False,False,True
147,6.3,2.5,5.0,1.9,False,False,True
148,6.5,3.0,5.2,2.0,False,False,True
149,6.2,3.4,5.4,2.3,False,False,True


In [32]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
# contamination: how much percentage of outliers you are expecting in dataset
# eg. in health care domain it will be very low (1% or less) 0.01 or 0.001
clf.fit(data_encoded)



In [33]:
# predictions
y_pred_outliers = clf.predict(data_encoded)

In [34]:
#-1 for outliers and 1 for inliers.
y_pred_outliers

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [35]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,True,False,False
2,4.9,3.0,1.4,0.2,True,False,False
3,4.7,3.2,1.3,0.2,True,False,False
4,4.6,3.1,1.5,0.2,True,False,False
5,5.0,3.6,1.4,0.2,True,False,False
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,False,False,True
147,6.3,2.5,5.0,1.9,False,False,True
148,6.5,3.0,5.2,2.0,False,False,True
149,6.2,3.4,5.4,2.3,False,False,True


In [36]:
## Let us add a  new data point which is outlier
data_encoded.loc[151]=[20,40,30,50,1,0,0]

In [37]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
147,6.3,2.5,5.0,1.9,0,0,1
148,6.5,3.0,5.2,2.0,0,0,1
149,6.2,3.4,5.4,2.3,0,0,1
150,5.9,3.0,5.1,1.8,0,0,1


In [38]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
clf.fit(data_encoded)
# predictions
y_pred_outliers = clf.predict(data_encoded)
y_pred_outliers # last has -1 so its an outlier



array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1])

In [None]:
# data_encoded['scores']=clf.decision_function(data_encoded)

In [39]:
data_encoded['anomaly']=clf.predict(data_encoded.iloc[:,0:7])
# we can pinpoint those outliers exactly by applying this filtering score

In [40]:
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica,anomaly
1,5.1,3.5,1.4,0.2,1,0,0,1
2,4.9,3.0,1.4,0.2,1,0,0,1
3,4.7,3.2,1.3,0.2,1,0,0,1
4,4.6,3.1,1.5,0.2,1,0,0,1
5,5.0,3.6,1.4,0.2,1,0,0,1
...,...,...,...,...,...,...,...,...
147,6.3,2.5,5.0,1.9,0,0,1,1
148,6.5,3.0,5.2,2.0,0,0,1,1
149,6.2,3.4,5.4,2.3,0,0,1,1
150,5.9,3.0,5.1,1.8,0,0,1,1


In [41]:
#Print the outlier data points
data_encoded[data_encoded['anomaly']==-1]# scores are given by decision tree

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica,anomaly
107,4.9,2.5,4.5,1.7,0,0,1,-1
151,20.0,40.0,30.0,50.0,1,0,0,-1


### PPS score

In [1]:
#install the package
!pip install ppscore



In [2]:
import ppscore as pps
import pandas as pd

In [3]:
data =pd.read_csv("iris.csv",index_col=0)

In [None]:
# weighted F1 Score concept
#  https://towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1

In [4]:
data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [5]:
# Syntax: pps.score(df, "feature_column", "target_column")
pps.score(data, "Sepal.Length", "Petal.Length") # ppscore:0.55 so ok kind of score

{'x': 'Sepal.Length',
 'y': 'Petal.Length',
 'ppscore': 0.550422595049248,
 'case': 'regression',
 'is_valid_score': True,
 'metric': 'mean absolute error',
 'baseline_score': 1.4886666666666668,
 'model_score': 0.6692708968366863,
 'model': DecisionTreeRegressor()}

In [6]:
#calculate the whole PPS matrix
pps.matrix(data)# petal width with species is most imp varaible with pps score 0.9276

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Sepal.Length,Sepal.Length,1.0,predict_itself,True,,0.0,1.0,
1,Sepal.Length,Sepal.Width,0.0,regression,True,mean absolute error,0.330667,0.364704,DecisionTreeRegressor()
2,Sepal.Length,Petal.Length,0.550423,regression,True,mean absolute error,1.488667,0.669271,DecisionTreeRegressor()
3,Sepal.Length,Petal.Width,0.431739,regression,True,mean absolute error,0.644667,0.366339,DecisionTreeRegressor()
4,Sepal.Length,Species,0.471649,classification,True,weighted F1,0.353333,0.658333,DecisionTreeClassifier()
5,Sepal.Width,Sepal.Length,0.006966,regression,True,mean absolute error,0.684667,0.679897,DecisionTreeRegressor()
6,Sepal.Width,Sepal.Width,1.0,predict_itself,True,,0.0,1.0,
7,Sepal.Width,Petal.Length,0.172375,regression,True,mean absolute error,1.488667,1.232058,DecisionTreeRegressor()
8,Sepal.Width,Petal.Width,0.132858,regression,True,mean absolute error,0.644667,0.559017,DecisionTreeRegressor()
9,Sepal.Width,Species,0.156915,classification,True,weighted F1,0.353333,0.454805,DecisionTreeClassifier()


In [7]:
df=pd.DataFrame(pps.matrix(data))
df[df.case!='predict_itself'].iloc[:,[0,1,2,3,5,8]].sort_values('ppscore',ascending=False)
# PPScore is highest in petal.width and species i.e. 0.927652 i.e. petal width is very good feature in predicting species

Unnamed: 0,x,y,ppscore,case,metric,model
19,Petal.Width,Species,0.927652,classification,weighted F1,DecisionTreeClassifier()
14,Petal.Length,Species,0.884812,classification,weighted F1,DecisionTreeClassifier()
17,Petal.Width,Petal.Length,0.798274,regression,mean absolute error,DecisionTreeRegressor()
22,Species,Petal.Length,0.785393,regression,mean absolute error,DecisionTreeRegressor()
23,Species,Petal.Width,0.755749,regression,mean absolute error,DecisionTreeRegressor()
13,Petal.Length,Petal.Width,0.744945,regression,mean absolute error,DecisionTreeRegressor()
2,Sepal.Length,Petal.Length,0.550423,regression,mean absolute error,DecisionTreeRegressor()
10,Petal.Length,Sepal.Length,0.525617,regression,mean absolute error,DecisionTreeRegressor()
4,Sepal.Length,Species,0.471649,classification,weighted F1,DecisionTreeClassifier()
3,Sepal.Length,Petal.Width,0.431739,regression,mean absolute error,DecisionTreeRegressor()
