### Label Encoder

In [13]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [14]:
# Load the iris data; the CSV's first column holds the row labels, so it becomes the index.
data = pd.read_csv("iris.csv", index_col=0)

In [15]:
# Peek at the last rows to confirm the file loaded as expected.
data.tail()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
146,6.7,3.0,5.2,2.3,virginica
147,6.3,2.5,5.0,1.9,virginica
148,6.5,3.0,5.2,2.0,virginica
149,6.2,3.4,5.4,2.3,virginica
150,5.9,3.0,5.1,1.8,virginica


In [16]:
# Work on an independent copy so that later changes to data1 do not affect `data`.
data1 = data.copy(deep=True)

In [None]:
# Encode the target (Species, i.e. the y variable) as integer class codes.
labelencoder = LabelEncoder()
species_col = data1.columns[-1]  # the last column holds the labels
# Assign by column label instead of `data1.iloc[:, -1] = ...`: recent pandas
# versions try to write positional assignments into the existing string
# column in place (keeping its non-numeric dtype, with a FutureWarning on
# some versions), whereas label assignment replaces the whole column so it
# takes the integer dtype produced by the encoder.
data1[species_col] = labelencoder.fit_transform(data1[species_col])

In [18]:
# Inspect the result: Species now holds the integer codes instead of the names.
data1

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,2
147,6.3,2.5,5.0,1.9,2
148,6.5,3.0,5.2,2.0,2
149,6.2,3.4,5.4,2.3,2


### One Hot Encoder

#### Using sklearn

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
# Start again from the untouched frame (a copy is enough, no need to re-read iris.csv).
data2 = data.copy(deep=True)

In [None]:
# Show the frame that is about to be one-hot encoded.
data2

In [22]:
# Create a one-hot encoder with its default settings.
# NOTE(review): how unknown categories are handled during transform is governed
# by the encoder's `handle_unknown` option, which is not set here - confirm the
# default is what you want.
OHE = OneHotEncoder()

In [23]:
# One-hot encode Species and turn the encoder output into a dense array
# (one indicator column per species).
# The frame reuses data2's index: data2 is labelled 1..150 (index_col=0 on
# load) while a default index would be 0..149, and the index-based join that
# follows would then attach every encoding to the wrong row and leave the
# last row without a match.
enc_df = pd.DataFrame(
    OHE.fit_transform(data2[['Species']]).toarray(),
    index=data2.index,
)

In [None]:
# Inspect the encoded indicator columns.
enc_df

In [None]:
# Attach the encoded columns to the four measurement columns.
# NOTE: DataFrame.join matches rows on index labels, so enc_df has to carry
# the same index as data2 for the rows to line up.
data_final = data2.iloc[:, :4].join(enc_df, how='left')
data_final

#### Using Pandas

In [26]:
import pandas as pd

In [27]:
# Fresh copy of the loaded frame for the pandas-based encoding (no re-read needed).
data3 = data.copy(deep=True)

In [28]:
# One-hot encode with pandas: the Species column is expanded into
# Species_* indicator columns, the numeric columns are kept as they are.
data_encoded = pd.get_dummies(data=data3)

In [None]:
# Inspect the one-hot encoded frame.
data_encoded

### IsolationForest

In [30]:
# Isolation Forest example - useful for anomaly (outlier) detection
from sklearn.ensemble import IsolationForest
import pandas as pd

In [None]:

# Toy data set: four ordinary (Marks, Time) rows and one extreme row (800, 60).
df = pd.DataFrame(dict(
    Marks=[50, 80, 82, 95, 800],
    Time=[1, 2, 3, 1.5, 60],
))
df

In [None]:
# Which row is the outlier here?
# contamination=0.1 -> about 10 % of the rows are expected to be outliers;
# random_state is fixed so repeated runs give the same forest.
iso = IsolationForest(contamination=0.1, random_state=10)
out = iso.fit(df).predict(df)

In [33]:
# predict() labels: 1 = inlier, -1 = outlier; only the last row is flagged here.
out

array([ 1,  1,  1,  1, -1])

In [34]:
# Store the 1 / -1 label next to each row.
# NOTE: this adds a column to df in place; re-running the fit cell afterwards
# would feed 'Score' to the model as an extra feature.
df['Score']=out
df

Unnamed: 0,Marks,Time,Score
0,50,1.0,1
1,80,2.0,1
2,82,3.0,1
3,95,1.5,1
4,800,60.0,-1


In [35]:
from sklearn.ensemble import IsolationForest
import pandas as pd

In [36]:
# Reload iris (first CSV column as index) and one-hot encode Species with pandas.
data = pd.read_csv("iris.csv", index_col=0)
data_encoded = pd.get_dummies(data=data)

In [37]:
# Inspect the encoded frame: four measurements plus three Species_* indicator columns.
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,0,0,1
147,6.3,2.5,5.0,1.9,0,0,1
148,6.5,3.0,5.2,2.0,0,0,1
149,6.2,3.4,5.4,2.3,0,0,1


In [None]:
# Outliers can also be spotted with percentiles (a boxplot is another option):
# compare the extreme percentiles with min / max in the summary.
petal_width = data_encoded['Petal.Width']
petal_width.describe(percentiles=[0.01, 0.02, 0.9, 0.99, 0.991, 0.992])

In [None]:
# Train the model.
# contamination: the share of outliers you expect in the data set,
# e.g. in the health-care domain it will be very low (1 % or less): 0.01 or 0.001.
clf = IsolationForest(contamination=0.01, random_state=10)
clf.fit(data_encoded)

In [40]:
# Label every row with the fitted model.
y_pred_outliers = clf.predict(X=data_encoded)

In [None]:
# -1 marks outliers, 1 marks inliers.
y_pred_outliers

In [42]:
# Frame before the artificial outlier row is added.
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,0,0,1
147,6.3,2.5,5.0,1.9,0,0,1
148,6.5,3.0,5.2,2.0,0,0,1
149,6.2,3.4,5.4,2.3,0,0,1


In [43]:
# Add one new, clearly extreme data point (row label 151) as a known outlier.
# Value order: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width,
# then the three Species_* indicators.
outlier_row = [20, 40, 30, 50, 1, 0, 0]
data_encoded.loc[151] = outlier_row

In [44]:
# Frame after adding the artificial outlier row.
data_encoded

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
147,6.3,2.5,5.0,1.9,0,0,1
148,6.5,3.0,5.2,2.0,0,0,1
149,6.2,3.4,5.4,2.3,0,0,1
150,5.9,3.0,5.1,1.8,0,0,1


In [None]:
# Refit on the frame that now contains the added row, then label every row.
clf = IsolationForest(contamination=0.01, random_state=10).fit(data_encoded)
y_pred_outliers = clf.predict(data_encoded)
y_pred_outliers  # expected: the last entry is -1, i.e. the added row is an outlier

In [None]:
# data_encoded['scores']=clf.decision_function(data_encoded)

In [46]:
# Store the 1 / -1 label next to each row so the outliers can be filtered out exactly.
# Only the first seven columns are passed to predict(), so the call still
# sees the same columns once 'anomaly' exists as an eighth column.
data_encoded['anomaly'] = clf.predict(data_encoded.iloc[:, :7])

In [None]:
# Frame with the new 'anomaly' label column.
data_encoded

In [None]:
# Show only the rows labelled as outliers (anomaly == -1);
# the labels come from the IsolationForest model's predict() call.
is_outlier = data_encoded['anomaly'] == -1
data_encoded.loc[is_outlier]

### Predictive Power Score (PPS)

In [None]:
#install the package
!pip install ppscore

In [50]:
import ppscore as pps

In [None]:
# weighted F1 Score concept
#  https://towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1

In [None]:
# Reminder of the columns available for the PPS calls below.
data.head()

In [None]:
# Syntax: pps.score(df, "feature_column", "target_column")
# Sepal.Length -> Petal.Length scores about 0.55: a moderate predictive relationship.
pps.score(data, x="Sepal.Length", y="Petal.Length")

In [None]:
# Calculate the whole PPS matrix (every column scored against every other column).
# Petal.Width -> Species is the most important relationship, with a PPS of about 0.928.
pps.matrix(data)

In [53]:
# Rank all feature -> target pairs by predictive power score.
# NOTE: this rebinds `df`, which earlier held the Marks/Time toy frame.
df = pd.DataFrame(pps.matrix(data))
# Drop the trivial "column predicts itself" rows and keep the descriptive
# columns by name; selecting them by position (0, 1, 2, 3, 5, 8) would
# silently pick other columns if the layout of the ppscore output changed.
(
    df.loc[df['case'] != 'predict_itself',
           ['x', 'y', 'ppscore', 'case', 'metric', 'model']]
      .sort_values('ppscore', ascending=False)
)

Unnamed: 0,x,y,ppscore,case,metric,model
19,Petal.Width,Species,0.927652,classification,weighted F1,DecisionTreeClassifier()
14,Petal.Length,Species,0.884812,classification,weighted F1,DecisionTreeClassifier()
17,Petal.Width,Petal.Length,0.798274,regression,mean absolute error,DecisionTreeRegressor()
22,Species,Petal.Length,0.785393,regression,mean absolute error,DecisionTreeRegressor()
23,Species,Petal.Width,0.755749,regression,mean absolute error,DecisionTreeRegressor()
13,Petal.Length,Petal.Width,0.744945,regression,mean absolute error,DecisionTreeRegressor()
2,Sepal.Length,Petal.Length,0.550423,regression,mean absolute error,DecisionTreeRegressor()
10,Petal.Length,Sepal.Length,0.525617,regression,mean absolute error,DecisionTreeRegressor()
4,Sepal.Length,Species,0.471649,classification,weighted F1,DecisionTreeClassifier()
3,Sepal.Length,Petal.Width,0.431739,regression,mean absolute error,DecisionTreeRegressor()
