### Label Encoder

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [6]:
data =pd.read_csv("iris.csv",index_col=0)

In [None]:
data.tail()

In [8]:
#Changes does not affect the "data" dataframe
data1=data.copy()

In [None]:
labelencoder = LabelEncoder() # we want to label the Species column i.e.y variable
data1.iloc[:, -1] = labelencoder.fit_transform(data1.iloc[:,-1]) # -1: consider last column

In [None]:
data1

### One Hot Encoder

#### Using sklearn

In [22]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
#data2=pd.read_csv("iris.csv",index_col=0)
data2=data.copy()

In [None]:
data2

In [25]:
# creating instance of one-hot-encoder
OHE = OneHotEncoder()# Specifies the way unknown categories are handled during transform.

In [27]:
# convert it to array - to columns
enc_df = pd.DataFrame(OHE.fit_transform(data2[['Species']]).toarray())

In [28]:
enc_df

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
145,0.0,0.0,1.0
146,0.0,0.0,1.0
147,0.0,0.0,1.0
148,0.0,0.0,1.0


In [None]:
# merge with main df
data_final = data2.iloc[:,0:4].join(enc_df)
data_final

#### Using Pandas

In [30]:
import pandas as pd

In [31]:
#data3 =pd.read_csv("iris.csv",index_col=0)
data3=data.copy()

In [32]:
data_encoded=pd.get_dummies(data3)

In [None]:
data_encoded

### IsolationForest

In [34]:
# Isolation Forest example - useful for anomaly detection along with outliers
from sklearn.ensemble import IsolationForest
import pandas as pd

In [None]:

df=pd.DataFrame({
      'Marks':[50,80,82,95,800],
      'Time':[1,2,3,1.5,60]
})
df

In [None]:
# what is outlier here?
#iso=IsolationForest(random_state=10)# expecting 10 % outlier in dataset
iso=IsolationForest(random_state=10,contamination=0.1)
iso.fit(df)
out=iso.predict(df)

In [None]:
out # -1: outlier - last value is an outlier

In [None]:
df['Score']=out
df

In [39]:
data =pd.read_csv("iris.csv",index_col=0)
data_encoded=pd.get_dummies(data)

In [None]:
data_encoded

In [None]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
# contamination: how much percentage of outliers you are expecting in dataset
# eg. in health care domain it will be very low (1% or less) 0.01 or 0.001
clf.fit(data_encoded)

In [42]:
# predictions
y_pred_outliers = clf.predict(data_encoded)

In [43]:
#-1 for outliers and 1 for inliers.
y_pred_outliers

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [None]:
data_encoded

In [46]:
## Let us add a  new data point which is outlier
data_encoded.loc[151]=[20,40,30,50,1,0,0]

In [None]:
data_encoded

In [None]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
clf.fit(data_encoded)
# predictions
y_pred_outliers = clf.predict(data_encoded)
y_pred_outliers # last has -1 so its an outlier

In [None]:
# data_encoded['scores']=clf.decision_function(data_encoded)

In [49]:
data_encoded['anomaly']=clf.predict(data_encoded.iloc[:,0:7])
# we can pinpoint those outliers exactly by applying this filtering score

In [None]:
data_encoded

In [None]:
#Print the outlier data points
data_encoded[data_encoded['anomaly']==-1]# scores are given by decision tree

### PPS score

In [None]:
#install the package
!pip install ppscore

In [53]:
import ppscore as pps

In [None]:
# weighted F1 Score concept
#  https://towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1

In [None]:
data.head()

In [None]:
# Syntax: pps.score(df, "feature_column", "target_column")
pps.score(data, "Sepal.Length", "Petal.Length") # ppscore:0.55 so ok kind of score

In [56]:
#calculate the whole PPS matrix
pps.matrix(data)# petal width with species is most imp varaible with pps score 0.9276

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Sepal.Length,Sepal.Length,1.0,predict_itself,True,,0.0,1.0,
1,Sepal.Length,Sepal.Width,0.0,regression,True,mean absolute error,0.330667,0.364704,DecisionTreeRegressor()
2,Sepal.Length,Petal.Length,0.550423,regression,True,mean absolute error,1.488667,0.669271,DecisionTreeRegressor()
3,Sepal.Length,Petal.Width,0.431739,regression,True,mean absolute error,0.644667,0.366339,DecisionTreeRegressor()
4,Sepal.Length,Species,0.471649,classification,True,weighted F1,0.353333,0.658333,DecisionTreeClassifier()
5,Sepal.Width,Sepal.Length,0.006966,regression,True,mean absolute error,0.684667,0.679897,DecisionTreeRegressor()
6,Sepal.Width,Sepal.Width,1.0,predict_itself,True,,0.0,1.0,
7,Sepal.Width,Petal.Length,0.172375,regression,True,mean absolute error,1.488667,1.232058,DecisionTreeRegressor()
8,Sepal.Width,Petal.Width,0.132858,regression,True,mean absolute error,0.644667,0.559017,DecisionTreeRegressor()
9,Sepal.Width,Species,0.156915,classification,True,weighted F1,0.353333,0.454805,DecisionTreeClassifier()


In [57]:
df=pd.DataFrame(pps.matrix(data))
df[df.case!='predict_itself'].iloc[:,[0,1,2,3,5,8]].sort_values('ppscore',ascending=False)
# PPScore is highest in petal.width and species i.e. 0.927652 i.e. petal width is very good feature in predicting species

Unnamed: 0,x,y,ppscore,case,metric,model
19,Petal.Width,Species,0.927652,classification,weighted F1,DecisionTreeClassifier()
14,Petal.Length,Species,0.884812,classification,weighted F1,DecisionTreeClassifier()
17,Petal.Width,Petal.Length,0.798274,regression,mean absolute error,DecisionTreeRegressor()
22,Species,Petal.Length,0.785393,regression,mean absolute error,DecisionTreeRegressor()
23,Species,Petal.Width,0.755749,regression,mean absolute error,DecisionTreeRegressor()
13,Petal.Length,Petal.Width,0.744945,regression,mean absolute error,DecisionTreeRegressor()
2,Sepal.Length,Petal.Length,0.550423,regression,mean absolute error,DecisionTreeRegressor()
10,Petal.Length,Sepal.Length,0.525617,regression,mean absolute error,DecisionTreeRegressor()
4,Sepal.Length,Species,0.471649,classification,weighted F1,DecisionTreeClassifier()
3,Sepal.Length,Petal.Width,0.431739,regression,mean absolute error,DecisionTreeRegressor()
