In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from collections import defaultdict
#import re

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import collections


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from pathlib import Path
import time

sns.set()

**Dataset**

- [3-2018_Financial_Data](./data/2018_Financial_Data.csv) : Stock goes up or down

``` markdown
We will sub-sample and the consider going down as outlier therefore:
- 0 : Stock goes up
- 1 : Stock goes down
``` 


``` markdown
This dataset (.csv) collects 200+ financial indicators for all the stocks of the US stock market. The financial indicators have been scraped from Financial Modeling Prep API, and are those found in the 10-K filings that publicly traded companies release yearly.

The last column of the dataset represent the class of each stock, where:

if the value of a stock increases during 2015, then class=1;
if the value of a stock decreases during 2015, then class=0.
In other words, stocks that belong to class 1 are stocks that one should buy at the start of year 2015, and sell at the end of year 2015.

This dataset has been developed in order to understand whether or not it is possible to classify the future performance of a stock by looking at the financial information released in the 10-K filings.
```

( more : https://www.kaggle.com/code/prayankkul/complete-financial-analysis)

In [2]:
data_dir = Path('data')
work_with = "2018_Financial_Data.csv"

df = pd.read_csv(data_dir / work_with, encoding='utf-8')
#Impute missing values with 0 since it means the company have no value for that year
df.fillna(0,inplace=True)
df.dropna(inplace=True, axis=0, how='any')

# df.describe()
#df.info()


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector,2019 PRICE VAR [%],Class
0,CMCSA,94507000000.0,0.1115,0.0,94507000000.0,0.0,64822000000.0,75498000000.0,19009000000.0,3542000000.0,...,0.257,0.0,0.3426,0.0722,0.7309,0.0,0.1308,Consumer Cyclical,32.794573,1
1,KMI,14144000000.0,0.032,7288000000.0,6856000000.0,0.0,601000000.0,3062000000.0,3794000000.0,1917000000.0,...,0.0345,-0.092,-0.0024,0.0076,-0.0137,0.0,-0.1265,Energy,40.588068,1
2,INTC,70848000000.0,0.1289,27111000000.0,43737000000.0,13543000000.0,6750000000.0,20421000000.0,23316000000.0,-126000000.0,...,0.1989,0.0387,0.0382,0.1014,-0.0169,0.039,-0.0942,Technology,30.295514,1
3,MU,30391000000.0,0.4955,12500000000.0,17891000000.0,2141000000.0,813000000.0,2897000000.0,14994000000.0,342000000.0,...,0.4573,0.1511,0.2275,0.6395,-0.5841,0.1738,0.0942,Technology,64.213737,1
4,GE,121615000000.0,0.0285,95461000000.0,26154000000.0,0.0,18111000000.0,40711000000.0,-14557000000.0,5059000000.0,...,-0.2781,-0.2892,-0.1575,-0.4487,-0.2297,0.0,0.0308,Industrials,44.75784,1


In [4]:
# data distribution 
# 0 - increased, 1 - otherwise
# df.drop(['isFlaggedFraud'], inplace=True, axis=1)


print(f'before:{df["Class"].value_counts()}')

#switch classes 
df["Class"] = df["Class"].map({1: 0, 0: 1 })
print(f'After: {df["Class"].value_counts()}')

# sample non_increasing class
increased_df = df[df['Class'] == 0]
non_increased_df = df[df['Class'] == 1].sample(n=int(0.04*len(increased_df)), random_state=42)

df = pd.concat([increased_df, non_increased_df])
print(f'Sampled: {df["Class"].value_counts()}')

before:Class
1    3046
0    1346
Name: count, dtype: int64
After: Class
0    3046
1    1346
Name: count, dtype: int64
Sampled: Class
0    3046
1     121
Name: count, dtype: int64


In [5]:
# handle categorical data
cat_columns = df.select_dtypes(include=['object'], exclude=["number"]).columns
print(f"uniques val count:\n{df[[*cat_columns]].nunique()}")

uniques val count:
Unnamed: 0    3167
Sector          11
dtype: int64


In [6]:
# encode Sector type with one hot encoder
df = pd.get_dummies(df, columns=["Sector"], prefix=["S"],dtype="int8")

# # encode Unnamed: 0 and nameDest with label encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Unnamed: 0"] = le.fit_transform(df["Unnamed: 0"])
df.head(3)

Unnamed: 0.1,Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,S_Communication Services,S_Consumer Cyclical,S_Consumer Defensive,S_Energy,S_Financial Services,S_Healthcare,S_Industrials,S_Real Estate,S_Technology,S_Utilities
0,622,94507000000.0,0.1115,0.0,94507000000.0,0.0,64822000000.0,75498000000.0,19009000000.0,3542000000.0,...,0,1,0,0,0,0,0,0,0,0
1,1602,14144000000.0,0.032,7288000000.0,6856000000.0,0.0,601000000.0,3062000000.0,3794000000.0,1917000000.0,...,0,0,0,1,0,0,0,0,0,0
2,1479,70848000000.0,0.1289,27111000000.0,43737000000.0,13543000000.0,6750000000.0,20421000000.0,23316000000.0,-126000000.0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
target = "Class"

X = df.drop([target], axis=1)
y = df[target]

X.shape, y.shape

((3167, 234), (3167,))

In [8]:
# scaling the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X)

X.shape, y.shape

((3167, 234), (3167,))

In [9]:
# split the data into train and test
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     stratify=y,
#                                                     test_size=0.2,
#                                                     random_state=42)
X_train, X_test, y_train, y_test = X, X, y, y
# count fraud in train and test 
print(f"Train: \n {y_train.value_counts()}")
print(f"Test: \n {y_test.value_counts()} ")

Train: 
 Class
0    3046
1     121
Name: count, dtype: int64
Test: 
 Class
0    3046
1     121
Name: count, dtype: int64 


In [10]:
''' Helper functions '''

def get_scores(y_true, y_pred):
    scores = {
        'accuracy': round(accuracy_score(y_true, y_pred),2),
        'balanced': round(balanced_accuracy_score(y_true, y_pred),2),
        'F1': round(f1_score(y_true, y_pred),2),
        'precision': round(precision_score(y_true, y_pred),2),
        'recall': round(recall_score(y_true, y_pred),2),
        'roc_auc': round(roc_auc_score(y_true, y_pred),2),
        'pr_auc': round(average_precision_score(y_true, y_pred),2)
    }
    
    return scores

In [11]:
combined_scores = []

isolation forest

In [12]:
# isolation forest

from pyod.models.iforest import IForest

start_time = time.time()

clf_name = 'IForest'

clf = IForest()
clf.fit(X_train)

duration = round(time.time() - start_time,2)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)

Local Outlier Factor

In [13]:
#  Local Outlier Factor

from pyod.models.lof import LOF

start_time = time.time()

clf_name = 'LOF'
clf = LOF()
clf.fit(X_train)
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_  
y_test_pred = clf.predict(X_test)


scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)
# scores

ECOD

In [14]:
#  ECOD

from pyod.models.ecod import ECOD
start_time = time.time()


clf_name = 'ECOD'
clf = ECOD()
clf.fit(X_train)
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_  
y_test_pred = clf.predict(X_test)


scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)

Local Correlation Integral

In [15]:
# #  Local Correlation Integral (LOCI)

# from pyod.models.loci import LOCI
# start_time = time.time()


# clf_name = 'LOCI'
# clf = LOCI()
# clf.fit(X_train)
# duration = round(time.time() - start_time,2)

# # get the prediction labels
# y_train_pred = clf.labels_  
# y_test_pred = clf.predict(X_test)


# scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
# combined_scores.append(scores)

LSCP: Locally Selective Combination of Parallel Outlier Ensembles

In [16]:
# #  LSCP

# from pyod.models.lscp import LSCP
# from pyod.models.lof import LOF
# start_time = time.time()


# clf_name = 'LSCP'
# detector_list = [LOF(), LOF()]
# clf = LSCP(detector_list)
# clf.fit(X_train)
# duration = round(time.time() - start_time,2)

# # get the prediction labels
# y_train_pred = clf.labels_  
# y_test_pred = clf.predict(X_test)


# scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
# combined_scores.append(scores)

COPOD: Copula-Based Outlier Detection

In [17]:
#  COPOD: Copula-Based Outlier Detection

from pyod.models.copod import COPOD
start_time = time.time()


clf_name = 'COPOD'
clf = COPOD()
clf.fit(X_train)
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_  
y_test_pred = clf.predict(X_test)


scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)

ABOD: Angle-Based Outlier Detection

In [18]:
#  ABOD: Angle-Based Outlier Detection

from pyod.models.abod import ABOD
start_time = time.time()

clf_name = 'ABOD'
clf = ABOD()
clf.fit(X_train)
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_  
y_test_pred = clf.predict(X_test)


scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)

QMCD: Quasi-Monte Carlo Discrepancy outlier detection

In [19]:
# QMCD: Quasi-Monte Carlo Discrepancy outlier detection

from pyod.models.qmcd import QMCD
start_time = time.time()

clf_name = 'QMCD'
clf = QMCD()
clf.fit(X_train)
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_  
y_test_pred = clf.predict(X_test)


scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)


Rapid distance-based outlier detection via sampling

In [20]:
# MAD - Rapid distance-based outlier detection via sampling

from pyod.models.sampling import Sampling
start_time = time.time()

clf_name = 'Rapid distance-based'
clf = Sampling()
clf.fit(X_train)
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_  
y_test_pred = clf.predict(X_test)


scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)



## Book Keeping

In [21]:
# save the results
save_dir = Path('results')
save_dir.mkdir(exist_ok=True)
save_as = save_dir / f"3-{Path(work_with).stem}.csv"

scores_df = pd.DataFrame(combined_scores)
scores_df.to_csv(save_as, index=False)



In [22]:
display(scores_df)

Unnamed: 0,clf_name,accuracy,balanced,F1,precision,recall,roc_auc,pr_auc,duration
0,IForest,0.87,0.48,0.04,0.03,0.07,0.48,0.04,0.11
1,LOF,0.88,0.51,0.06,0.04,0.11,0.51,0.04,0.08
2,ECOD,0.87,0.5,0.05,0.03,0.09,0.5,0.04,0.36
3,COPOD,0.86,0.46,0.01,0.01,0.02,0.46,0.04,0.15
4,ABOD,0.86,0.51,0.07,0.05,0.13,0.51,0.04,1.01
5,QMCD,0.86,0.46,0.01,0.01,0.02,0.46,0.04,1.04
6,Rapid distance-based,0.87,0.5,0.05,0.04,0.1,0.5,0.04,0.01


``` markdown

# methods used
1. [Isolation Forest](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf)
2. ...
```