# Anomaly Detection Models

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
#pd.set_option('max_columns', 200)
from sklearn.model_selection import train_test_split

In [32]:
path = '../data/data_preparation_output.csv'  # ASY local path
# path = "../data/skylab_instagram_datathon_dataset.csv" # gen
df = pd.read_csv(path) #sep='delimiter', header=None) # sep=";", 

## Data preparation for analysis

**Problem:** too many values to train algorithm

<div class="alert alert-block alert-danger">
<b>Assumption:</b> Different subsets of data (different industry, country) have the same distribution of the numberical variables => can take a random subset to train the algorithm
</div>

**Improvement potential**

1. Add class weights to the sampling with replacement pocedure
2. Impute missing value after exploring the structure of missingness (are they MCAR, MAR, or MNAR)
3. Train for a longer time on the cluster to use up to 70% of the data as the Training Set
4. Hyperparameter tuning on the CrossValidation set. 

**Steps**

1. Select only numeric variables
2. Select observations without missing values
3. Normalize
4. Select a random subset small enough to be able to train the selected algorithms

In [35]:
#df.columns
cols_to_ignore = ['compset', 'period_end_date', 'lag_1_date', 'lag_1_company']
cols_of_interest = ['followers', 'pictures', 'videos', 'comments',
       'likes', 'Year', 'Month', 
       'ultimate_parent_vs_legal_entity', 'ultimate_parent_vs_business_entity',
       'legal_entity_vs_business_entity', 'same_ownership',
       'total_involvement', 'total_company_activity', 'conversion_rate_total',
       'return_on_activity', 'ratio_of_videos', 'ratio_of_pictures',
       'likes_per_picture', 'likes_per_video', 'comments_per_picture',
       'comments_per_video',
       'business_entity_doing_business_as_name_cnt_industry',
       'business_entity_doing_business_as_name_cnt_industry_country',
       'followers_sum_industry', 'videos_sum_industry',
       'pictures_sum_industry', 'likes_sum_industry', 'comments_sum_industry',
       'followers_sum_industry_country', 'videos_sum_industry_country',
       'pictures_sum_industry_country', 'likes_sum_industry_country',
       'comments_sum_industry_country', 'fraction_followers_sum_industry',
       'fraction_followers_sum_industry_country',
       'fraction_videos_sum_industry', 'fraction_videos_sum_industry_country',
       'fraction_pictures_sum_industry',
       'fraction_pictures_sum_industry_country', 'fraction_likes_sum_industry',
       'fraction_likes_sum_industry_country', 'fraction_comments_sum_industry',
       'fraction_comments_sum_industry_country', 'lag_1_followers',
        'timediff_1', 'diff_1_followers',
       'lag_1_pictures', 'diff_1_pictures', 'lag_1_videos', 'diff_1_videos',
       'lag_1_comments', 'diff_1_comments'] 

In [37]:
df_analysis = df[cols_of_interest]
no_nans = df_analysis[~df_analysis.isnull().any(axis=1)]
#print(f"{len(df_analysis)} vs {len(no_nans)}")

In [43]:
import random
sample_size = 20000
perc = round(sample_size/len(no_nans), 3)
rand_idx = random.sample(list(no_nans.index), sample_size)
df_short = no_nans.loc[rand_idx]
print(f"Training the data on the {perc}% of the original set")

Training the data on the 0.109% of the original set


### SVM

https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html#sphx-glr-auto-examples-svm-plot-oneclass-py

In [79]:
from sklearn import svm

clf_svm = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
df_svm_short_pred = clf_svm.fit(df_short)
pd.Series(df_svm_short_pred).value_counts()

OneClassSVM(gamma=0.1, nu=0.1)    1
dtype: int64

In [80]:
df_svm_short_pred = clf_svm.predict(df_short)
#y_pred_outliers = clf.predict(X_outliers)
pd.Series(df_svm_short_pred).value_counts()

-1    12000
 1     8000
dtype: int64

### LOF

In [50]:
from sklearn.neighbors import LocalOutlierFactor

Train

In [51]:
clf_lof = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf_lof.fit(df_short)

LocalOutlierFactor(contamination=0.1, novelty=True)

Predict on the partial dataset

In [56]:
df_lof_short_pred = clf_lof.predict(df_short)



Hypeparameter tuning: grid search

<div class="alert alert-block alert-warning">
 Normally is done in a designated CV set to avoid overfitting, but we are using a small subset anyway (and its 10:08)
</div>

In [77]:
from itertools import product
n_neighbors = [5, 10, 20]
contamination = [0.1, 0.5]
compare = []

for neigh, cont in product(n_neighbors, contamination):
    print(f"{neigh} {cont}")
    clf_lof = LocalOutlierFactor(n_neighbors=neigh, novelty=True, contamination=cont)
    clf_lof.fit(df_short)
    df_lof_short_pred = clf_lof.predict(df_short)
    out = pd.Series(df_short_pred).value_counts()
    out["neigh"] = neigh
    out["cont"] = cont
    compare.append(out)
    

5 0.1




5 0.5




10 0.1




10 0.5




20 0.1




20 0.5




In [76]:
compare_lof_df = pd.DataFrame(compare)
compare_lof_df

Unnamed: 0,1,-1,neigh,cont
0,18234.0,1766.0,5.0,0.1
1,18234.0,1766.0,5.0,0.5
2,18234.0,1766.0,10.0,0.1
3,18234.0,1766.0,10.0,0.5
4,18234.0,1766.0,20.0,0.1
5,18234.0,1766.0,20.0,0.5


Predict on the full dataset: too long for now

In [57]:
#df_lof_pred = clf_lof.predict(no_nans)



### DBSCan

In [70]:
from sklearn import preprocessing

d = preprocessing.normalize(df_short)
scaled_df_short = pd.DataFrame(d, columns=df_short.columns)
#scaled_df_short.head()

In [26]:
from sklearn import metrics
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.3, min_samples=10).fit(scaled_df_short)
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 1
Estimated number of noise points: 7


In [30]:
#eps_set = [0.01, 0.1, 0.3, 0.5, 1]
eps_set = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1]

for eps in eps_set :
    db = DBSCAN(eps=eps, min_samples=10).fit(scaled_df_short)
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print(eps)
    print("Estimated number of clusters: %d" % n_clusters_)
    print("Estimated number of noise points: %d" % n_noise_)

0.01
Estimated number of clusters: 61
Estimated number of noise points: 4365
0.02
Estimated number of clusters: 46
Estimated number of noise points: 1961
0.03
Estimated number of clusters: 32
Estimated number of noise points: 1113
0.04
Estimated number of clusters: 18
Estimated number of noise points: 745
0.05
Estimated number of clusters: 13
Estimated number of noise points: 526
0.06
Estimated number of clusters: 13
Estimated number of noise points: 370
0.1
Estimated number of clusters: 6
Estimated number of noise points: 126


In [29]:
min_samples_set=[5, 10, 15]

for min_samples in min_samples_set:
    for 
    db = DBSCAN(eps=0.1, min_samples=min_samples).fit(scaled_df_short)
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print(min_samples)
    print("Estimated number of clusters: %d" % n_clusters_)
    print("Estimated number of noise points: %d" % n_noise_)

5
Estimated number of clusters: 10
Estimated number of noise points: 54
10
Estimated number of clusters: 6
Estimated number of noise points: 126
15
Estimated number of clusters: 3
Estimated number of noise points: 185


In [71]:
eps_set = [0.01, 0.05, 0.1]
min_samples_set = [5, 10, 15]


compare_dbscan = []

for eps, min_sample in product(eps_set, min_samples_set):
    print(f"{eps} {min_sample}")
    db = DBSCAN(eps=eps, min_samples=min_sample).fit(scaled_df_short)
    labels = db.labels_
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    out = pd.Series()
    out["eps"] = eps
    out["min_sample"] = min_sample
    out["pers_outl"] = n_noise_ / len(scaled_df_short)
    out["n_noise_"] = n_noise_
    compare_dbscan.append(out)
    
    

0.01 5


  out = pd.Series()


0.01 10


  out = pd.Series()


0.01 15


  out = pd.Series()


0.05 5


  out = pd.Series()


0.05 10


  out = pd.Series()


0.05 15


  out = pd.Series()


0.1 5


  out = pd.Series()


0.1 10


  out = pd.Series()


0.1 15


  out = pd.Series()


In [78]:
compare_dbscan_df = pd.DataFrame(compare_dbscan)
compare_dbscan_df

Unnamed: 0,eps,min_sample,pers_outl,n_noise_
0,0.01,5.0,0.23745,4749.0
1,0.01,10.0,0.34395,6879.0
2,0.01,15.0,0.4275,8550.0
3,0.05,5.0,0.0218,436.0
4,0.05,10.0,0.0379,758.0
5,0.05,15.0,0.0477,954.0
6,0.1,5.0,0.0044,88.0
7,0.1,10.0,0.00775,155.0
8,0.1,15.0,0.0142,284.0
