In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import *
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel
from pyspark import SparkContext
import numpy as np
import sys
from scipy.spatial.distance import mahalanobis
import get_outliers


In [3]:
spark = SparkSession \
        .builder \
        .appName("Nulls and Outliers Detection 1") \
        .getOrCreate()


In [4]:
df = spark.read.csv(path = 'GROUP7/bss9-579f_clean.tsv', header = True,inferSchema = True, sep='\t')

In [7]:
#df_temp.select([numeric_cols[2],*numeric_cols_temp])
df = df.withColumn('rid', monotonically_increasing_id())

In [5]:
numeric_cols = []
for col,dtype in df.dtypes:
    if 'string' not in dtype and col!='rid':
        numeric_cols.append(col)
numeric_cols_temp = numeric_cols[:2]

In [10]:
outliers = get_outliers.gmm_outliers(df,numeric_cols_temp)

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 57295)
Traceback (most recent call last):
  File "/Users/apple/anaconda3/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/apple/anaconda3/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/Users/apple/anaconda3/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/apple/anaconda3/lib/python3.6/socketserver.py", line 696, in __init__
    self.handle()
  File "/Users/apple/spark-2.3.0-bin-hadoop2.7/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/Users/apple/spark-2.3.0-bin-hadoop2.7/python/pyspark/serializers.py", line 685, in read_int
    raise EOFError
EOFError
-------------------------------------

Py4JError: An error occurred while calling o606.approxQuantile

In [28]:
outliers.count()

6

In [34]:
outliers.show()

+---+------------------+
|rid|            dist_c|
+---+------------------+
| 81|4.1479892549341715|
|188| 5.182550241547551|
|189| 5.182550241547551|
|374|2.4710958278221904|
|405| 6.213406538086999|
+---+------------------+



In [5]:
df_count = df_temp.count()

In [6]:
df_count

429

In [9]:
numeric_cols_temp

['comparable_rental_2_gross_sqft',
 'comparable_rental_3_estimated_gross_income']

In [11]:
df_col_rdd = df_temp.select(['rid',*numeric_cols_temp]).rdd

In [None]:
def kmeans_multivariate(df, numeric_cols, k=3, maxIterations=100):
    def addclustercols(x):
        points = np.array(x[1].toArray()).astype(float)
        center = clusters.centers[0]
        mindist = euclidean(points, center)
        c1 = 0

        for i in range(1, len(clusters.centers)):
            center = clusters.centers[i]
            dist = euclidean(points, center)
            if dist < mindist:
                c1 = i
                mindist = dist
        return (int(x[0]), int(c1), float(mindist))

    cols = ['rid']
    cols.extend(numeric_cols)
    df_col_rdd = df[cols].rdd
    label = df_col_rdd.map(lambda x: x[0])
    vso = df_col_rdd.map(lambda x: np.array(x[1:]).astype(float))
    scaler = StandardScaler(withMean=True, withStd=True).fit(vso)
    vso = scaler.transform(vso)

    clusters = KMeans.train(vso, k, initializationMode='random', maxIterations=maxIterations)
    df_col_rdd = label.zip(vso).toDF().rdd
    print(df_col_rdd.collect())
    rdd_w_clusts = df_col_rdd.map(lambda x: addclustercols(x))
    cols = ['rid', 'c_no', 'dist_c']
    kmeans_df = rdd_w_clusts.toDF(cols)
    outlier_all, _ = iqr_outliers(kmeans_df.where(kmeans_df['c_no'] == 0), 'dist_c')
    for i in range(1, k):
        outlier_c, _ = iqr_outliers(kmeans_df.where(kmeans_df['c_no'] == i), 'dist_c')
        outlier_all = outlier_all.unionAll(outlier_c)
    #print_outlier_summary(outlier_all.count(), df.count(), "kMeans (multivariate)")
    return outlier_all

In [None]:
def gmm_outliers(df, numeric_cols, k=3, maxIterations=100):
    def getDistances(x):
        clust_center = x[0]
        rid = x[1][0]
        point = np.array(x[1][1].toArray()).astype(float)
        dist = mahalanobis(clust_center,point,sigmas_inv[clust_center])
        return (int(rid),int(clust_center),float(dist))
    
    
    # Convert to array if only one column (univariate) passed
    if not isinstance(numeric_cols, list):
        numeric_cols = [numeric_cols]
    cols = ['rid']
    cols.extend(numeric_cols)
    df_col_rdd = df[cols].rdd
    rid = df_col_rdd.map(lambda x: x[0])
    vso = df_col_rdd.map(lambda x: np.array(x[1:]).astype(float))
    scaler = StandardScaler(withMean=True, withStd=True).fit(vso)
    vso = scaler.transform(vso)
    
    gmm = GaussianMixture.train(vso, k, initializationMode='random', maxIterations=maxIterations,seed=10)
    
    df_col_rdd = rid.zip(vso).toDF().rdd
    labels = gmm.predict(vso)
    df_col_rdd = labels.zip(df_col_rdd)
    mus = []
    sigmas = []
    sigmas_inv = []
    for i in range(k):
        mus.append(np.array(gmm.gaussians[i].mu.toArray()).astype(float))
        sigmas.append(np.array(gmm.gaussians[i].sigma.toArray()).astype(float))
        sigmas_inv.append(np.linalg.inv(sigmas[i]))
    
    #print(df_col_rdd.collect())
    rdd_w_clusts = df_col_rdd.map(lambda x: getDistances(x))
    cols = ['rid', 'c_no', 'dist_c']
    gmm_df = rdd_w_clusts.toDF(cols)
    outlier_all, _ = iqr_outliers(gmm_df.where(gmm_df['c_no'] == 0), 'dist_c')
    for i in range(1, k):
        outlier_c, _ = iqr_outliers(gmm_df.where(gmm_df['c_no'] == i), 'dist_c')
        outlier_all = outlier_all.unionAll(outlier_c)
    #print_outlier_summary(outlier_all.count(), df.count(), "kMeans (multivariate)")
    return outlier_all

In [14]:
def getDistances(x):
    clust_center = x[0]
    rid = x[1][0]
    point = np.array(x[1][1].toArray()).astype(float)
    dist = mahalanobis(clust_center,point,sigmas_inv[clust_center])
    return (int(rid),int(clust_center),float(dist))
    

In [15]:
rid = df_col_rdd.map(lambda x:x[0])
features = df_col_rdd.map(lambda x: np.array(x[1:]).astype(float))
scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)
features = scaler2.transform(features)
zipped_col = rid.zip(features)
gmm = GaussianMixture.train(features,3,seed=1337)
labels = gmm.predict(features)
mus = []
sigmas = []
sigmas_inv = []
for i in range(3):
    mus.append(np.array(gmm.gaussians[i].mu.toArray()).astype(float))
    sigmas.append(np.array(gmm.gaussians[i].sigma.toArray()).astype(float))
    sigmas_inv.append(np.linalg.inv(sigmas[i]))
final_rdd = labels.zip(zipped_col)
rdd_w_clusts = final_rdd.map(lambda x: getDistances(x))

In [16]:
rdd_w_clusts.collect()

[(0, 1, 1.6157493512284935),
 (1, 2, 5.037896947225772),
 (2, 2, 4.914496888324025),
 (3, 1, 0.480143179996048),
 (4, 1, 0.480143179996048),
 (5, 1, 1.586501215480311),
 (6, 1, 1.1377510226370742),
 (7, 0, 3.672178588786419),
 (8, 2, 3.435728758056047),
 (9, 2, 3.827615459960983),
 (10, 2, 3.3594554444903837),
 (11, 1, 0.7062173884101974),
 (12, 2, 4.766588372501243),
 (13, 2, 4.766588372501243),
 (14, 0, 2.5537540781523216),
 (15, 0, 1.4339641415852962),
 (16, 2, 5.865149131673084),
 (17, 0, 1.4867203615713465),
 (18, 2, 5.865149131673084),
 (19, 0, 3.537943968921163),
 (20, 1, 0.7583379707118146),
 (21, 2, 4.914496888324025),
 (22, 2, 4.914496888324025),
 (23, 2, 4.739445655611526),
 (24, 2, 4.044553866416615),
 (25, 2, 4.914496888324025),
 (26, 0, 3.566614831080036),
 (27, 0, 3.3120996402045684),
 (28, 2, 5.527262479094956),
 (29, 2, 4.914496888324025),
 (30, 2, 4.914496888324025),
 (31, 2, 4.914496888324025),
 (32, 0, 3.3464604082346807),
 (33, 0, 3.5875029455217815),
 (34, 2, 4.91

In [17]:
cols = ['rid', 'c_no', 'dist_c']
gmm_df = rdd_w_clusts.toDF(cols)
outlier_all,_ = get_outliers.iqr_outliers(gmm_df.where(gmm_df['c_no'] == 0), 'dist_c')
print(outlier_all.show())
for i in range(1, 3):
    outlier_c,_ = get_outliers.iqr_outliers(gmm_df.where(gmm_df['c_no'] == i), 'dist_c')
    outlier_all = outlier_all.unionAll(outlier_c)

+---+------------------+
|rid|            dist_c|
+---+------------------+
| 55|10.314062129467214|
| 95| 7.011739220781915|
|106| 12.23565913521114|
|136| 8.902770135532926|
|168|  8.89644305691097|
|181|   7.4663798817476|
|182|45.262610980634086|
|184|13.335141499372472|
|187| 9.651443666794027|
|188|   7.4663798817476|
|189|45.262610980634086|
|238|18.699328415701416|
|283| 6.598462393680384|
|363| 8.444450840662691|
|366| 7.473797560307171|
|367|10.762929003752038|
|368| 20.60267540365328|
+---+------------------+

None


In [19]:
outlier_all.show()

+---+------------------+
|rid|            dist_c|
+---+------------------+
|141| 8.902770135532926|
|184|13.335141499372472|
|269| 9.380789986499021|
|283| 6.598462393680384|
|343|15.943823411949214|
|394| 7.272001113728344|
|398|16.633697351050817|
| 81| 3.789283301944063|
|189| 4.218616088541481|
|405| 5.355396921380845|
|  2| 4.914496888324025|
|  6|  3.98454317146752|
|  8| 3.435728758056047|
|  9| 3.827615459960983|
| 10|3.3594554444903837|
| 12| 4.766588372501243|
| 13| 4.766588372501243|
| 16| 5.865149131673084|
| 18| 5.865149131673084|
| 21| 4.914496888324025|
+---+------------------+
only showing top 20 rows



In [28]:
outlier_all.count()

128

In [16]:
gmm_df.where(gmm_df['c_no'] == 0).show()

+---+----+------------------+
|rid|c_no|            dist_c|
+---+----+------------------+
|  1|   0|0.8117517210082208|
|  2|   0|0.7334522817581633|
|  8|   0|2.1700238041536197|
|  9|   0| 1.011470942919684|
| 10|   0|1.9212433746789441|
| 11|   0| 2.737360322556417|
| 13|   0|0.6546418765833678|
| 14|   0| 1.652383194679387|
| 23|   0|0.7316090442932575|
| 24|   0|0.7334522817581633|
| 25|   0| 1.612225971612107|
| 28|   0|1.0982602132588162|
| 29|   0|0.7334522817581633|
| 31|   0|0.7334522817581633|
| 33|   0|1.4648307464931751|
| 34|   0|0.7334522817581633|
| 35|   0|0.7334522817581633|
| 36|   0|0.7170991967175138|
| 40|   0|1.3364845162594916|
| 41|   0|1.1491516756768534|
+---+----+------------------+
only showing top 20 rows



In [97]:
outlier_all.count()

23

In [101]:
outlier_all.show()

+---+------------------+------------------+
|rid|            dist_c|            dist_c|
+---+------------------+------------------+
|  5|0.8181668075756773|0.8181668075756773|
| 58|0.4197031162934129|0.4197031162934129|
| 93| 2.955095923872292| 2.955095923872292|
|116|1.6243447061841065|1.6243447061841065|
|159| 3.732245738548085| 3.732245738548085|
|214|0.8811070218418926|0.8811070218418926|
|215|0.8811070218418926|0.8811070218418926|
|220| 2.522640762227058| 2.522640762227058|
|228|0.9198827528155038|0.9198827528155038|
|279| 2.125132118809308| 2.125132118809308|
|285|1.1357407816111917|1.1357407816111917|
|340| 2.361567562202725| 2.361567562202725|
|348| 2.155426764956365| 2.155426764956365|
|389|0.8942177110847539|0.8942177110847539|
|420|1.2516598756087944|1.2516598756087944|
|425|1.1725064364430433|1.1725064364430433|
|428|1.2087188464618732|1.2087188464618732|
|451|0.9531737413850345|0.9531737413850345|
|452|3.5158242168139813|3.5158242168139813|
|457|  1.48224191173903|  1.4822

In [89]:
final_rdd = labels.zip(zipped_col)

In [36]:
labels.take(5)

[2, 0, 0, 0, 0]

In [30]:
mus = []
sigmas = []

In [31]:
for i in range(3):
    mus.append(np.array(gmm.gaussians[i].mu.toArray()).astype(float))
    sigmas.append(np.array(gmm.gaussians[i].sigma.toArray()).astype(float))

In [32]:
mus

[array([0.44592339, 0.43449696]),
 array([-0.46776086, -0.35164155]),
 array([3.15045825, 1.74866176])]

In [33]:
sigmas

[array([[0.47591458, 0.07829796],
        [0.07829796, 0.74783396]]), array([[0.06576417, 0.03679376],
        [0.03679376, 0.07849692]]), array([[ 2.13991533, -1.64727693],
        [-1.64727693,  8.41836254]])]

In [119]:
zipped_col.take(5)

[(0, DenseVector([3.2302, 0.1908])),
 (1, DenseVector([-0.8191, 0.402])),
 (2, DenseVector([0.3915, 1.683])),
 (3, DenseVector([0.3915, 1.683])),
 (4, DenseVector([0.3915, 1.683]))]

In [120]:
zipped_col = zipped_col.map(lambda x:x[0]+1)

In [121]:
zipped_col.take(5)

[1, 2, 3, 4, 5]

In [82]:
features = features.map(lambda x:x.tolist())

In [40]:
spark.stop()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:57058)
Traceback (most recent call last):
  File "/Users/apple/spark-2.3.0-bin-hadoop2.7/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/apple/spark-2.3.0-bin-hadoop2.7/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 61] Connection refused


In [88]:
rid = rid.collect()

In [89]:
features.toDF().withColumn('rid',rid)

AssertionError: col should be Column

In [None]:
outliers_all = 