In [1]:
import pandas as pd
import numpy as np

# build the sample frame: every age from 20 to 30, plus one extreme value (50)
data = pd.DataFrame({'age': (*range(20, 31), 50)})
data

Unnamed: 0,age
0,20
1,21
2,22
3,23
4,24
5,25
6,26
7,27
8,28
9,29


In [2]:

# mean and standard deviation of the age column
age = data['age']
mean, std = np.mean(age), np.std(age)

# Z-score: how many standard deviations each age lies from the mean
data['Z-score'] = age.sub(mean).div(std)
data

Unnamed: 0,age,Z-score
0,20,-0.938954
1,21,-0.806396
2,22,-0.673838
3,23,-0.54128
4,24,-0.408721
5,25,-0.276163
6,26,-0.143605
7,27,-0.011047
8,28,0.121512
9,29,0.25407


In [3]:


# Z-score cut-off: a row counts as an outlier when |Z-score| exceeds this value.
# Kept in one variable so the filter and the printed message cannot disagree.
threshold = 3
# use the absolute value so unusually low ages are flagged as well as high ones
abs_z = data['Z-score'].abs()

# print the data
print('---------------------------------------')
print(f"Here is the data with Outliers:\n{data}")
print('---------------------------------------')
# print the outliers
print(f"Here are the outliers based on the Z-score threshold, {threshold}:\n{data[abs_z > threshold]}")
print("---------------------------------------")
# Remove the outliers
data = data[abs_z <= threshold]

# print the data without outliers
print(f"Here is the data without Outliers:\n{data}")



---------------------------------------
Here is the data with Outliers:
    age   Z-score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628
11   50  3.037793
---------------------------------------
Here are the outliers based on the Z-score threshold, 2:
    age   Z-score
11   50  3.037793
---------------------------------------
Here is the data without Outliers:
    age   Z-score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628


# Detect outliers using SciPy

In [4]:
# import libraries
import pandas as pd
from scipy import stats

# sample values: eight closely spaced readings and one extreme value
data = [2.5,2.7,2.9,3.1,3.3,3.5,3.7,3.9,101.0]

# absolute z-score of every value, computed by scipy in one call
z_score = np.abs(stats.zscore(data))

# positions whose absolute z-score exceeds the cut-off
threshold = 2.5
(outliers,) = np.where(z_score > threshold)

# report the input and what was flagged
separator = '-------------------------------------'
print(separator)
print(f"data:\n{data}")
print(separator)
print("Indices of Outliers:", outliers)
print("outliers", [data[idx] for idx in outliers])

# keep only the values whose position was not flagged
flagged = set(outliers.tolist())
data = [value for idx, value in enumerate(data) if idx not in flagged]
print(separator)
print("data without outliers", data)

-------------------------------------
data:
[2.5, 2.7, 2.9, 3.1, 3.3, 3.5, 3.7, 3.9, 101.0]
-------------------------------------
Indices of Outliers: [8]
outliers [101.0]
-------------------------------------
data without outliers [2.5, 2.7, 2.9, 3.1, 3.3, 3.5, 3.7, 3.9]


# IQR method

In [5]:
# import libraries
import pandas as pd
import numpy as np

# create the data
data = pd.DataFrame({'age' : [20,21,22,23,24,25,26,27,28,29,30,50]})

# calculate Q1 and Q3 value
# Series.quantile replaces np.percentile(..., interpolation='midpoint'): NumPy
# deprecated that keyword in favour of `method`, while pandas keeps a stable
# `interpolation` argument and yields the same midpoint quartiles.
Q1 = data['age'].quantile(0.25, interpolation='midpoint')
Q3 = data['age'].quantile(0.75, interpolation='midpoint')
# define IQR
IQR = Q3 - Q1

# define upper and lower bound (the usual 1.5 * IQR fences)
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# rows outside the fences are outliers; rows on or inside them are kept
outside_fences = (data['age'] < lower_bound) | (data['age'] > upper_bound)
inside_fences = (data['age'] >= lower_bound) & (data['age'] <= upper_bound)

# print the data
print("-------------------------------------")
print(f"Here is the data with outliers:\n{data}")
print("-------------------------------------")
print(f"Here is the outliers based on the IQR method:\n{data[outside_fences]}")
# remove the outliers
data = data[inside_fences]
print("-------------------------------------")
print(f"Here is the data without outliers\n{data}")

-------------------------------------
Here is the data with outliers:
    age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30
11   50
-------------------------------------
Here is the outliers based on the IQR method:
    age
11   50
-------------------------------------
Here is the data without outliers
    age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30


# (K-Means) Clustering Method

In [7]:
# import libraries
from sklearn.cluster import KMeans
# create the data
data = [[2,2], [3,3], [3,4], [30,30], [31,31], [32,32]]
# Create a K-Means model with two clusters (normal and outliers).
# random_state pins the centroid initialisation so a re-run gives the same result.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(data)

# predict cluster labels
labels = kmeans.predict(data)

# K-Means numbers its clusters arbitrarily, so label 1 is not guaranteed to be
# the outlier group. Pick the outlier cluster explicitly: the one with the
# fewest members. On a tie (as with this toy data, 3 vs 3) fall back to the
# cluster whose centre lies farthest from the origin -- a heuristic chosen for
# this demo, where the large-valued points are the ones meant to be outliers.
cluster_ids = range(kmeans.n_clusters)
cluster_sizes = [sum(1 for label in labels if label == k) for k in cluster_ids]
centre_norms = [sum(float(c) ** 2 for c in centre) for centre in kmeans.cluster_centers_]
outlier_label = min(cluster_ids, key=lambda k: (cluster_sizes[k], -centre_norms[k]))

# Identify outliers based on cluster labels
outliers = [point for point, label in zip(data, labels) if label == outlier_label]

# print data
print("print:", data)
print("outliers:", outliers)
# remove outliers
data = [point for point, label in zip(data, labels) if label != outlier_label]
print("data without outliers:", data)


print: [[2, 2], [3, 3], [3, 4], [30, 30], [31, 31], [32, 32]]
outliers: [[30, 30], [31, 31], [32, 32]]
data without outliers: [[2, 2], [3, 3], [3, 4]]
