# Z-Score Anomaly Detection

In [2]:
import pandas as pd
import numpy as np

In [10]:
# Sample values for the z-score demo; 4.5 sits well above the other four.
x = pd.Series(data=[2.1, 2.3, 4.5, 2.2, 2.4])

In [6]:
# Centre and spread of the sample; both feed the per-value z-score below.
mean, std = np.mean(x), np.std(x)
# Accumulator for the values that get flagged as anomalous.
outliers = list()

In [7]:
# Flag every value whose z-score magnitude exceeds 1.5 standard deviations.
# abs() makes the test two-sided, so unusually LOW values are caught as well
# as unusually high ones. The list is rebuilt from scratch rather than
# appended to, so re-running this cell cannot accumulate duplicate entries.
outliers = [item for item in x if abs((item - mean) / std) > 1.5]

print(outliers)

[4.5]


# Interquartile Range for Anomaly Detection

In [8]:
# Same sample values as in the z-score section, reused for the IQR method.
x = pd.Series(data=[2.1, 2.3, 4.5, 2.2, 2.4])

In [9]:
# First and third quartiles (25th and 75th percentiles) of the sample.
Q1 = np.percentile(x, 25)
Q3 = np.percentile(x, 75)
# Interquartile range: the width of the middle half of the data.
IQR = Q3 - Q1
# Accumulator for the values that fall outside the fences.
outliers = list()

In [11]:
# Fences at 1.5 * IQR beyond the quartiles; anything outside them is an outlier.
# The fences do not depend on the item, so they are computed once up front.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Rebuild the list from scratch so re-running this cell cannot append
# the same outliers a second time.
outliers = [item for item in x if item < lower_fence or item > upper_fence]

print(outliers)

[4.5]


# Dealing with missing values

In [12]:
# Toy records with one missing age. np.nan is used for the gap because the
# `NaN` alias was removed in NumPy 2.0, so `from numpy import NaN` fails there.
data1 = {'Name': ['Edison', 'Edward', 'James', 'Neesham'], 'Age': [28, 27, np.nan, 36]}

In [13]:
# Build the frame from the dict; each key becomes a column.
data = pd.DataFrame.from_dict(data1, orient='columns')
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,
3,Neesham,36.0


In [14]:
# Boolean mask of the frame: True wherever a value is missing.
data.isnull()

Unnamed: 0,Name,Age
0,False,False
1,False,False
2,False,True
3,False,False


In [15]:
# Summing the boolean mask counts the missing values in each column.
data.isnull().sum()

Name    0
Age     1
dtype: int64

In [16]:
# Drop every row that contains a missing value. The result is rebound to
# `data` instead of mutating with inplace=True, which is discouraged in
# pandas and makes the cell harder to reason about on re-run.
data = data.dropna()
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
3,Neesham,36.0


# Filling Missing Values by the Mean

In [17]:
# Fresh copy of the toy records for the mean-imputation example. The missing
# age is written as np.nan: the `NaN` alias no longer exists in NumPy 2.0.
data1 = {'Name': ['Edison', 'Edward', 'James', 'Neesham'], 'Age': [28, 27, np.nan, 36]}

In [18]:
# Rebuild the frame, missing age included, so it can be filled in below.
data = pd.DataFrame.from_dict(data1, orient='columns')
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,
3,Neesham,36.0


In [19]:
# Replace each missing value with the mean of its column.
# numeric_only=True restricts the mean to numeric columns: without it the
# string column 'Name' makes DataFrame.mean() warn on older pandas and raise
# TypeError on pandas 2.x. The result is rebound rather than using inplace=True.
data = data.fillna(data.mean(numeric_only=True))
data

  """Entry point for launching an IPython kernel.


Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,30.333333
3,Neesham,36.0


# Filling Missing Values by the Mode

In [20]:
# Fresh copy of the toy records for the mode-imputation example. np.nan marks
# the missing age; importing `NaN` from numpy breaks on NumPy 2.0.
data1 = {'Name': ['Edison', 'Edward', 'James', 'Neesham'], 'Age': [28, 27, np.nan, 36]}

In [21]:
# Rebuild the frame with the missing age still in place.
data = pd.DataFrame.from_dict(data1, orient='columns')
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,
3,Neesham,36.0


In [23]:
# mode() returns every value tied for most frequent. Each non-missing age
# occurs exactly once here, so all three come back (see the output below).
data['Age'].mode()

0    27.0
1    28.0
2    36.0
dtype: float64

In [24]:
# Fill the missing age with the first listed mode (27.0 in the output above).
# Assign the filled column back explicitly: calling fillna(..., inplace=True)
# on data['Age'] operates on an intermediate Series, which triggers a chained
# assignment warning on recent pandas and leaves `data` unchanged under
# Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].mode()[0])
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,27.0
3,Neesham,36.0


# Regular Expressions

In [25]:
import re

In [26]:
txt = 'Python is my favorite programming language. I love Python.'
# Collect the text of every non-overlapping occurrence of the literal 'Python'.
x = [hit.group() for hit in re.finditer('Python', txt)]
x

['Python', 'Python']

In [27]:
# Number of matches found by the previous cell.
len(x)

2

# Finding Digits using Regular Expressions

In [28]:
# Sample sentence containing a four-digit year to extract.
txt = 'Python was released in 1991.'

In [30]:
# \d matches a single digit, so each digit is returned as its own match.
# The pattern is a raw string: in a normal string literal '\d' is an invalid
# escape sequence (DeprecationWarning, and SyntaxWarning from Python 3.12).
re.findall(r'\d', txt)

['1', '9', '9', '1']

In [31]:
# \d+ matches a run of one or more digits, so the whole year is one match.
# The pattern is a raw string so the backslash reaches the regex engine
# without Python flagging '\d' as an invalid string escape.
re.findall(r'\d+', txt)

['1991']

# re.search()

In [32]:
txt = 'Hello World'
# search() scans the string and returns a Match for the first occurrence,
# or None when the pattern is absent.
match_object = re.compile('World').search(txt)
match_object

<re.Match object; span=(6, 11), match='World'>

In [33]:
# (start, end) character positions of the match inside txt.
match_object.span()

(6, 11)

# re.sub()

In [34]:
txt = 'C is my favorite programming language.'
# Replace every occurrence of the pattern 'C' with 'Python'.
# Arguments are passed positionally as (pattern, repl, string).
re.sub('C', 'Python', txt)

'Python is my favorite programming language.'

# Feature Scaling

In [35]:
# Two numeric features on very different scales, used to demonstrate scaling.
df1 = dict(
    Age=[28, 27, 30, 36, 27],
    Salary=[10000, 15000, 11000, 11000, 13000],
)

In [36]:
# Turn the dict into a frame; each key becomes a column.
df = pd.DataFrame.from_dict(df1, orient='columns')
df

Unnamed: 0,Age,Salary
0,28,10000
1,27,15000
2,30,11000
3,36,11000
4,27,13000


# Normalization

In [37]:
# Min-max scaling: shift each column by its minimum and divide by its range,
# so every column ends up spanning 0 to 1.
col_min = df.min()
col_range = df.max() - col_min
df = (df - col_min) / col_range
df

Unnamed: 0,Age,Salary
0,0.111111,0.0
1,0.0,1.0
2,0.333333,0.2
3,1.0,0.2
4,0.0,0.6


# Standardization

In [38]:
# Z-score standardisation: centre each column on its mean and scale it by its
# standard deviation. `df` here is the frame produced by the previous cell.
col_mean, col_std = df.mean(), df.std()
df = (df - col_mean) / col_std
df

Unnamed: 0,Age,Salary
0,-0.423109,-1.0
1,-0.687552,1.5
2,0.105777,-0.5
3,1.692435,-0.5
4,-0.687552,0.5


In [39]:
# Sanity check: after standardisation each column's standard deviation is 1.
df.std()

Age       1.0
Salary    1.0
dtype: float64