In [1]:
import numpy as np
import pandas as pd

# Decisions we make are based on the quality of data we have. 

# Median-based anomaly detection: a point is an outlier when its absolute
# distance from the median is larger than a fixed threshold.
x = pd.Series([2.3,2.1,4.5,2.5,2.4])
median = np.median(x)  # 2.4 for this sample
threshold = 2

# Keep every value that lies more than `threshold` away from the median.
outliers = [value for value in x if abs(value - median) > threshold]

print(outliers) 

[4.5]


In [71]:
# Mean-based anomaly detection: any value further than one standard
# deviation away from the mean is reported as an outlier.

x = pd.Series([2.3,2.1,4.5,2.5,2.4])
mean = np.mean(x)  # 2.76
std = np.std(x)    # about 0.88
lower_bound = mean - std  # about 1.88
upper_bound = mean + std  # about 3.64

# Two-sided check: below the lower bound or above the upper bound.
outliers = [value for value in x if value < lower_bound or value > upper_bound]

outliers


[4.5]

In [72]:
# Z-score based anomaly detection


def zscore_outliers(values, threshold=1.5):
    """Return the items of ``values`` whose absolute z-score exceeds ``threshold``.

    The z-score of an item is ``(item - mean) / std`` computed over ``values``
    with ``np.mean`` and ``np.std``. The absolute value is compared so that
    unusually low items are flagged as well as unusually high ones.

    Parameters
    ----------
    values : sequence of numbers (list, ndarray or pandas Series)
    threshold : float, default 1.5
        Number of standard deviations beyond which an item is an outlier.

    Returns
    -------
    list
        The outlying items, in their original order. Empty when ``values``
        is empty or when every item is identical (standard deviation of 0).
    """
    if len(values) == 0:
        return []
    center = np.mean(values)
    spread = np.std(values)
    if spread == 0:
        # All items are equal: no z-score is defined and nothing can stand out.
        return []
    return [item for item in values if abs((item - center) / spread) > threshold]


x = pd.Series([2.3,2.1,4.5,2.5,2.4])
mean = np.mean(x) # Mean = 2.7600
std = np.std(x) # Standard Deviation = 0.88
outliers = zscore_outliers(x)

outliers

[4.5]

In [74]:
# Interquartile range (IQR) anomaly detection: a value is an outlier when it
# falls below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.

x = pd.Series([2.3, 2.2, 4.5, 2.1, 2.5])
Q1, Q3 = np.percentile(x,[25,75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Collect the values lying outside the two fences.
outliers = [value for value in x if value < lower_fence or value > upper_fence]

outliers

[4.5]

In [113]:
# Dealing with missing values
import numpy as np
import pandas as pd
# NaN ("Not a Number") is the floating-point marker used here for a missing age.
# It has to be written as np.nan: a bare `nan` is not defined by the imports
# in this cell and raises NameError on a fresh kernel.
data1 = {'Name': ['Edison', 'Edward', 'James', 'Neesham', 'Stuart'], 'Age': [28,27, np.nan, 36, 27]}

In [114]:
# Turn the column-oriented dict into a DataFrame: each key becomes a column.
data = pd.DataFrame.from_dict(data1)
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,
3,Neesham,36.0
4,Stuart,27.0


In [115]:
# Element-wise missing-value mask: True wherever a cell of the DataFrame is missing.
data.isna()

Unnamed: 0,Name,Age
0,False,False
1,False,False
2,False,True
3,False,False
4,False,False


In [116]:
# Most frequent age. mode() returns a Series because several values can tie.
data.loc[:, 'Age'].mode()

0    27.0
Name: Age, dtype: float64

In [None]:
# Two common imputation strategies for the missing ages. Both are computed from
# the column as it is now (before anything is overwritten) so each one really
# shows its own replacement value.
# mode() returns a Series (ties are possible), so [0] picks the first mode.
age_filled_with_mode = data['Age'].fillna(data['Age'].mode()[0])  # fillna using mode
# The mean must come from the 'Age' column itself: data.mean() spans the whole
# frame (including the text column 'Name') and returns a Series indexed by
# column name, which cannot fill row values.
age_filled_with_mean = data['Age'].fillna(data['Age'].mean())     # fillna using mean

# Keep the mode-based result in the DataFrame. Assigning the column back is used
# instead of fillna(..., inplace=True) on data['Age'], because the in-place call
# acts on an intermediate object (chained assignment) and is not guaranteed to
# update `data`.
data['Age'] = age_filled_with_mode
data

In [118]:
# Regular Expression
import re

# Collect every occurrence of the literal 'Python' in the sentence and count them.
txt = 'I am learning Python. I am learning Data Science using Python.'
python_pattern = re.compile('Python')
x = python_pattern.findall(txt)
print(x)
len(x)

['Python', 'Python']


2

In [148]:
# A leading ^ anchors the pattern to the start of the string, so only a
# 'Python' at the very beginning is reported; the later occurrence is ignored.

txt = 'Python I am practicing Python'
anchored_pattern = re.compile('^Python')
x = anchored_pattern.findall(txt)
print(x)

['Python']


In [151]:
# Finding numbers in a string: \d matches one digit; adding + (one or more)
# returns each run of consecutive digits as a single string.

txt = 'Python was launched in 1991.'
re.compile(r"\d").findall(txt)   # ['1', '9', '9', '1'] (not shown: only the last expression is displayed)
re.compile(r"\d+").findall(txt)
# The r prefix makes a raw string literal: backslashes are kept as written, so
# \d reaches the regex engine unchanged. It is unrelated to f-strings, which
# are for formatting.

['1991']

In [123]:
# Country names held in a pandas Series; 'India' appears twice.
txt_list = ['India', 'China', 'Germany', 'New Zealand', 'India']
txt = pd.Series(data=txt_list)
txt

0          India
1          China
2        Germany
3    New Zealand
4          India
dtype: object

In [122]:
# The regex is run on text, so the Series is first rendered as one string.
re.compile('India').findall(txt.to_string())       # convert series to string

['India', 'India']

In [128]:
# re.search scans the string and returns a Match object for the first
# occurrence of the pattern (or None when there is no match).
txt = 'Hello World'
world_pattern = re.compile('World')
match_object = world_pattern.search(txt)
match_object

<re.Match object; span=(6, 11), match='World'>

In [129]:
# span() gives the (start, end) index pair of the matched text; group 0 is the
# whole match, and the end index is exclusive.
match_object.span(0)

(6, 11)

In [131]:
# re.sub returns a new string in which every match of the pattern is replaced;
# txt itself is left unchanged.
txt = 'C is my favourite programming language.'
re.sub("C", "Python", txt)

'Python is my favourite programming language.'

In [143]:
# Small two-feature table (Age, Salary) used by the feature-scaling cells.
mylist = [[28, 10000],[27,15000],[30,11000],[36,11000],[27,13000]]
df = pd.DataFrame(data=mylist, columns=['Age', 'Salary'])
df

Unnamed: 0,Age,Salary
0,28,10000
1,27,15000
2,30,11000
3,36,11000
4,27,13000


In [144]:
# Feature scaling: min-max normalization
#
# normalized value = (original value - column minimum) / (column maximum - column minimum)
# Each column is rescaled independently into the range [0, 1].
# NOTE: the result is bound to `df` again, so the unscaled values are no longer
# available under that name after this cell runs.
col_min = df.min()
col_range = df.max() - col_min
df = (df - col_min) / col_range
df

Unnamed: 0,Age,Salary
0,0.111111,0.0
1,0.0,1.0
2,0.333333,0.2
3,1.0,0.2
4,0.0,0.6


In [147]:
# Standardization: for every column, subtract that column's mean from each
# value and divide the result by that column's standard deviation.
# standardized value = (original value - mean) / standard deviation
# NOTE: DataFrame.std() uses the sample standard deviation (ddof=1) by default.
# NOTE: `df` here is whatever the previously executed cell left bound to that name.

col_mean = df.mean()
col_std = df.std()
df = (df - col_mean) / col_std
df

Unnamed: 0,Age,Salary
0,-0.423109,-1.0
1,-0.687552,1.5
2,0.105777,-0.5
3,1.692435,-0.5
4,-0.687552,0.5
