In [42]:
from scipy import stats
import numpy as np
import pandas as pd
import matplotlib as plt

In [43]:
# Sample series: the integers 1..8, one deliberately planted NaN (to probe how the
# statistics below cope with missing values) and one extreme value.
data = list(range(1, 9)) + [np.nan, 10000]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
5,6.0
6,7.0
7,8.0
8,
9,10000.0


<h3>Calculating Z-Scores</h3>
<i>A z-score is the number of standard deviations by which a data point lies above or below the mean.</i>


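Unfortunately, <code>zscore</code> from <code>scipy.stats</code> returns NaN for every row here, because the NaN in the dataset propagates through the mean and standard deviation by default.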

In [44]:
# The frame still contains the deliberately planted NaN; as the output below shows,
# every z-score comes back as NaN.
stats.zscore(df)

array([[nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan]])

Calculating it manually, using DataFrame's mean and standard deviation methods.

In [45]:
# Rebuild the sample (the extreme value is 1000 here) and standardise it by hand.
# NOTE(review): the first data cell used 10000 as the extreme value - confirm which one is intended.
data = [1, 2, 3, 4, 5, 6, 7, 8, np.nan, 1000]
df = pd.DataFrame(data)
# Manual z-score instead of scipy.stats.zscore: the pandas mean/std reductions skip NaN,
# so only the NaN row itself stays NaN. ddof=0 selects the population standard deviation.
z_values = df.sub(df.mean()).div(df.std(ddof=0))
z_values

Unnamed: 0,0
0,-0.364732
1,-0.361536
2,-0.358339
3,-0.355143
4,-0.351947
5,-0.348751
6,-0.345554
7,-0.342358
8,
9,2.82836


<h5>Calculating p-Value</h5>

In [46]:
# Survival function (1 - CDF) of the standard normal, evaluated at |z|:
# the probability of a value at least this far out in one tail.
p_values = stats.norm.sf(z_values.abs())
p_values

array([[0.35765575],
       [0.35884952],
       [0.36004467],
       [0.36124119],
       [0.36243907],
       [0.3636383 ],
       [0.36483887],
       [0.36604076],
       [       nan],
       [0.00233936]])

<h5>Determining Outliers based upon a certain threshold of z-Values</h5>

In [47]:
# A point counts as an outlier when it lies more than this many standard deviations from the mean.
outlier_threshold_z_value = 1
# Compare the magnitude of each z-score so that unusually low values are counted as well as
# unusually high ones. The NaN row compares as False and is therefore never counted.
outliers = (z_values[0].abs() > outlier_threshold_z_value).sum()
print("Number of Outliers: ",outliers)

Number of Outliers:  1


<h3>Interpolating Inconsistent (Outliers/Missing) Data</h3>

In [48]:
# Mostly-missing series: only positions 0, 1, 3 and 9 carry a value.
inconsistent_data = pd.DataFrame(
    [1, 2, np.nan, 4] + [np.nan] * 5 + [10]
)

# Wrapping an existing DataFrame again yields a frame with the same contents.
inconsistent_data_df = pd.DataFrame(data=inconsistent_data)
inconsistent_data_df

Unnamed: 0,0
0,1.0
1,2.0
2,
3,4.0
4,
5,
6,
7,
8,
9,10.0


In [49]:
def interpolateNaN(data):
    """Fill NaN entries by linear interpolation over the row position.

    Parameters
    ----------
    data : list, numpy array or pandas DataFrame/Series
        1-D or 2-D numeric data. 2-D input is treated column by column, so
        values from one column never influence another.

    Returns
    -------
    numpy.ndarray
        A float array with the same shape as the input in which every NaN
        has been replaced. NaNs located before the first or after the last
        valid value of a column take that nearest valid value, because
        ``np.interp`` holds the end points constant. The input object is
        never modified.

    Raises
    ------
    ValueError
        If the data is not 1-D or 2-D, or if a column consists only of NaN
        (there is nothing to interpolate from).
    """
    # np.array copies its input, so the caller's object is left untouched.
    values = np.array(data, dtype="float")
    if values.ndim not in (1, 2):
        raise ValueError("interpolateNaN expects 1-D or 2-D data, got %d-D" % values.ndim)

    # View 1-D input as a single column so both layouts share one code path;
    # writes through this view land in `values`.
    columns = values[:, np.newaxis] if values.ndim == 1 else values
    positions = np.arange(columns.shape[0])

    for col in range(columns.shape[1]):
        column = columns[:, col]
        missing = np.isnan(column)
        if not missing.any():
            continue  # nothing to fill (also covers empty input)
        if missing.all():
            raise ValueError("cannot interpolate: column %d contains no valid values" % col)
        valid = ~missing
        # Row positions act as the x-axis: known points are (position, value) pairs.
        column[missing] = np.interp(positions[missing], positions[valid], column[valid])
    return values # returns the numpy array

# The helper accepts a Python list, NumPy array or DataFrame and hands back a NumPy array,
# which is wrapped in a DataFrame again for display.
interpolated_data = interpolateNaN(data=inconsistent_data_df)
interpolated_data_df = pd.DataFrame(data=interpolated_data)
interpolated_data_df

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
5,6.0
6,7.0
7,8.0
8,9.0
9,10.0


<h3>Remove Specific Rows in Data Frame Based on Index</h3>

In [94]:
# Ten rows; the boolean "drop" column marks every other row (starting with the first) for removal.
data_df = pd.DataFrame(
    {
        "data": list(range(1, 11)),
        "drop": [True, False] * 5,
    }
)
data_df

Unnamed: 0,data,drop
0,1,True
1,2,False
2,3,True
3,4,False
4,5,True
5,6,False
6,7,True
7,8,False
8,9,True
9,10,False


In [95]:
# Collect the rows flagged for removal, then drop them from the frame by index label.
flagged = data_df['drop'].eq(True)
df_drop = data_df[flagged]
data_df = data_df.drop(index=df_drop.index)
data_df

Unnamed: 0,data,drop
1,2,False
3,4,False
5,6,False
7,8,False
9,10,False
