# Outlier Detection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from scipy import stats

In [2]:
# Read the two cleaned CSV exports into DataFrames.
individual_csv = "dataset/CleanedIndividualCases.csv"
indi_data = pd.read_csv(individual_csv)
locations_csv = "dataset/CleanedLocations.csv"
loc_data = pd.read_csv(locations_csv)

FileNotFoundError: [Errno 2] File dataset/final_dataset.csv does not exist: 'dataset/final_dataset.csv'

In [None]:
# Strip hyphens from the location column names (e.g. so they can be used
# as attribute names). regex=False pins the literal-substring behaviour
# instead of relying on the pandas-version-dependent default.
loc_data.columns = loc_data.columns.str.replace('-', '', regex=False)

In [None]:
# Preview the first rows of the individual-cases DataFrame.
indi_data.head()

In [None]:
# Preview the first rows of the location DataFrame.
loc_data.head()

## Detecting Outliers in Individual Dataset

In [None]:

# Correlate only the numeric columns: the frame also carries text columns
# (province, country, ...) and corr() raises on those in recent pandas,
# while older releases silently ignored them.
numeric_indi_data = indi_data.select_dtypes(include='number')
indi_data_correlation = numeric_indi_data.corr()
sns.heatmap(indi_data_correlation, annot = True)

### The correlation coefficient between age and latitude is 0.079 and between age and longitude is 0.011. This indicates that location has a weak relation with age in the given dataset.

In [None]:
# Scatter latitude (x axis) against longitude (y axis) for every case.
fig, ax = plt.subplots(figsize=(10, 5))
lat_values = indi_data['latitude']
lon_values = indi_data['longitude']
ax.scatter(lat_values, lon_values)
ax.set(xlabel='Latitude', ylabel='Longitude')
plt.show()

### Latitude and Longitude do not have significant outliers

In [None]:
# Box plot of the age column of the individual dataset.
sns.boxplot(x=indi_data['age'])

### The age attribute has an extreme outlier value of 121 years; ages in the range of 101-106 years are also considered outliers for this attribute

In [None]:
# Print the column summary of the individual dataset.
indi_data.info()

## Detecting Outliers in Location Dataset

In [None]:
# Box plots for the coordinate columns of the location dataset.
coordinate_columns = ['Lat', 'Long_']
df = pd.DataFrame(loc_data, columns=coordinate_columns)
df.boxplot()

### There are no outliers in the latitude and longitude data columns

In [None]:
# Box plots for the case-count columns and the incidence rate.
count_columns = ['Confirmed', 'Recovered', 'Active', 'Incidence_Rate']
df = pd.DataFrame(loc_data, columns=count_columns)
df.boxplot()

### Confirmed, Active features and Recovered cases contain outliers whereas incidence_rate does not have extreme values(outliers).

In [None]:
# Box plot for the Deaths column on its own.
death_columns = ['Deaths']
df = pd.DataFrame(loc_data, columns=death_columns)
df.boxplot()

### Death attribute has a few adverse outliers

In [None]:
# Box plot for the CaseFatality_Ratio column on its own.
fatality_columns = ['CaseFatality_Ratio']
df = pd.DataFrame(loc_data, columns=fatality_columns)
df.boxplot()

### The boxplot for Case-Fatality_Ratio shows an outlier above the value of 100, which would mean that the number of deaths exceeds the number of confirmed cases; this is not possible.


## Dealing With Outliers: Finding Actual Values

In [None]:
# Keep only the columns that remain after dropping the text/date ones.
loc_text_columns = [
    'Province_State',
    'Country_Region',
    'Last_Update',
    'Combined_Key',
]
sub_loc_data = loc_data.drop(columns=loc_text_columns)

In [None]:
# Keep only the columns that remain after dropping the text/date ones.
indi_text_columns = [
    'province',
    'country',
    'date_confirmation',
    'outcome',
]
sub_indi_data = indi_data.drop(columns=indi_text_columns)

### Z-score Calculation

In [None]:
outliers = []


def z_outlier(data, threshold=0, negativeAllowed=0):
    """Collect the values of *data* whose z-score is too large.

    A value ``x`` is flagged when ``abs((x - mean) / std) > k``, where the
    mean and (population) standard deviation are computed over the non-NaN
    entries of *data*.

    Args:
        data: Iterable of numbers (list, NumPy array, pandas Series, ...).
        threshold: z-score cutoff ``k``. Any non-positive value, including
            the default ``0``, selects the conventional ``k = 3``.
        negativeAllowed: When truthy, the z-score test is skipped and every
            value below zero is collected instead.

    Returns:
        A ``(outliers, bound)`` tuple. ``outliers`` is a list holding the
        flagged items of *data* in their original order; ``bound`` is the
        upper cutoff in the units of the data, ``mean + k * std`` (NaN when
        *data* has no usable values).

    Side effects:
        Rebinds the module-level ``outliers`` name to the returned list.
    """
    global outliers

    # A non-positive cutoff (including the default 0) means "use 3 sigma".
    cutoff = threshold if threshold and threshold > 0 else 3

    values = np.asarray(data, dtype=float)
    if values.size == 0 or np.isnan(values).all():
        # Nothing to measure against: no outliers and no meaningful bound.
        outliers = []
        return outliers, float("nan")

    m = np.nanmean(values)
    sd = np.nanstd(values)

    if negativeAllowed:
        flagged = [i for i in data if i < 0]
    elif sd == 0:
        # All values are identical, so none of them can be an outlier
        # (and dividing by sd would be undefined).
        flagged = []
    else:
        # NaN entries yield a NaN z-score, which never compares greater.
        flagged = [i for i in data if np.abs((i - m) / sd) > cutoff]

    outliers = flagged
    return outliers, m + cutoff * sd
    
    


### For 'Age' Feature

In [None]:
# Run the outlier scan on the age column and print what it returned.
outliers_age, threshold_age = None, 0
age_column = sub_indi_data['age']
outliers_age, threshold_age = z_outlier(age_column)
print(outliers_age, "\n", threshold_age)

### Here the 'age' column has two outliers, namely 120 and 121 years. Even though these values are isolated from the rest of the data, we cannot entirely dismiss the possibility of this occurrence. There might be people in this age range, and thus the values cannot be filtered out as unrealistic

### For 'Latitude' Feature (Individual Dataset)

In [None]:
# Run the outlier scan on the latitude column of the individual dataset.
outliers_latitude, threshold_latitude = None, 0
latitude_column = sub_indi_data['latitude']
outliers_latitude, threshold_latitude = z_outlier(latitude_column)
print(outliers_latitude, "\n", threshold_latitude)

### For 'Longitude' Feature (Individual Dataset) 

In [None]:
# Run the outlier scan on the longitude column of the individual dataset.
# (This cell must read 'longitude'; scanning 'latitude' here would just
# repeat the previous section's result.)
outliers_longitude=None
threshold_longitude=0
outliers_longitude,threshold_longitude=z_outlier(sub_indi_data['longitude'])
print(outliers_longitude,"\n",threshold_longitude)

### We can conclude that the Individual Data does not have any numeric outliers that need to be subjected to filtering

### For 'Confirmed' Feature

In [None]:
# Run the outlier scan on the Confirmed column of the location dataset.
outliers_confirmed, threshold_confirmed = None, 0
confirmed_column = sub_loc_data['Confirmed']
outliers_confirmed, threshold_confirmed = z_outlier(confirmed_column)
print(outliers_confirmed, "\n", threshold_confirmed)

### We infer that the Confirmed Cases have outliers as shown above. We do not intend to remove or filter out these outliers from the dataset as they can possibly be correct values and this is subject to further sampling/research to confirm their accuracy.

### For 'Active' Feature

In [None]:
# Run the outlier scan on the Active column of the location dataset.
outliers_active, threshold_active = None, 0
active_column = sub_loc_data['Active']
outliers_active, threshold_active = z_outlier(active_column)
print(outliers_active, "\n", threshold_active)

### As we can see, the outliers in 'Active' Column contain some negative values and some values that are higher than any value in the confirmed cases column. Both of these scenarios are not possible, thus it is imperative to clean the dataset and get rid of these outliers

### For 'Recovered' Feature

In [None]:
# Run the outlier scan on the Recovered column of the location dataset.
outliers_recovered, threshold_recovered = None, 0
recovered_column = sub_loc_data['Recovered']
outliers_recovered, threshold_recovered = z_outlier(recovered_column)
print(outliers_recovered, "\n", threshold_recovered)

### We notice that among all the outliers in the Recovered column, the value 2577446 is very extreme and not possible, as it is greater than any value in the Confirmed Cases. The rest of the values are still plausible, and thus we remove only this one extreme recovered number.

### For 'Deaths' Feature

In [None]:
# Run the outlier scan on the Deaths column of the location dataset.
outliers_death, threshold_death = None, 0
deaths_column = sub_loc_data['Deaths']
outliers_death, threshold_death = z_outlier(deaths_column)
print(outliers_death, "\n", threshold_death)

### Even though the Deaths column has a few outliers, we do not filter these values out because they could very well represent certain areas which were adversely hit by the pandemic and witnessed high death rates

In [None]:
# Run the outlier scan on the CaseFatality_Ratio column.
outliers_fatality, threshold_fatality = None, 0
fatality_column = sub_loc_data['CaseFatality_Ratio']
outliers_fatality, threshold_fatality = z_outlier(fatality_column)
print(outliers_fatality, "\n", threshold_fatality)

### The Case-Fatality Ratio has a number of outliers but there are some that are completely unrealistic. As the ratio= deaths/confirmed*100, this value can never surpass 100. Thus, values above and also near 100, need to be removed from the column.

In [None]:
# Run the outlier scan on the Lat column of the location dataset.
outliers_lat, threshold_lat = None, 0
lat_column = sub_loc_data['Lat']
outliers_lat, threshold_lat = z_outlier(lat_column)
print(outliers_lat, "\n", threshold_lat)

## 3) Applying Outlier Removal Strategy to Original Location Dataset

### Removing Outliers From 'Active' Feature of Location Dataset

In [None]:
# Index labels of the rows whose Active value is one of the flagged outliers.
# Selecting by mask keeps matching rows even when another column is NaN;
# where(...).dropna() would silently lose those rows.
loc_data.index[loc_data.Active.isin(outliers_active)]

In [None]:
# Drop the rows whose Active value was flagged. A boolean mask avoids
# pasting the Python list into a query string, which fails when the list
# holds NaN or NumPy scalars whose repr is not a plain number.
active_is_outlier = loc_data['Active'].isin(outliers_active)
rev_loc_data = loc_data[~active_is_outlier]

In [None]:
# Show the location data after the Active outliers were removed.
print(rev_loc_data)

In [None]:
# Report row counts (shape[0]) rather than the full (rows, columns) tuple,
# so that the printed numbers match their labels.
print("Number of rows in Orginal Dataset: ",loc_data.shape[0])
print("Number of rows containing outliers: ",len(outliers_active))
print("Number of rows in Revised Dataset: ",rev_loc_data.shape[0])

## Removing Case-Fatality Ratio Outliers from Location Dataset

In [None]:
# Index labels of the rows whose CaseFatality_Ratio is a flagged outlier.
# Selecting by mask keeps matching rows even when another column is NaN;
# where(...).dropna() would silently lose those rows.
loc_data.index[loc_data.CaseFatality_Ratio.isin(outliers_fatality)]

In [None]:
# Drop the rows whose CaseFatality_Ratio was flagged. A boolean mask avoids
# pasting the Python list into a query string, which fails when the list
# holds NaN or NumPy scalars whose repr is not a plain number.
fatality_is_outlier = rev_loc_data['CaseFatality_Ratio'].isin(outliers_fatality)
rev_loc_data = rev_loc_data[~fatality_is_outlier]
print(rev_loc_data)

In [None]:
# Derive the removed-row count from the frames themselves: a row flagged in
# both columns is dropped only once, so adding the lengths of the two
# outlier lists would overcount. Row counts use shape[0], not the tuple.
removed_rows = loc_data.shape[0] - rev_loc_data.shape[0]
print("Number of rows in Orginal Dataset: ",loc_data.shape[0])
print("Number of rows containing outliers: ",removed_rows)
print("Number of rows in Revised Dataset: ",rev_loc_data.shape[0])

### The final revised dataset has fewer rows as compared to original dataset, after removal of necessary outliers

In [None]:
# Write the revised location data to CSV, leaving out the index column.
rev_loc_data.to_csv("dataset/revised_location_data.csv", index=False)