https://www.kaggle.com/muhammete/week-23-homework

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input directory (in os.walk order)
# and print its full path, one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the weather CSV into `data` and do all further work on the copy `df`,
# so the frame as loaded stays available under `data`.
weather_csv_path = '../input/weather-dataset-rattle-package/weatherAUS.csv'
data = pd.read_csv(weather_csv_path)
df = data.copy()

In [None]:
# Dimensions of the working copy as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics of the numeric columns, transposed so that each
# variable is a row and each statistic is a column.
df.describe().transpose()

# Missing Values

**1. Missing values analysis**

Variables with missing values

Percentage of missing values for each variable as a dataframe

Drop missing values in RainToday and RainTomorrow variables

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# The 15 columns with the most missing entries, largest count first.
miss = df.isna().sum().sort_values(ascending=False).iloc[:15]

# The same counts expressed as a percentage of all rows.
miss_per = miss.div(len(df)).mul(100)

# Counts and percentages side by side, indexed by column name.
pd.DataFrame({'No. missing values': miss, '% of missing data': miss_per.to_numpy()})

In [None]:
# Remove, in place, every row that lacks a value in either rain indicator.
rain_columns = ['RainTomorrow', 'RainToday']
df.dropna(subset=rain_columns, inplace=True)

In [None]:
# Missing entries per column after the rows without rain indicators were dropped.
df.isna().sum()

# 2. Multivariate imputation with (a) selected algorithm(s) and label encoding

Tip: Consider dropping categorical variables with too many categories if the imputation takes too long to run.

Tip: Do not forget to encode categorical variables.

Tip: You will get an error if the data contains string or date columns. All values must be integers or floats.

In [None]:
# Preview the first rows to see which columns hold text or dates and
# therefore need encoding or dropping before imputation.
df.head()

In [None]:
# Convert the Date column from text to datetime64.
# The format string has to match the text exactly: weatherAUS.csv stores dates
# in ISO form with dashes (e.g. 2008-12-01), not slashes. The previous
# '%Y/%m/%d' only worked through lenient ISO parsing in older pandas and is
# rejected when the format is matched strictly.
# NOTE(review): confirm the separator against the CSV if the dataset version changes.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [None]:
# Column dtypes and non-null counts; shows which columns are still
# object-typed after the Date conversion.
df.info()

In [None]:
# Isolate the object-dtype (text) columns and count the distinct values in each.
categories_df = df.select_dtypes(include=['object']).copy()

unique_val = [categories_df[name].nunique() for name in categories_df.columns]

# One row per text column with its number of distinct values.
pd.DataFrame({"No. of unique values": unique_val}, index=categories_df.columns)

# Multivariate Imputation

In [None]:
# IterativeImputer is experimental: the enabling import has to run before it
# can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

df_multi_imputation = df.copy()

# Columns left out of the imputation input, and the two rain indicators that
# are dummy-encoded (first level dropped, so one indicator column each).
dropped_columns = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Date']
rain_indicators = ['RainToday', 'RainTomorrow']

X = pd.get_dummies(
    df_multi_imputation.drop(columns=dropped_columns),
    columns=rain_indicators,
    prefix=rain_indicators,
    drop_first=True,
)

# Kept so the imputer's array output can be turned back into a labelled frame.
index, columns = X.index, X.columns

In [None]:
# Fit the iterative imputer on X, then use it to fill X's missing values.
imp_mean = IterativeImputer(random_state=0)
X_imputed = imp_mean.fit(X).transform(X)

# Restore the row labels and column names on the imputer's array output.
df_imputed_bayesian = pd.DataFrame(data=X_imputed, columns=columns, index=index)

# Per-column count of values that are still missing after imputation.
df_imputed_bayesian.isna().sum()

# 3. Outliers

Apply Z-score method to detect outliers

Apply IQR method/boxplot visualization for selected variables to detect outliers

Apply Isolation Forest and Local Outlier Factor methods to detect outliers

Tip: Consider standardization based on the algorithm that you use.

Decide on a threshold observation to replace outliers

Replace the outliers with the threshold observation

**Z-score**

In [None]:
from scipy import stats

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df_numeric = df.select_dtypes(include=numerics)

# `df` is the un-imputed frame, so these columns can still contain NaN.
# With the default nan_policy='propagate' one NaN makes every z-score of that
# column NaN, `z > threshold` is then False for the whole column and its
# outliers are silently missed. 'omit' computes mean and std from the observed
# values only; rows that are NaN stay NaN and never count as outliers.
z = np.abs(stats.zscore(df_numeric.to_numpy(dtype=float), nan_policy='omit'))
threshold = 3

# Row and column positions (within df_numeric) of every |z| above the threshold.
outlier_rows, outlier_cols = np.where(z > threshold)
print('list of outliers:', '\n', outlier_rows, '\n', outlier_cols)
print('number of outliers:', outlier_rows.shape)

**IQR**

In [None]:
# Interquartile range (Q3 - Q1) of every numeric column.
Q1, Q3 = (df_numeric.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(IQR)

In [None]:
# Example: IQR fences for MaxTemp (quartiles -/+ 1.5 * IQR).
max_temp = df['MaxTemp']
Q1_MaxTemp = max_temp.quantile(0.25)
Q3_MaxTemp = max_temp.quantile(0.75)
IQR_MaxTemp = Q3_MaxTemp - Q1_MaxTemp
whisker_MaxTemp = 1.5 * IQR_MaxTemp
upper_MaxTemp = Q3_MaxTemp + whisker_MaxTemp
lower_MaxTemp = Q1_MaxTemp - whisker_MaxTemp

# Print one "name = value" line per statistic, preceded by a blank line.
maxtemp_summary = {
    'Q1_MaxTemp': Q1_MaxTemp,
    'Q3_MaxTemp': Q3_MaxTemp,
    'IQR_MaxTemp': IQR_MaxTemp,
    'upper_MaxTemp': upper_MaxTemp,
    'lower_MaxTemp': lower_MaxTemp,
}
print("\n" + "".join(f"{label} = {value}\n" for label, value in maxtemp_summary.items()))

In [None]:
# Shape of the MaxTemp values that fall below the lower IQR fence.
df.loc[df['MaxTemp'] < lower_MaxTemp, 'MaxTemp'].shape

In [None]:
# Shape of the MaxTemp values that lie above the upper IQR fence.
df.loc[df['MaxTemp'] > upper_MaxTemp, 'MaxTemp'].shape

In [None]:
import plotly.express as px

# Box plot of MaxTemp from the working frame.
fig = px.box(data_frame=df, y="MaxTemp")
fig.show()

In [None]:
# Disabled draft (kept commented out, not executed): repeat the MaxTemp IQR
# example above for every numeric column.
# Columns are selected with df[a]; attribute access (df.a) would look for a
# column literally named "a" instead of using the loop variable.
# for a in df_numeric.columns:
#     Q1_a = df[a].quantile(0.25)
#     Q3_a = df[a].quantile(0.75)
#     IQR_a = Q3_a - Q1_a
#     upper_a = Q3_a + 1.5 * IQR_a
#     lower_a = Q1_a - 1.5 * IQR_a
#     print(f"""
#     Q1_{a} = {Q1_a}
#     Q3_{a} = {Q3_a}
#     IQR_{a} = {IQR_a}
#     upper_{a} = {upper_a}
#     lower_{a} = {lower_a}
#     """)

**Isolation Forest**

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

df_isolation_forest = df_imputed_bayesian.copy()

# MaxTemp is separated from the remaining columns; the forest is fitted on
# the remaining columns only.
y_iso = df_isolation_forest['MaxTemp']
X_iso = df_isolation_forest.drop(['MaxTemp'], axis=1)

clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.10, random_state=42)
clf.fit(X_iso)
y_pred = clf.predict(X_iso)

# The model predicts an inlier with a label of +1 and an outlier with a label of -1.
# Reuse y_pred for the mask instead of running predict over the whole frame a second time.
outliers_values = X_iso[y_pred == -1]
outliers_values.shape

**Local Outlier Factor**

In [None]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler

df_lof = df_imputed_bayesian.copy()

scaler = MinMaxScaler()

columns = df_lof.columns

# Scale every column with MinMaxScaler before the neighbour search.
# fit_transform returns a plain array, so wrap THAT array (not df_lof) back
# into a frame; wrapping df_lof again would silently discard the scaling and
# run LOF on the raw units. The original row labels are kept.
df_lof_scaled = pd.DataFrame(
    scaler.fit_transform(df_lof),
    index=df_lof.index,
    columns=columns,
)

y = df_lof_scaled['MaxTemp']
X = df_lof_scaled.drop(['MaxTemp'], axis=1)

# fit the model for outlier detection (default)
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)

# use fit_predict to compute the predicted labels of the training samples
# (when LOF is used for outlier detection, the estimator has no predict,
# decision_function and score_samples methods).
y_pred = clf.fit_predict(X)

# One score per row of X, in row order.
X_scores = clf.negative_outlier_factor_

# The 100 lowest scores.
np.sort(X_scores)[:100]

In [None]:
# Position of the cut-off in the ascending score list: the share of rows given
# by the LOF model's `contamination` setting, rounded down. Deriving it from
# the data replaces a hard-coded index that was only valid for one row count.
n_flagged = int(len(X_scores) * clf.contamination)

# Score of the first observation after the n_flagged lowest-scoring ones.
threshold = np.sort(X_scores)[n_flagged]
threshold

In [None]:
# Shape of the rows whose LOF score is not strictly above the threshold.
X[~(X_scores > threshold)].shape

# Deleting observations based on the LOF threshold

In [None]:
# Keep only the observations whose LOF score is strictly above the threshold,
# then print the frame shape before and after the deletion.
score_above_threshold = X_scores > threshold
df_numeric_no2 = df_imputed_bayesian[score_above_threshold]
print(df_imputed_bayesian.shape)
print(df_numeric_no2.shape)

In [None]:
# Before deleting: plot MaxTemp from the imputed frame that the filtered frame
# is derived from, so the before/after box plots differ only by the removed
# observations and not by imputation as well.
fig = px.box(df_imputed_bayesian, y="MaxTemp")
fig.show()

In [None]:
# After deleting: MaxTemp of the observations that survived the LOF filter.
fig = px.box(data_frame=df_numeric_no2, y=df_numeric_no2["MaxTemp"])
fig.show()

In [None]:
# The threshold observation: the row(s) whose LOF score equals the threshold.
df_imputed_bayesian.loc[X_scores == threshold]

In [None]:
# Boolean mask of the observations scoring strictly below the threshold.
# It is positional: X_scores has one entry per row of df_imputed_bayesian.
is_outlier = X_scores < threshold

threshold_row = df_imputed_bayesian[(X_scores == threshold)]
outliers = df_imputed_bayesian[is_outlier]

# Values written over every outlier. Several rows can tie on the threshold
# score; the first one is used, so the replacement is always a single row
# (the earlier record-array assignment raised a broadcast error on ties).
replacement_values = threshold_row.iloc[0].to_numpy()

df_numeric_no3 = df_imputed_bayesian.copy()

# Broadcast the threshold observation across all outlier rows of the copy;
# df_imputed_bayesian itself is left unchanged.
df_numeric_no3.loc[is_outlier, :] = replacement_values

# the outlier observations after replacement by the threshold observation
df_numeric_no3[is_outlier].head()

In [None]:
# MaxTemp after the outliers were replaced by the threshold observation.
fig = px.box(data_frame=df_numeric_no3, y=df_numeric_no3["MaxTemp"])
fig.show()