<a href="https://colab.research.google.com/github/siddas18/Natural-Disaster-Damage-Prediction/blob/Kartik/Data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Generic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime as dt
from math import sqrt, isnan, pi, sin, cos, atan2
import requests
import gzip
from functools import reduce
import scipy as scp
import seaborn as sns
import scipy.interpolate

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Metrics
from sklearn.metrics import mean_squared_error, r2_score


import warnings
warnings.catch_warnings()
warnings.simplefilter("ignore")

In [None]:
def get_NOAA_data():
    
    URL = "https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/"
    r = requests.get(URL)
    file_names = pd.read_html(r.text)[0]['Name']
    events_file_names = file_names[file_names.str.contains("details",na=False)]
    noaa_list = []
    
    print("Extracting CSV files...")
    for file in events_file_names:
        full_URL = URL + file
        with gzip.open(requests.get(full_URL, stream=True).raw) as f:
            noaa_list.append(pd.read_csv(f))
        
    df = pd.concat(noaa_list)
    
    print("Completed")
    return df

def pickle_source_data():
    noaa_source_df = get_NOAA_data()
    home_dir = os.getcwd()
    data_dir = os.path.join(home_dir, "Data")
    try:
        os.mkdir(data_dir)
        os.chdir(data_dir)
    except OSError:
        os.chdir(data_dir)
        for file in os.listdir():
            os.remove(file)
    noaa_source_df.to_pickle('noaa_source_data.pkl')
    os.chdir(home_dir)
    return noaa_source_df

NOAA_df = pickle_source_data()
    

Extracting CSV files...
Completed


In [3]:
NOAA_df = pd.read_pickle('/content/drive/MyDrive/noaa_source_data.pkl')

In [4]:
def replace_str2num(x):
    if type(x) == float or type(x) == int:
        return float(x)
    num = 1 if x[:-1] == '' else x[:-1]        
    if x[-1] == 'T':
        return float(num) * 1000000000000
    elif x[-1] == 'B':
        return float(num) * 1000000000
    elif x[-1] == 'M':
        return float(num) * 1000000
    elif x[-1] == 'K' or x[-1] == 'k':
        return float(num) * 1000
    elif x[-1] == 'h' or x[-1] == 'H':
        return float(num) * 100
    elif x[-1] == '?':
        return float(num)
    else:
        return float(x)

def winds(x):
    if x['MAGNITUDE_TYPE'] in ['EG', 'E', 'M', 'ES', 'MG', 'MS']:
        return x['MAGNITUDE']

def hail(x):
    if x['MAGNITUDE_TYPE'] not in ['EG', 'E', 'M', 'ES', 'MG', 'MS']:
        return x['MAGNITUDE']

def missing_swap(df, col1, col2):
    df.loc[~df[col1].isnull() & df[col2].isnull(), col2] = df.loc[~df[col1].isnull() & df[col2].isnull(), col1]
    df.loc[df[col1].isnull() & ~df[col2].isnull(), col1] = df.loc[df[col1].isnull() & ~df[col2].isnull(), col2]
    return df

rename_event_dict = {
    'TORNADOES, TSTM WIND, HAIL': 'Tornadoes, Thunderstorm Wind, Hail',
    'THUNDERSTORM WINDS LIGHTNING': 'Thunderstorm Wind, Lightning',
    'THUNDERSTORM WINDS/ FLOOD': 'Thunderstorm Wind, Flood',
    'THUNDERSTORM WINDS/FLOODING': 'Thunderstorm Wind, Flood',
    'THUNDERSTORM WIND/ TREES': 'Thunderstorm Wind, Trees',
    'THUNDERSTORM WIND/ TREE': 'Thunderstorm Wind, Trees',
    'THUNDERSTORM WINDS/HEAVY RAIN': 'Thunderstorm Wind, Heavy Rain',
    'TORNADO/WATERSPOUT': 'Tornado, Waterspout',
    'THUNDERSTORM WINDS FUNNEL CLOU': 'Thunderstorm Wind, Funnel Cloud',
    'THUNDERSTORM WINDS/FLASH FLOOD': 'Thunderstorm Wind, Flash Flood',
    'HAIL/ICY ROADS': 'Hail, Icy Roads',
    'HAIL FLOODING': 'Hail, Flood',
    'THUNDERSTORM WINDS HEAVY RAIN': 'Thunderstorm Wind, Heavy Rain',
    'Hurricane (Typhoon)': 'Hurricane'
}

timezone_dict = {
    'GST' : ['GST10'],
    'AST' : ['AST-4', 'AST'],
    'EST' : ['EST', 'EST-5', 'ESt', 'EDT'],
    'CST' : ['CST', 'CST-6', 'CSt', 'CSC', 'SCT', 'GMT', 'UNK', 'CDT'],
    'MST' : ['MST', 'MST-7', 'MDT'],
    'PST' : ['PST', 'PST-8', 'PDT'],
    'AKST': ['AKST-9'],
    'HST' : ['HST-10', 'HST'],
    'SST' : ['SST-11', 'SST']
}

def timezone_mapping(x):
    for key, val in timezone_dict.items():
        if x in val:
            return key

azimuth_mapping ={ 'N/A': ['ND', 'EE', 'TO', 'MI', 'M', 'EST', 'EAS', 'TH', 'WES'] }

def dict_mapping(x):
    for key, val in timezone_dict.items():
        if x in val:
            return key

In [5]:
# Damage variables cleaning

NOAA_df['DAMAGE_PROPERTY'] = NOAA_df.DAMAGE_PROPERTY.map(replace_str2num)

NOAA_df['DAMAGE_CROPS'] = NOAA_df.DAMAGE_CROPS.map(replace_str2num)

In [None]:
# Using data after Year 1950 and checking how many columns have NULL values

NOAA_df = NOAA_df[NOAA_df['YEAR']> 1950]

for i in NOAA_df.columns:
    if NOAA_df[i].isna().sum() > 0:
        print(NOAA_df[i].name, ":", NOAA_df[i].isna().sum(), "\n")
        print((NOAA_df[i].isna().sum()/NOAA_df.shape[0])*100, "\n")
    else:
        continue

In [7]:
# Removing inconsistencies
NOAA_df = missing_swap(NOAA_df, 'BEGIN_RANGE', 'END_RANGE')
NOAA_df = missing_swap(NOAA_df, 'BEGIN_LAT', 'END_LAT')
NOAA_df = missing_swap(NOAA_df, 'BEGIN_LON', 'END_LON')
NOAA_df = missing_swap(NOAA_df, 'BEGIN_AZIMUTH', 'END_AZIMUTH')
NOAA_df = missing_swap(NOAA_df, 'BEGIN_LOCATION', 'END_LOCATION')

In [8]:
# Removing Outliers

NOAA_df.shape

Q1 = NOAA_df.quantile(0.25)
Q3 = NOAA_df.quantile(0.75)
IQR = Q3 - Q1

NOAA_df = NOAA_df[~((NOAA_df < (Q1 - 1.5 * IQR)) |(NOAA_df > (Q3 + 1.5 * IQR))).any(axis=1)]

NOAA_df.shape

(1198875, 51)

In [9]:
# calculating distance w.r.t Latitude and Longitude

def geo_distance(x):
    # Source : https://en.wikipedia.org/wiki/Haversine_formula
    p = pi/180
    lat1 = x['BEGIN_LAT']
    lat2 = x['END_LAT']
    lon1 = x['BEGIN_LON']
    lon2 = x['END_LON']
    R = 6371
    dLat = p * (lat2-lat1)
    dLon = p * (lon2-lon1)
    a = sin(dLat/2) * 2 + cos(p*lat1) * cos(p*lat2) * sin(dLon/2) * 2
    if a < 0 :
      return a
    else:
      return 2 * R * atan2(sqrt(a), sqrt(1-a))

NOAA_df['GEO_DISTANCE'] = NOAA_df.apply(lambda x: geo_distance(x), axis=1)

In [10]:
# NOAA_df['DURATION_OF_STORM'] = (pd.to_datetime(NOAA_df['END_DATE_TIME']) - pd.to_datetime(NOAA_df['BEGIN_DATE_TIME']))
# NOAA_df['DURATION_OF_STORM'] = NOAA_df['DURATION_OF_STORM'].astype('str').str[:-13]

# calculate duration of the storm event

def calc_duration(x):
    begin_dt = dt.strptime(x['BEGIN_DATE_TIME'], "%d-%b-%y %H:%M:%S")
    end_dt = dt.strptime(x['END_DATE_TIME'], "%d-%b-%y %H:%M:%S")
    difference = end_dt - begin_dt
    difference_days = difference.days + difference.seconds/86400
    return difference_days

NOAA_df['DURATION_OF_STORM'] = NOAA_df.apply(lambda x: calc_duration(x), axis=1)

In [None]:
# Update EVENT_TYPE column

NOAA_df.replace({'EVENT_TYPE' : { 'THUNDERSTORMWIND/TREE' : 'ThunderstormWind', 'THUNDERSTORMWIND/TREES' : 'ThunderstormWind', 'THUNDERSTORMWINDS/FLASHFLOOD' : 'ThunderstormWind'
                                 , 'THUNDERSTORMWINDS/FLOODING' : 'ThunderstormWind', 'THUNDERSTORMWINDS/HEAVYRAIN' : 'ThunderstormWind'
                                 , 'THUNDERSTORMWINDSFUNNELCLOU': 'ThunderstormWind', 'THUNDERSTORMWINDSHEAVYRAIN': 'ThunderstormWind'
                                 , 'THUNDERSTORMWINDSLIGHTNING': 'ThunderstormWind'
                                 , 'HAIL/ICYROADS': 'Hail', 'HAILFLOODING': 'Hail', 'Hurricane(Typhoon)': 'Hurricane'}})


In [11]:
# Dropping NAN values - if required

NOAA_df.dropna(subset=["DAMAGE_PROPERTY"], inplace=True)
# NOAA_df.dropna(subset=["DAMAGE_CROPS"], inplace=True)


In [13]:
# impute DAMAGE_CROPS with mean value of damage per EVENT_TYPE

X = NOAA_df.groupby('EVENT_TYPE')

Y = X['DAMAGE_CROPS'].mean()

reset_index = Y.reset_index()

NOAA_df = pd.merge(NOAA_df, reset_index, on='EVENT_TYPE', how='inner')

NOAA_df.drop(columns=['DAMAGE_CROPS_x'], inplace=True)

NOAA_df = NOAA_df.rename(columns={"DAMAGE_CROPS_y": "DAMAGE_CROPS"})


In [None]:
def impute_NOAA_data(df):
    
    # magnitude converted into other variables
    drop_list = ['EVENT_NARRATIVE', 'EPISODE_NARRATIVE', 'EPISODE_ID', 'MAGNITUDE', 'BEGIN_LAT', 'END_LAT', 'BEGIN_LON', 'END_LON','BEGIN_DATE_TIME','END_DATE_TIME','STATE_FIPS'
                , 'TOR_OTHER_CZ_FIPS', 'WFO', 'SOURCE', 'CATEGORY', 'CZ_FIPS','DATA_SOURCE', 'TOR_OTHER_WFO', 'EVENT_ID', 'BEGIN_YEARMONTH','BEGIN_DAY','BEGIN_TIME', 'END_YEARMONTH'
                , 'END_DAY', 'END_TIME']
    #  impute_mean_list = ['BEGIN_LAT', 'END_LAT', 'BEGIN_LON', 'END_LON'] # removed because for events like DROUGHT there should be no travelled distance and added to drop list
    # Instead impute GEO_DISTANCE to 0
    
    # imputing damage columns with 0 for the time-being
    impute_zero_list = ['BEGIN_RANGE', 'END_RANGE', 'WIND_SPEED', 'HAIL_SIZE','GEO_DISTANCE', 'TOR_LENGTH' ,'TOR_WIDTH']
    # ['BEGIN_RANGE', 'END_RANGE', 'WIND_SPEED', 'HAIL_SIZE','GEO_DISTANCE', 'TOR_LENGTH' ,'TOR_WIDTH', 'DAMAGE_PROPERTY']

    impute_NA_list = ['CZ_NAME', 'STATE', 'MAGNITUDE_TYPE', 'BEGIN_AZIMUTH', 'END_AZIMUTH', 'BEGIN_LOCATION', 'END_LOCATION', 'FLOOD_CAUSE', 'TOR_F_SCALE'
                    , 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_NAME']
      
    # Splitting magnitude variable into constituent attributes
    df['WIND_SPEED'] = df.apply(winds, axis = 1)
    df['HAIL_SIZE'] = df.apply(hail, axis = 1)
    
    df['EVENT_TYPE'] = df['EVENT_TYPE'].apply(lambda x: rename_event_dict[x] if rename_event_dict.get(x)!= None else x)
    df['COLD_WEATHER_EVENT'] = df['EVENT_TYPE'].str.contains('Hail|Winter|Snow|Chill|Cold|Frost|Freeze|Blizzard|Ice|Avalanche').map({True: 1, False:0})
    df['WINDY_EVENT'] = df['EVENT_TYPE'].str.contains('Wind|Tornado|Thunderstorm|Cloud|Storm').map({True: 1, False:0})
    df['WATER_EVENT'] = df['EVENT_TYPE'].str.contains('Flood|Marine|Rain|Hurricane|Tide|Lake|Seiche|Tsunami|Sleet|Water').map({True: 1, False:0})
    df.loc[:,'CZ_TIMEZONE'] = df.loc[:,'CZ_TIMEZONE'].apply(lambda x: timezone_mapping(x))
    df.loc[:,'BEGIN_AZIMUTH'] = df.loc[:,'BEGIN_AZIMUTH'].str.upper().apply(lambda x: dict_mapping(x) if dict_mapping(x) != None else x)
    df.loc[:,'END_AZIMUTH'] = df.loc[:,'END_AZIMUTH'].str.upper().apply(lambda x: dict_mapping(x) if dict_mapping(x) != None else x)
    
    # Imputing string columns with missing values with NA
    for col in impute_NA_list:
        df[col] = df[col].astype('str').apply(lambda x: 'NA' if x=='nan' else x) # changed from N/A to NA
        
    # Imputing float columns having missing values with 0.0
    for col in impute_zero_list:
        df[col] = df[col].fillna(0.0)
        
    # Imputing latitude and longitudes with average value
    # for col in impute_mean_list:
    #     df[col] = df[col].fillna(np.mean)
        
    # Dropping text and ID columns
    for col in drop_list:
        df.drop(col, axis=1, inplace=True)
    
    return df

imputed_NOAA_df = impute_NOAA_data(NOAA_df.copy())
imputed_NOAA_df.head()

In [15]:
#cleaned_df = pd.merge(imputed_NOAA_df, imputed_EPA_df, on='YEAR', how='inner')
cleaned_df = imputed_NOAA_df
cleaned_df.to_pickle('/content/drive/MyDrive/cleaned_NAN_removed.pkl')

In [16]:
import pandas as pd

NOAA = pd.read_pickle('/content/drive/MyDrive/cleaned_NAN_removed.pkl')

# df_train = NOAA[(NOAA["YEAR"] > 2005) & (NOAA["EVENT_TYPE"]=='Flood')]

# df_train = NOAA[(NOAA["EVENT_TYPE"]=='Tornado')]

df_train = NOAA[(NOAA["YEAR"] > 2005)]

In [None]:
# Normalizing DAMAGE_PROPERTY variable - if required

# df_train['HasDamage'] = pd.Series(len(df_train['DAMAGE_PROPERTY']), index=df_train.index)
# df_train['HasDamage'] = 0 
# df_train.loc[df_train['DAMAGE_PROPERTY']>0,'HasDamage'] = 1

# df_train.loc[df_train['HasDamage']==1,'DAMAGE_PROPERTY'] = np.log(df_train['DAMAGE_PROPERTY'])

# df_train.drop(columns=['HasDamage'], inplace=True)

# df_train.info()

In [None]:
# assign labels to categorical columns - RUN THIS ONLY IF YOU WANT TO DO "LABEL ENCODING" OF CATEGORICAL VARIABLES

def mapping(xx):
    dict = {}
    count = -1
    for x in xx:
        dict[x] = count + 1
        count = count + 1
    return dict

for i in ['STATE', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE',  'MAGNITUDE_TYPE', 'BEGIN_AZIMUTH', 'END_AZIMUTH', 'FLOOD_CAUSE', 'TOR_F_SCALE', 'TOR_LENGTH', 'TOR_WIDTH', 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_NAME'
          , 'CZ_NAME',  'TOR_OTHER_CZ_NAME', 'BEGIN_LOCATION', 'END_LOCATION','CZ_TIMEZONE']:
    unique_tag = df_train[i].value_counts().keys().values
    dict_mapping = mapping(unique_tag)
    df_train[i] = df_train[i].map(lambda x: dict_mapping[x] if x in dict_mapping.keys() else -1)


In [None]:
# GET DUMMIES and assign labels to categorical columns

def mapping(xx):
    dict = {}
    count = -1
    for x in xx:
        dict[x] = count + 1
        count = count + 1
    return dict

for i in ['CZ_NAME', 'BEGIN_LOCATION', 'END_LOCATION',	'TOR_OTHER_CZ_STATE',	'TOR_OTHER_CZ_NAME']:
    unique_tag = df_train[i].value_counts().keys().values
    dict_mapping = mapping(unique_tag)
    df_train[i] = df_train[i].map(lambda x: dict_mapping[x] if x in dict_mapping.keys() else -1)

df_train = pd.get_dummies(df_train, prefix=['STATE', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_TIMEZONE', 'BEGIN_AZIMUTH', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'TOR_F_SCALE', 'END_AZIMUTH']
                          , columns=['STATE', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_TIMEZONE','BEGIN_AZIMUTH', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'TOR_F_SCALE', 'END_AZIMUTH'])
df_train.head()


In [18]:
# remove any pending NAN from DAMAGE_CROPS and DAMAGE_PROPERTY

# df_train.dropna(subset=["DAMAGE_PROPERTY"], inplace=True)
# df_train.dropna(subset=["DAMAGE_CROPS"], inplace=True)

# check for NULL values
for i in df_train.columns:
    if df_train[i].isna().sum() > 0:
        print(df_train[i].name, ":", df_train[i].isna().sum(), "\n")
        print((df_train[i].isna().sum()/df_train.shape[0])*100, "\n")
    else:
        continue

In [None]:
# correlation

corrmat = df_train.corr()

f, ax = plt.subplots(figsize=(22, 19))
sns.heatmap(corrmat, vmax=1.0, square=True)
plt.show()

#DAMAGE_PROPERTY correlation matrix
# k = 20 #number of variables for heatmap
# cols = corrmat.nlargest(k, 'DAMAGE_PROPERTY')['DAMAGE_PROPERTY'].index
# cm = np.corrcoef(df_train[cols].values.T)
# sns.set(font_scale=1.25)
# hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 15}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()

In [19]:
# Create train test split
X = df_train.loc[:, ~df_train.columns.isin(['DAMAGE_PROPERTY'])]

Y = df_train['DAMAGE_PROPERTY']

# split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

# print(X_train)

In [None]:
# Scaling the data if required

# sc = StandardScaler()
# sc.fit(X_train)
# X_train = sc.transform(X_train)
# X_test = sc.transform(X_test)


# Normalize the data

# X_train = (X_train - X_train.mean()) / X_train.std()

In [None]:
# PCA code if we use one-hot encoding

# pca = PCA(0.95)
# pca = PCA(n_components=25)
# pca.fit(X_train)


# X_train = pca.transform(X_train)
# X_test = pca.transform(X_test)

# print(X_train.shape)
# pca.components_ 

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

print(model)

# make predictions for test data
y_pred = model.predict(X_test)
y_pred_score = [round(value) for value in y_pred]

MSE = mean_squared_error(y_pred,y_test)
print(MSE)

print(model.score(X_train,y_train))

print(r2_score(y_test, y_pred))


In [None]:
clf_rf = RandomForestRegressor(n_estimators= 100, oob_score = 'TRUE', n_jobs = -1,random_state =50, max_features = "auto", min_samples_leaf = 50)
# RandomForestRegressor(max_depth=10, random_state=0)

# perform training
clf_rf.fit(X_train, y_train)

# make predictions

# prediction on test using all features
y_pred = clf_rf.predict(X_test)

MSE = mean_squared_error(y_pred,y_test)
print(MSE)

print(clf_rf.score(X_train,y_train))

print(r2_score(y_test, y_pred))

In [None]:
# np.argsort(clf_rf.feature_importances_)[::-1]
df_train.columns[np.argsort(clf_rf.feature_importances_)[::-1]]

In [None]:
# rfc = XGBRegressor()

# parameters = {
#     'max_depth': range(2, 10, 1),
#     'n_estimators': range(60, 220, 40),
#     'learning_rate': [0.1, 0.01, 0.05]
# }

# xgb = GridSearchCV(rfc, parameters, cv=5, scoring='accuracy')


xgb = XGBRegressor(learning_rate =0.01,subsample =0.7, max_depth=5, n_estimators=100, colsample_bytree=0.8)
xgb.fit(X_train, y_train, eval_set=[(X_train, y_train)])

# print(xgb.feature_importances_)

# make predictions for test data
y_pred = xgb.predict(X_test)
y_pred_score = [round(value) for value in y_pred]

MSE = mean_squared_error(y_pred,y_test)
print(MSE)

print(xgb.score(X_train,y_train))

print(r2_score(y_test, y_pred))

In [None]:
alpha_range = 10.**np.arange(-2, 3)

ridgeregcv = RidgeCV(alphas=alpha_range, normalize=True, scoring='neg_mean_squared_error')
ridgeregcv.fit(X_train, y_train)
ridgeregcv.alpha_

y_pred = ridgeregcv.predict(X_test)

print("R-Square Value",r2_score(y_test,y_pred))

R-Square Value 0.03581437685072386


In [None]:
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
print('alpha : ',lassoregcv.alpha_)

y_pred = lassoregcv.predict(X_test)

print("R-Square Value",r2_score(y_test,y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

model = XGBRegressor()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, Y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1,verbose=3)
# force scores to be positive
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

In [None]:
# identify outliers with standard deviation
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std
# seed the random number generator
seed(1)
# generate univariate observations
data = NOAA_df
# calculate summary statistics
data_mean, data_std = mean(data), std(data)
# identify outliers
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off
# identify outliers
outliers = [x for x in data if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in data if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))

In [None]:
from sklearn.neighbors import LocalOutlierFactor
# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)