In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
#Help funcs for colored output
from termcolor import colored, cprint
def green(txt):
    return colored(txt, 'green')
def red(txt):
    return colored(txt, 'red')
def blue(txt):
    return colored(txt, 'cyan')
def bold(txt):
    return colored(txt, attrs=['bold'])

In [10]:
#Get raw Data
df = pd.read_csv("data/house_data_training.csv", sep=';') 
# remove unnamed column
df = df.iloc[:, 1:]
#Transform string to datetime
df["date"] = pd.to_datetime(df["date"])
df.tail()
#Drop rows with NaN values

maeList = []#for model selection

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

def reg_train_test(X_train, X_test, y_train, y_test):
    '''Function for building Basic Regression Model'''

    # fit the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # evaluate the model
    ypred = model.predict(X_test)
    
    # evaluate predictions
    mae = mean_absolute_error(y_test, ypred)
    maeList.append(mae)
    print(f'{bold("Mean Absolute Error")}: {blue(np.round(mae))}\n')
    
    print(f'{bold("Regression coefficients:")} \n{blue(model.coef_)}\n')
    
    print(bold("Prediction Examples:"))
    for i in range(0, 5):
        y_pre= ypred[i]
        print(f'{blue(i)} Actual y: {blue(y_test[i])} Predicted y: {blue(np.round(y_pre))}')
 
    return model

In [12]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14997 entries, 0 to 14999
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             14997 non-null  int64         
 1   date           14997 non-null  datetime64[ns]
 2   price          14997 non-null  float64       
 3   bedrooms       14997 non-null  int64         
 4   bathrooms      14997 non-null  float64       
 5   sqft_living    14997 non-null  int64         
 6   sqft_lot       14997 non-null  int64         
 7   floors         14997 non-null  float64       
 8   waterfront     14997 non-null  float64       
 9   dis_super      14997 non-null  float64       
 10  view           14997 non-null  float64       
 11  condition      14997 non-null  float64       
 12  grade          14997 non-null  float64       
 13  sqft_above     14997 non-null  float64       
 14  sqft_basement  14997 non-null  float64       
 15  yr_built       1499

In [13]:
#Univariate outlier detection based on descriptive statistics (three standard deviations)
#can be useful to identify extreme outliers

feature_list = df.columns
outliers_dict = {}#dict for storing outlierts for an outlier summary df
outliers_plot_dict = {}#dict for plotting outliers in scatterplot
outlier_list_unique = []
print(bold("Potential Outliers:"))
for feature in feature_list:
    feature_data = df[feature]
    price_data = df["price"]
    # if feature_data.isnumeric():
    
    if not feature == "yr_renovated" or feature == yr_renovated:
        df_feature = pd.concat([feature_data, price_data], axis=1)
        df_feature["outlier"] = 0

        three_std=feature_data.std()*3
        mean=feature_data.mean()

        inlier_low=mean-three_std
        inlier_high=mean+three_std
        #print("mean: ",mean, "lower boundary inlier: ",inlier_low, "upper boundary inlier",inlier_high,"\n")

        outlier_list = [] #list for storing indexes of outliers
        for i, value in enumerate(feature_data):
            if value < inlier_low or value > inlier_high:
                outlier_list.append(i)
                df_feature.iloc[i,2] = 1      

        print(f'{bold(feature)} detected: {blue(len(outlier_list))}')
        if not len(outlier_list) == 0:
            outliers_dict[str(feature)]=outlier_list
            outliers_plot_dict[str(feature)]=df_feature
            outlier_list_unique =  list(set(outlier_list_unique) | set(outlier_list))
        #print(type(df_feature), df_feature.head())

#store indexes from the original df of every potential outlier existing in a new df for every column
df_outliers = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in outliers_dict.items() ]))
#print(df_outliers.tail())
#print(df_feature.tail())   

print(len(outlier_list_unique))  

[1mPotential Outliers:[0m
[1mid[0m detected: [36m0[0m
[1mdate[0m detected: [36m0[0m
[1mprice[0m detected: [36m421[0m
[1mbedrooms[0m detected: [36m50[0m
[1mbathrooms[0m detected: [36m120[0m
[1msqft_living[0m detected: [36m144[0m
[1msqft_lot[0m detected: [36m241[0m
[1mfloors[0m detected: [36m6[0m
[1mwaterfront[0m detected: [36m102[0m
[1mdis_super[0m detected: [36m40[0m
[1mview[0m detected: [36m559[0m
[1mcondition[0m detected: [36m23[0m
[1mgrade[0m detected: [36m85[0m
[1msqft_above[0m detected: [36m169[0m
[1msqft_basement[0m detected: [36m168[0m
[1myr_built[0m detected: [36m0[0m


NameError: name 'yr_renovated' is not defined

In [None]:
from sklearn.model_selection import train_test_split

df_1 = df.drop(outlier_list_unique)
# Select price as label
X, y = df_1, df_1["price"]
# remove price_data from list
X = X.drop(columns=["price"])

# Transform Column to a numeric value
X[["date"]] = X[["date"]].apply(pd.to_numeric)

# Dataframes in numpy-Arrays konvertieren
X,y  = np.array(X.values.tolist()), np.array(y.values.tolist())

#split Data and train the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
model_1 = reg_train_test(X_train, X_test, y_train, y_test)

[1mMean Absolute Error[0m: [36m3585970.0[0m

[1mRegression coefficients:[0m 
[36m[-6.44274132e-05  5.16359323e-12 -8.70121419e+01  6.47377643e+02
 -4.62884802e+01 -3.80603444e+00  5.20325462e+02 -3.59925760e+00
  3.61666333e+02 -9.00278821e+01 -6.63201785e+02  5.78118881e+02
  4.64612686e+02 -9.20804822e+01  6.91442343e+04  4.63768354e+02
  1.20216427e+04 -2.58409180e+01  4.96428463e+01 -2.47366709e+02
 -1.21237858e+01  7.70977537e+02  4.50611073e+03  5.27708826e+03][0m

[1mPrediction Examples:[0m
[36m0[0m Actual y: [36m550000.0[0m Predicted y: [36m-784176.0[0m
[36m1[0m Actual y: [36m337000.0[0m Predicted y: [36m3496217.0[0m
[36m2[0m Actual y: [36m840000.0[0m Predicted y: [36m599041.0[0m
[36m3[0m Actual y: [36m780000.0[0m Predicted y: [36m3911485.0[0m
[36m4[0m Actual y: [36m247000.0[0m Predicted y: [36m2281210.0[0m


In [None]:
from sklearn.model_selection import train_test_split

df_1 = df.drop(outlier_list_unique)
# Select price as label
X, y = df_1, df_1["price"]
# remove price_data from list
X = X.drop(columns=["price"])

# Transform Column to a numeric value
X[["date"]] = X[["date"]].apply(pd.to_numeric)

# Dataframes in numpy-Arrays konvertieren
X,y  = np.array(X.values.tolist()), np.array(y.values.tolist())

#split Data and train the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
model_1 = reg_train_test(X_train, X_test, y_train, y_test)

In [None]:
# #Univariate outlier detection based on descriptive statistics (three standard deviations)
# #can be useful to identify extreme outliers
# feature_list=['bedrooms', 'bathrooms', 'sqft_living',
#        'sqft_lot', 'floors', 'dis_super', 'view', 'condition',
#        'grade', 'sqft_above', 'sqft_basement', 'yr_built',
#        'sqft_living15', 'sqft_lot15']
       
# #feature_list = df.columns
# outliers_dict = {}#dict for storing outlierts for an outlier summary df
# outliers_plot_dict = {}#dict for plotting outliers in scatterplot
# outlier_list_unique = []
# print(bold("Potential Outliers:"))
# for feature in feature_list:
#     feature_data = df[feature]
#     price_data = df["price"]

#     df_feature = pd.concat([feature_data, price_data], axis=1)
#     df_feature["outlier"] = 0

#     three_std=feature_data.std()*10
#     mean=feature_data.mean()

#     inlier_low=mean-three_std
#     inlier_high=mean+three_std
#     #print("mean: ",mean, "lower boundary inlier: ",inlier_low, "upper boundary inlier",inlier_high,"\n")

#     outlier_list = [] #list for storing indexes of outliers
#     for i, value in enumerate(feature_data):
#         if value < inlier_low or value > inlier_high:
#             outlier_list.append(i)
#             df_feature.iloc[i,2] = 1      

#     print(f'{bold(feature)} detected: {blue(len(outlier_list))}')
#     if not len(outlier_list) == 0:
#         outliers_dict[str(feature)]=outlier_list
#         outliers_plot_dict[str(feature)]=df_feature
#         outlier_list_unique =  list(set(outlier_list_unique) | set(outlier_list))
#     #print(type(df_feature), df_feature.head())

# #store indexes from the original df of every potential outlier existing in a new df for every column
# df_outliers = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in outliers_dict.items() ]))

# print(len(outlier_list_unique))  

In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
from scipy.stats import norm 

fig = plt.figure(figsize=(20,10))
i=1
for key, value in outliers_plot_dict.items():
    plt.subplot(3,6,i)
    data = value[key]
    min = data.min()
    max = data.max()
    if not max <= 10:
        if max >1000000:
            step = np.round(max/20)
        elif max <100:
            step = np.round(max/10)
        elif max >5:
             step = np.round(max/5)    
        #print(max)
        mean = data.mean()
        sd = data.std()
        x_axis = np.arange(min, max, step) 

        fig.subplots_adjust(hspace = .2, wspace=0.4)
        plt.ylabel("Dichte")
        plt.xlabel(key)
        ax = plt.gca()
        #ax.axes.yaxis.set_ticklabels([])
        plt.yticks()
        plt.plot(x_axis, norm.pdf(x_axis, mean, sd)) 
        plt.plot(x_axis, norm.pdf(x_axis, mean, sd*3)) 
        i+=1

plt.show()

In [None]:
# print(len(outliers_plot_dict))
# fig, ax = plt.subplots(1,len(outliers_plot_dict), figsize=(20,20))
# i=0
# for key, value in outliers_plot_dict.items():
#     print(key)
#     print(value)
#     ax[i].plot(value['price'], value[key], 'o')
#     ax[i].set_xlabel(key)
#     ax[i].set_title('price')   
#     #sns.scatterplot(data=value, x=key, y="price", hue="outlier")
#     i+=1
# plt.show()

fig = plt.figure(figsize=(20,10))
i=1
for key, value in outliers_plot_dict.items():
    plt.subplot(3,6,i)
    sns.scatterplot(data=value, x=key, y="price", hue="outlier")
    i+=1
plt.show()     

In [None]:
#normalize data to identify outliers
scaler = preprocessing.MinMaxScaler()
X= scaler.fit_transform(X_train)

#determine k nearest neighbors ()
k=3
nbrs = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(X)

#plot Distances to k-nearest neighbor, top 100
plt.title('Distances to k-nearest neighbor, top 100')
top_distance_min=0
top_distance_max=100
plt.plot(indices[top_distance_min:top_distance_max,0],-np.sort(-distances[:,k])[top_distance_min:top_distance_max])
plt.show()

#plot Distances to k-nearest neighbor, selected
plt.title('Distances to k-nearest neighbor, selected')
top_distance_min=10
top_distance_max=40
plt.plot(indices[top_distance_min:top_distance_max,0],-np.sort(-distances[:,k])[top_distance_min:top_distance_max])
plt.show()

In [None]:
#k= number of outliers // approx. 10% of data 
num_outliers= 181
outlier_indices=np.argpartition(distances[:,1],-num_outliers)[-num_outliers:]
inlier_indices=np.delete(indices[:,0], outlier_indices)

print("Indices of outliers: ",outlier_indices)
#print("Indices of inliers: ",inlier_indices)
#print("Distances of outliers: ",distances[outlier_indices,1])
#print("Max 10 distances: ",-np.sort(-distances[:,1])[0:10])

# select all rows that are not outliers (inlier=1, outlier=-1)
X_train_red, y_train_red = X_train[inlier_indices, :], y_train[inlier_indices]
# Inliers vs. Outliers
print(bold("Inliers: "),blue(X_train_red.shape[0]),bold("Outliers"),blue(X_train.shape[0]-X_train_red.shape[0]),"\n")
model_02 = reg_train_test(X_train_red ,X_test, y_train_red, y_test)

In [None]:
clustering = DBSCAN(eps=0.42, min_samples=5).fit(X)

inliers=[]
outliers=[]
index_upper=distances[:,1].size

for index in range (0,index_upper):
    if clustering.labels_[index] == -1:
        outliers.append([index, distances[index,1]])
    else:
        inliers.append([index, distances[index,1]])

inliers_df=pd.DataFrame(inliers,columns=['index','distance'])
outliers_df=pd.DataFrame(outliers,columns=['index','distance'])
# print("inliers: ",inliers_df.describe())
# print("\noutliers: ",outliers_df.describe())