In [1]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [2]:
data = pd.read_csv("data/housing.csv", sep=";")

FileNotFoundError: File b'data/housing.csv' does not exist

Aufgabe 1:
Teil 1:
- Mehrdimensionale Lineare Regression auf einem Datensatz (https://github.com/ageron/handson-ml/blob/master/datasets/housing/)
- Vorverarbeitung der Daten (Skalierung, kategorische Attribute: http://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)
- Aufteilung in Test & Train
- Berechnung der Fehler der Vorhersage auf dem Test-Datensatz
- Anpassung der Inputdaten (Feature Engineering)
- Niedrigster MSE auf einem (für alle der gleiche) Test-Datensatz gewinnt
Teil 2:
So wie Teil 1, nur mit KNeighborsRegressor (k-Nearest-Neighbors-Regression) als Vorhersageverfahren

# Looking at the data

In [None]:
print(data.isnull().sum().sum()) #nr of nans
#data.isnull().values.any() # only gives true/false
print(data.shape)

In [None]:
data.head()

In [None]:
data.describe()

# Preprocessing

In [None]:
#X=data[(data['median_house_value']!=500001) | (data['median_income']>=2)]
X=data[(data['median_income']<=2)]
X

In [None]:
plt.scatter(data["median_income"],data["median_house_value"])

In [None]:
plt.scatter(data["population"],data["median_house_value"])

### Encoding categorical features

In [None]:
enc_labels = preprocessing.LabelEncoder()

In [None]:
enc_labels.fit(data["ocean_proximity"])
new_ocean_prox = enc_labels.transform(data["ocean_proximity"])
print(type(new_ocean_prox))

In [None]:
# One-hot encode ocean_proximity directly with pandas instead of chaining
# LabelEncoder -> OneHotEncoder(sparse=False); the `sparse` keyword is not accepted
# by newer scikit-learn releases. The dummy columns are named after the category
# values and dtype=float keeps them as 0.0/1.0 like the encoder output.
new_ocean_df = pd.get_dummies(data["ocean_proximity"], dtype=float)

In [None]:
data_new = pd.concat([data,new_ocean_df],axis=1)
display(data_new)
data_new.drop("ocean_proximity",axis = 1,inplace = True)
data_new

### (Replacing NaNs)

In [None]:
# Alternative (currently disabled below): replace the 207 NaNs in column total_bedrooms with the column mean

In [None]:
"""
data_new["total_bedrooms"].fillna(data_new["total_bedrooms"].mean(),inplace = True)
with pd.option_context('display.max_rows', 1000, 'display.max_columns', 300): # shows more of the df
    display(data_new["total_bedrooms"])
    """

## Scatter matrices

In [None]:
fig = plt.figure(figsize = (30,30))
for col in range(0,8):
    plt.subplot(4,2,col+1)
    plt.scatter(data_new.iloc[:,col],data_new["median_house_value"])
    plt.title(data_new.columns[col],fontsize=40)

### Dropping rows with NaNs

In [None]:
data_new["total_bedrooms"].isnull().sum()

In [None]:
cols = data_new.columns.tolist()

In [None]:
# Drop every row that contains a NaN and renumber the remaining rows 0..n-1.
# drop=True discards the old index instead of inserting it as an extra "index"
# column; later cells rely on the consecutive row labels (row-wise .loc access
# and column-wise concat).
data_new = data_new.dropna().reset_index(drop=True)
data_new.head()

In [None]:
data_new = data_new[['longitude','latitude','housing_median_age','total_rooms', 'total_bedrooms', 'population', 'households','median_income','<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN','median_house_value']]

In [None]:
with pd.option_context('display.max_rows', 1000, 'display.max_columns', 300): # shows more of the df
    display(data_new)
data_new.shape

In [None]:
data_new.plot(kind="scatter",x="longitude",y="latitude",alpha=0.4,
                s=data_new["population"]/100,label="population",figsize=(10,7),
                c="median_house_value",cmap=plt.get_cmap("jet"),colorbar=True,)
plt.legend()

In [None]:
# seaborn is already imported as `sns` in the import cell at the top of the
# notebook, so the duplicate import that used to live here was removed.
sns.set(style="ticks")
# Pairwise scatter plots of columns 0-7 plus column 13 of data_new.
plot_vars = data_new.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 13]]
sns.pairplot(plot_vars);


In [None]:
corr_matrix = data_new.iloc[:,[0,1,2,3,4,5,6,7,13]].corr()
plt.subplots(figsize=(20,10))
sns.heatmap(corr_matrix, annot=True, linewidths=1, cmap='viridis');

In [None]:
# Replace the two absolute room counts with one ratio feature (bedrooms per room)
# and restore the column order with the target in the last position.
room_count_cols = ["total_rooms", "total_bedrooms"]
bedroom_share = data_new["total_bedrooms"] / data_new["total_rooms"]
rooms_combined = data_new.assign(bedr_per_rooms=bedroom_share).drop(room_count_cols, axis=1)
rooms_combined = rooms_combined[[
    'longitude', 'latitude', 'housing_median_age', 'bedr_per_rooms', 'population',
    'households', 'median_income', '<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY',
    'NEAR OCEAN', 'median_house_value',
]]
rooms_combined.head()

In [None]:
"""corr_matrix = rooms_combined.iloc[:,[0,1,2,3,4,5,6,12]].corr()
plt.subplots(figsize=(20,10))
sns.heatmap(corr_matrix, annot=True, linewidths=1, cmap='viridis');
"""

In [None]:
# Add a households-per-person ratio; the raw population and households columns
# are left out of the final column selection.
household_ratio = rooms_combined["households"] / rooms_combined["population"]
kept_columns = [
    'longitude', 'latitude', 'housing_median_age', 'bedr_per_rooms', 'househ_per_pop',
    'median_income', '<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN',
    'median_house_value',
]
rooms_pophouse_comb = rooms_combined.assign(househ_per_pop=household_ratio)[kept_columns]
rooms_pophouse_comb.head()

## Converting housing median age to categorical variables

In [None]:
agecats_data = data_new.copy()
agecats_data["age_cat"] = np.zeros(agecats_data.shape[0])
agecats_data

In [None]:
# Bin housing_median_age into ordinal categories in one vectorised pass instead of
# a row-by-row .loc loop (which is slow and only works with 0..n-1 row labels).
# Bin edges are unchanged:
#   age < 18 -> 1, [18, 29) -> 2, [29, 37) -> 3, [37, 52) -> 4, age == 52 -> 5
# Any other value keeps the initial category 0.
age = agecats_data["housing_median_age"]
age_conditions = [age < 18, age < 29, age < 37, age < 52, age == 52]
# np.select picks the first condition that holds, mirroring the if/elif chain.
agecats_data["age_cat"] = np.select(age_conditions, [1.0, 2.0, 3.0, 4.0, 5.0], default=0.0)

agecats_data = agecats_data.drop("housing_median_age", axis=1)
agecats_data.head()

In [None]:
agecats_data = agecats_data[['longitude','latitude','age_cat','total_rooms', 'total_bedrooms', 'population', 'households','median_income','<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN','median_house_value']]

In [None]:
agecats_rooms_pophouse_comb = rooms_pophouse_comb.copy()
agecats_rooms_pophouse_comb["age_cat"] = agecats_data["age_cat"]

In [None]:
agecats_rooms_pophouse_comb.head()

In [None]:
agecats_rooms_pophouse_comb.drop("housing_median_age",axis=1,inplace = True)

In [None]:
agecats_rooms_pophouse_comb = agecats_rooms_pophouse_comb[['longitude','latitude','age_cat', 'bedr_per_rooms', 'househ_per_pop', 'median_income','<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN','median_house_value']]

agecats_rooms_pophouse_comb.head()

In [None]:
binaryagecats_rooms_pophouse_comb = agecats_rooms_pophouse_comb.copy()

In [None]:
# One-hot encode age_cat with pandas. The column names ("age1", "age2", ...) are
# derived from the category values that are actually present, instead of a
# hard-coded list of five names that has to match the encoder output width.
# get_dummies keeps the frame's index, so the column-wise concat below aligns
# row by row without relying on a 0..n-1 index.
new_age_cats_df = pd.get_dummies(
    binaryagecats_rooms_pophouse_comb["age_cat"].astype(int),
    prefix="age",
    prefix_sep="",
    dtype=float,
)
binaryagecats_rooms_pophouse_comb = pd.concat([binaryagecats_rooms_pophouse_comb, new_age_cats_df], axis=1)

In [None]:
binaryagecats_rooms_pophouse_comb.drop("age_cat",axis=1,inplace=True)

In [None]:
binaryagecats_rooms_pophouse_comb.head()

In [None]:
coords_df = binaryagecats_rooms_pophouse_comb.loc[:,["longitude","latitude","median_house_value"]]
coords_df.head()

In [None]:
mima = preprocessing.MinMaxScaler()
coords_df = mima.fit_transform(coords_df)

In [None]:
coords_df = pd.DataFrame(data=coords_df,columns = ["longitude","latitude","median_house_value"])
coords_df

In [None]:
# Fix the seed so the cluster assignment (and everything derived from it) is
# reproducible across kernel restarts; 42 matches the seed used for the
# train/test split further down.
# NOTE(review): coords_df contains the scaled median_house_value next to the
# coordinates, so the clustering sees the regression target — confirm this is
# intended before cluster-derived features are used for prediction.
km = KMeans(n_clusters=2, random_state=42)

clusters = km.fit_predict(coords_df)
centroids = km.cluster_centers_

In [None]:
clusters

In [None]:
centroids_coords = centroids[:,0:2]
centroids_coords

In [None]:
def k_mean_distance(data, cx, cy, i_centroid, cluster_labels):
    """Return the Euclidean distances of one cluster's points to (cx, cy).

    Despite the name, no mean is taken: the result holds one distance per point.

    Parameters:
        data: 2-D array whose rows are (x, y) pairs.
        cx, cy: coordinates of the reference point (typically the cluster centroid).
        i_centroid: label of the cluster whose points are selected.
        cluster_labels: array with one label per row of ``data``.

    Returns:
        list with one distance per row of ``data`` whose label equals
        ``i_centroid``, in row order.
    """
    members = data[cluster_labels == i_centroid]
    result = []
    for px, py in members:
        result.append(np.sqrt((px - cx) ** 2 + (py - cy) ** 2))
    return result
test = coords_df.loc[:,["longitude","latitude"]]
k_mean_distance(np.array(test),0.26,0.57,0,clusters)
#test[clusters == 1]

In [None]:
# Copy the coordinate columns: later cells add "label" and "dist" columns to
# only_coords, and writing to a slice of coords_df can trigger pandas'
# SettingWithCopyWarning.
only_coords = coords_df.loc[:, ["longitude", "latitude"]].copy()
coord_points = np.array(only_coords)

# distances[i] holds, for every point assigned to cluster i, its distance to the
# (longitude, latitude) part of that cluster's centroid.
distances = [
    k_mean_distance(coord_points, cx, cy, i, clusters)
    for i, (cx, cy) in enumerate(centroids_coords)
]

dist = distances

In [None]:
only_coords["label"] = clusters
only_coords.head()

In [None]:
only_coords.loc[only_coords["label"]== 0,"dist"] = dist[0]

In [None]:
only_coords.loc[only_coords["label"]== 1,"dist"] = dist[1]

In [None]:
only_coords.head()

In [None]:
# Scatter of the scaled coordinates, coloured by distance to the assigned centroid.
ax = only_coords.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                      figsize=(10, 7),
                      c="dist", cmap=plt.get_cmap("jet"), colorbar=True)
# Draw the centroids on the same axes explicitly and give them a label, so that
# the legend has an entry to show.
centers = np.array(centroids)
ax.scatter(centers[:, 0], centers[:, 1], marker="x", color="black", label="cluster centroids")
ax.legend();

In [None]:
optimized_df = binaryagecats_rooms_pophouse_comb.copy()

In [None]:
optimized_df["dist"] = only_coords["dist"]

In [None]:
optimized_df.drop(["longitude","latitude"],axis=1,inplace=True)

In [None]:
optimized_df = optimized_df[['dist','bedr_per_rooms', 'househ_per_pop', 'median_income','<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN','age1','age2','age3','age4','age5','median_house_value']]
optimized_df.head()

In [None]:
fig = plt.figure(figsize = (30,30))
for col in range(0,4):
    plt.subplot(2,2,col+1)
    plt.scatter(optimized_df.iloc[:,col],optimized_df["median_house_value"])
    plt.title(optimized_df.columns[col],fontsize=40)

In [None]:
class ItsMagic:
    """Small experiment runner for regression models.

    Input data frames and estimators are registered under unique names. Every
    (input data, estimator) pair can then be fitted on the training split and
    evaluated on the test split; predictions and error metrics are collected.
    """

    def __init__(self):
        self.results = pd.DataFrame() # one column of test-set predictions per analysis name
        self.input_data = {} # input name -> {"raw_data": df, "prep_data": {train_X, test_X, train_y, test_y}}
        self.algo_names = [] # list of all algo names that were added
        self.algos = {} # algo name -> {"algo_name": name, "algo": estimator template}
        self.analysis_summaries = {} # analysis name -> input/algo used, fitted model and metrics

    def add_input_prep(self, input_data_name, input_data, colidx_with_cont_data = list(range(0,8))):
        """Register an input data frame: split it into train/test and scale continuous features.

        Parameters:
            input_data_name: unique key under which the data is stored.
            input_data: data frame that contains the target column
                'median_house_value'; all other columns are used as features.
            colidx_with_cont_data: positional indices (within the feature
                columns) of the continuous features to standard-scale. An empty
                list or None disables scaling. The default list is never mutated.

        Returns:
            self.input_data after registering the new entry, or None (after
            printing a message) when the name is already taken.
        """
        if input_data_name in self.input_data:
            print("Input data name already exists!")
            return None

        cont_idx = [] if colidx_with_cont_data is None else list(colidx_with_cont_data)

        # split in train and test (fixed seed: every input gets the same row split)
        features = input_data.loc[:, input_data.columns != 'median_house_value']
        target = input_data["median_house_value"]
        train_X, test_X, train_y, test_y = train_test_split(features, target, test_size=0.2, random_state=42)

        # Work on explicit copies so the in-place scaling below never writes into
        # (a view of) the caller's frame and cannot trigger SettingWithCopyWarning.
        train_X = train_X.copy()
        test_X = test_X.copy()

        if cont_idx:
            # standard scaling of continuous variables; the scaler is fitted on the
            # training split only and then applied to both splits
            scaler = preprocessing.StandardScaler()
            scaler.fit(train_X.iloc[:, cont_idx])
            train_X.iloc[:, cont_idx] = scaler.transform(train_X.iloc[:, cont_idx])
            test_X.iloc[:, cont_idx] = scaler.transform(test_X.iloc[:, cont_idx])

        # Register only after split and scaling succeeded, so a failure above does
        # not leave a half-initialised entry behind.
        self.input_data[input_data_name] = {
            'raw_data': input_data,
            'prep_data': {"train_X": train_X, "test_X": test_X, "train_y": train_y, "test_y": test_y},
        }

        print(f"Input data: {input_data_name}.")
        print(f"Scaled {len(cont_idx)} features.")
        print(f"The scaled features are: {train_X.iloc[:,cont_idx].columns.values}")
        not_scaled_mask = [c not in cont_idx for c in range(train_X.shape[1])]
        not_scaled = train_X.iloc[:,not_scaled_mask].columns.values
        print(f"Did NOT scale these ones: {not_scaled}")

        return self.input_data

    def run_analysis(self,analysis_name,input_data_name,algo_name,):
        """Fit one registered algo on one registered input and store predictions and metrics.

        A fresh clone of the registered estimator is fitted, so the model stored
        under "fit" belongs to this analysis only and is not overwritten when the
        same algo is later run on other input data.

        Stored in self.analysis_summaries[analysis_name]: "input_data_name",
        "algo_name", "fit", "RMSE", "R2", "RSE" and, for LinearRegression, "betas"
        (one-row data frame of coefficients). The test-set predictions are stored
        as column analysis_name of self.results.

        Raises:
            KeyError: if input_data_name or algo_name has not been registered.
        """
        if analysis_name in self.analysis_summaries:
            print("Analysis name already exists!")
            return

        # get the data from input_data
        prep = self.input_data[input_data_name]['prep_data']
        train_X, test_X = prep["train_X"], prep["test_X"]
        train_y, test_y = prep["train_y"], prep["test_y"]

        model = clone(self.algos[algo_name]['algo']).fit(train_X, train_y)
        pred = model.predict(test_X)

        # error and r2 calculation
        rmse = np.sqrt(mean_squared_error(test_y, pred))
        R2 = model.score(test_X, test_y)
        residuals = np.asarray(pred) - np.asarray(test_y)
        rss = float(np.sum(residuals ** 2))
        # residual standard error, using (n_test_rows - n_features) as the divisor
        rse = np.sqrt(rss/(test_X.shape[0]-test_X.shape[1]))

        summary = {"input_data_name":input_data_name,"algo_name":algo_name,"fit":model,
                   "RMSE":rmse,"R2":R2,"RSE":rse}
        print(f"RSE: {rse}")
        print(f"RMSE: {rmse}")
        print(f"R2: {R2}")

        # Use the public class path; the private `linear_model.base` module does
        # not exist in current scikit-learn releases.
        if isinstance(model, linear_model.LinearRegression):
            betas_df = pd.DataFrame(data = model.coef_.reshape(1,train_X.shape[1]),columns = train_X.columns)
            summary["betas"] = betas_df

        # Publish only complete results: a failure above leaves no partial summary
        # that would later break calculate_error_rates.
        self.results[analysis_name] = pred
        self.analysis_summaries[analysis_name] = summary

    def run_all_combis(self):
        """Run run_analysis for every registered (input data, algo) pair.

        The analysis name is "<algo name>_<input data name>"; pairs that were
        already run are skipped by run_analysis.
        """
        for i in self.input_data.keys():
            for a in self.algos:
                print(a)
                ananame = str(a) + "_" + str(i)
                print(f"\nRunning {ananame} analysis...")
                self.run_analysis(ananame,i,a)
                print("Finished!")

    def calculate_error_rates(self):
        """Return a data frame with R2, RMSE and RSE for each analysis.

        One row per analysis that was run with run_analysis, indexed by analysis
        name, with the columns "Input DF", "Algo Name", "R2", "RMSE", "RSE". The
        frame is built in one step so the metric columns are numeric and sort
        correctly.
        """
        columns = ["Input DF","Algo Name","R2","RMSE","RSE"]
        rows = [
            [summary["input_data_name"], summary["algo_name"], summary["R2"], summary["RMSE"], summary["RSE"]]
            for summary in self.analysis_summaries.values()
        ]
        return pd.DataFrame(rows, columns=columns, index=list(self.analysis_summaries))

    def pretty_barplots(self):
        """Placeholder for a barplot of the error rates and R2 of all analyses; not implemented yet."""
        pass

    def add_algo(self, algo, algo_name):
        """Register an estimator, including its configuration (e.g. n_neighbors).

        algo_name must be unique; a duplicate name is rejected with a message and
        the existing entry is left untouched.
        """
        if algo_name not in self.algo_names:
            self.algo_names.append(algo_name)
            self.algos[algo_name] = {'algo_name': algo_name, 'algo': algo}
        else: print("This algo config already exists!!")

    def delete_input_data(self,input_data_name):
        """Remove a registered input data set; an unknown name is reported and ignored."""
        removed = self.input_data.pop(input_data_name,None) # None means nothing happens if key not found
        if removed is None:
            print(f"{input_data_name} was not found, nothing deleted")
        else:
            print(f"{input_data_name} was deleted")

    def get_results(self):
        """Return the data frame with one column of test-set predictions per analysis."""
        return self.results
        

In [None]:
#input_data.loc[:, input_data.columns != 'median_house_value']

In [None]:
data_new.head()

In [None]:
analysis_new = ItsMagic()
analysis_new.add_input_prep("agecats_data",agecats_data,list(range(0,8)))
analysis_new.add_input_prep("optimized",optimized_df,list(range(0,4)))
analysis_new.add_input_prep("agecats_rooms_pophouse_comb",agecats_rooms_pophouse_comb,list(range(0,6)))
analysis_new.add_algo(linear_model.LinearRegression(),'standard_linreg')
analysis_new.add_algo(KNeighborsRegressor(n_neighbors=10), 'knr_10')
analysis_new.add_input_prep("binaryagecats_rooms_pophouse_comb",binaryagecats_rooms_pophouse_comb,[0,1,2,3,4])

In [None]:
analysis_new.add_input_prep("rooms_pophouse_comb",rooms_pophouse_comb,[0,1,2,3,4,5])
analysis_new.add_input_prep("orig_data",data_new,list(range(0,8)))
analysis_new.add_input_prep("only_coords",data_new.iloc[:,[0,1,13]],[0,1])

In [None]:
analysis_new.run_all_combis()

In [None]:
analysis_new.calculate_error_rates().sort_values(["R2","RMSE"],ascending=[False,True])

In [None]:
analysis = ItsMagic()
analysis.add_input_prep("orig_data",data_new,list(range(0,8)))
analysis.add_input_prep("no_coords",data_new.iloc[:,list(range(2,14))],list(range(0,6)))
analysis.add_input_prep("orig_data",data_new,list(range(0,8))) # is not added again (already exists)
analysis.add_input_prep("no_oceanprox",data_new.iloc[:,[0,1,2,3,4,5,6,7,13]],list(range(0,8)))
analysis.add_input_prep("only_income",data_new.iloc[:,[7,13]],[0])
analysis.add_input_prep("only_coords",data_new.iloc[:,[0,1,13]],[0,1])
analysis.add_input_prep("only_long",data_new.iloc[:,[0,13]],[0])
analysis.add_input_prep("rooms_combined",rooms_combined,list(range(0,7)))
analysis.add_input_prep("coords_income",data_new.iloc[:,[0,1,7,13]],[0,1,2])
analysis.add_input_prep("rooms_pophouse_comb",rooms_pophouse_comb,[0,1,2,3,4,5])

analysis.input_data

In [None]:
analysis.add_algo(KNeighborsRegressor(n_neighbors=1), 'knr_1') 
analysis.add_algo(KNeighborsRegressor(n_neighbors=2), 'knr_2')
analysis.add_algo(KNeighborsRegressor(n_neighbors=3), 'knr_3')
analysis.add_algo(KNeighborsRegressor(n_neighbors=4), 'knr_4')
analysis.add_algo(KNeighborsRegressor(n_neighbors=5), 'knr_5')
analysis.add_algo(KNeighborsRegressor(n_neighbors=6), 'knr_6')
analysis.add_algo(KNeighborsRegressor(n_neighbors=7), 'knr_7')
analysis.add_algo(KNeighborsRegressor(n_neighbors=10), 'knr_10')
analysis.add_algo(KNeighborsRegressor(n_neighbors=125), 'knr_125')
analysis.add_algo(linear_model.LinearRegression(),'standard_linreg')
analysis.algos

In [None]:
analysis.delete_input_data('rooms_pophouse_comb')

In [None]:
analysis.add_input_prep("rooms_pophouse_comb",rooms_pophouse_comb,[0,1,2,3,4,5])

In [None]:
analysis.run_all_combis()

In [None]:
res_allcombis = analysis.calculate_error_rates()

In [None]:
with pd.option_context('display.max_rows', 1000, 'display.max_columns', 300):
    display(res_allcombis.sort_values(["R2","RMSE"],ascending=[False,True]))

In [None]:
#analysis.run_analysis('knr1_orig','orig_data','knr_1')

In [None]:
best_r2 = res_allcombis.sort_values("R2",ascending=False).iloc[0:15,:]
#display(best_r2)
lowest_rmse = res_allcombis.sort_values("RMSE").iloc[0:15,:]
#display(lowest_rmse)

### Kalibrierung des Modells

In [None]:
analysis.results["standard_linreg_rooms_pophouse_comb"].mean()

In [None]:
analysis.input_data["rooms_pophouse_comb"]["prep_data"]["test_y"].mean()

# Pickling of the created analysis object

In [None]:
# open the file for writing
# Write the object `analysis` to the file named 'analysis_pickled'.
# The with-block closes the file even if pickle.dump raises.
with open("analysis_pickled", 'wb') as fileObject:
    pickle.dump(analysis, fileObject)

In [None]:
# we open the file for reading
# Load the pickled analysis object back from 'analysis_pickled'.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook wrote itself, never files from an untrusted source.
with open("analysis_pickled", 'rb') as fileObject:
    loaded_analysis = pickle.load(fileObject)

In [None]:
loaded_analysis.algos

# ToDo

In [None]:
# incorporate feature selector for LinReg

#http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html