In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # graphical display
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as MSE

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_file = os.path.join(dirname, filename)
        print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1.0 - Import dataset

In [None]:
# Read the raw dataset from the Kaggle input folder and preview its first rows.
DATA_PATH = '../input/real-estate-dataset/data.csv'
all_data = pd.read_csv(DATA_PATH)
all_data.head()

### 1.1 - Separate train and test set. Test set will be used to compute predictions.

In [None]:
# CRIM is the regression target; every other column is a candidate feature.
features = all_data.drop(columns='CRIM')
crim = all_data['CRIM']

# Hold out 21% of the rows as the test set; random_state pins the split.
train_df, test_df, y_train, y_test = train_test_split(
    features, crim, test_size=0.21, random_state=0
)
print(train_df.shape)

In [None]:
# Preview the first rows of the training features (the CRIM target was split off above).
train_df.head()

## 2.0 - Investigate quality and content of data

#### 2.1 - Data Quality

In [None]:
# Print the summary produced by DataFrame.info() for the training features.
train_df.info()

In [None]:
# Report the size of the training feature table.
n_rows, n_cols = train_df.shape
print('Dataset is composed of {} rows and {} columns'.format(n_rows, n_cols))

Column RM contains missing values. This will be addressed later.

#### 2.2 - Data Content

In [None]:
# Plot the distribution of every training column on a grid of subplots.
# The number of rows is derived from the number of columns: with a fixed 2x7 grid,
# zip() would silently drop any column beyond the 14th, and unused axes would be
# drawn as empty frames.
n_plot_cols = 7
n_plot_rows = max(1, int(np.ceil(len(train_df.columns) / n_plot_cols)))
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                         figsize=(20, 3.5 * n_plot_rows), squeeze=False)
flat_axes = axes.flatten()

# NOTE(review): sns.distplot is deprecated in recent seaborn releases (histplot/displot
# are the documented replacements). It is kept here because the installed seaborn
# version is not visible from this notebook -- confirm before migrating.
for col, ax in zip(train_df.columns, flat_axes):
    # kde_kws = {'bw' : 10} manually added to prevent "Selected KDE bandwidth is 0." error
    sns.distplot(train_df[col], ax=ax, kde_kws={'bw': 10})
    ax.set_title(col)

# Hide the axes that are left over when the columns do not fill the grid.
for unused_ax in flat_axes[len(train_df.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 3.0 - Exploratory Data Analysis

### 3.1 - Normal Distribution

#### 3.1.1 - Let's first evaluate the R^2 value between each column and the CRIM target variable. Does a log transform provide a better relationship?

In [None]:
lr = LinearRegression()
candidate_cols = train_df.columns.drop(['CHAS'])


def _single_feature_r2(values):
    """Fit `lr` on one feature (reshaped to a column vector) and return its R^2 on the training data."""
    column = values.reshape(-1, 1)
    return lr.fit(column, y_train).score(column, y_train)


# One record per feature: R^2 of the raw values and R^2 of log1p(values),
# both computed after filling missing values with the column median.
records = []
for col in candidate_cols:
    X = train_df[col].fillna(value=train_df[col].median()).values
    records.append({
        'simple': _single_feature_r2(X),
        'log': _single_feature_r2(np.log1p(X)),
        'features': col,
    })

# Best raw-feature fits first.
linMod = pd.DataFrame(records).sort_values(by='simple', ascending=False)

#### 3.1.2 - Graphically show R^2 results

In [None]:
# Compare, feature by feature, the R^2 of the raw fit against the R^2 of the log1p fit.
positions = np.arange(linMod.shape[0])
series_to_plot = [
    ('simple', 'C0', 'Simple model'),
    ('log', 'C1', 'log1p(feature)'),
]
for column, colour, legend_label in series_to_plot:
    plt.scatter(positions, linMod[column], color=colour, alpha=0.5, s=75, label=legend_label)

plt.xticks(positions, linMod['features'], rotation=90)
plt.ylabel('R^2 score')
plt.legend()
plt.title('Comparison between [x vs. y] and [log1p(x) vs. y]')
plt.show()

In [None]:
# Keep only the features whose log1p fit scored a higher R^2 than the raw fit.
log_is_better = linMod['log'] > linMod['simple']
col_lg = linMod.loc[log_is_better, 'features']
col_lg

#### 3.1.3 - Confirm results

In [None]:
# Build three versions of the training features so that the MSE of a linear model
# can be compared across them (CHAS is left out, as in the R^2 study above).
feature_cols = train_df.columns.drop(['CHAS'])

# Median-impute every column once and reuse the result for all three variants.
# The filled column is assigned back instead of calling
# `df[c].fillna(..., inplace=True)`: that chained in-place form acts on a temporary
# object and does not update the frame under pandas copy-on-write.
df_imputed = train_df[feature_cols].copy()
for c in feature_cols:
    df_imputed[c] = df_imputed[c].fillna(df_imputed[c].median())

# Take the columns "as they are"
df_simple = df_imputed.copy()

# Take the log1p(x) only for the columns where the log1p(x) has a higher R^2 score
df_log = df_imputed.copy()
for c in col_lg:
    df_log[c] = np.log1p(df_log[c])

# Take the log1p(x) for all columns
df_alllog = np.log1p(df_imputed)


# Scale the features. The same scaler object is refitted for each variant;
# only the transformed arrays are kept.
scaler = StandardScaler()
X_s = scaler.fit_transform(df_simple.values)
X_l = scaler.fit_transform(df_log.values)
X_al = scaler.fit_transform(df_alllog.values)

In [None]:
lr = LinearRegression()

# Fit the same estimator on each feature variant in turn and report its training MSE.
variants = [
    ('MSE for simple model: {:.2f}', X_s),
    ('MSE for log model: {:.2f}', X_l),
    ('MSE for ALL log model: {:.2f}', X_al),
]
for message, features_scaled in variants:
    lr.fit(features_scaled, y_train)
    print(message.format(MSE(y_train, lr.predict(features_scaled))))


### 3.2 - Feature engineering

#### 3.2.1 - Column ZN contains several zeros: add binary indicator columns for ZN, RAD and TAX

In [None]:
# I add a binary column for the features RAD, ZN and TAX

cont_col = train_df.drop(columns=['CHAS']).columns

# Median-impute the continuous columns. The filled column is assigned back because
# `df[c].fillna(..., inplace=True)` is chained in-place assignment, which does not
# update the frame under pandas copy-on-write.
df_log = train_df[cont_col].copy()
for c in cont_col:
    df_log[c] = df_log[c].fillna(df_log[c].median())

# The thresholds are expressed on the original scale of each feature, so the flags are
# computed before any log transform. `col_lg` is data-driven: if RAD or TAX were ever
# selected for log1p, `> 20` / `> 600` applied afterwards would never be true.
zn_binary = (df_log['ZN'] > 0).astype('int64')
rad_binary = (df_log['RAD'] > 20).astype('int64')
tax_binary = (df_log['TAX'] > 600).astype('int64')

# Take the log1p(x) only for the columns where the log1p(x) has a higher R^2 score
for c in col_lg:
    df_log[c] = np.log1p(df_log[c])

df_log['ZN_binary'] = zn_binary
df_log['RAD_binary'] = rad_binary
df_log['TAX_binary'] = tax_binary

X_l = scaler.fit_transform(df_log.values)

#log model
lr.fit(X_l, y_train)
print('MSE for log model after binary addition: {:.2f}'.format(MSE(y_train, lr.predict(X_l))))

#### 3.2.2 - Add Polynomial Features

In [None]:
# Add polynomial features to continuous columns: one new column per (column, exponent) pair.
powers = [0.5, 2, 3]
for c in cont_col:
    base = df_log[c]
    for d in powers:
        df_log['{}**{}'.format(c, d)] = base ** d

X_l = scaler.fit_transform(df_log.values)

# Refit on the enlarged feature matrix and report the training MSE.
lr.fit(X_l, y_train)
print('MSE for log model after polynomial feature: {:.2f}'.format(MSE(y_train, lr.predict(X_l))))

## 4.0 Create Preprocessing function

In [None]:
def preprocess(df1, df2, log_cols=None, powers=(0.5, 2, 3)):
    """Return a preprocessed copy of `df1`, using statistics computed on `df2`.

    Steps, applied in this order:
      1. every column of `df1` except CHAS is median-imputed with the median of the
         same column in `df2`;
      2. the flags ZN_binary (ZN > 0), RAD_binary (RAD > 20) and TAX_binary (TAX > 600)
         are computed on the imputed, untransformed values;
      3. the columns listed in `log_cols` are replaced by `log1p` of their values;
      4. for every non-CHAS input column, a `<col>**<power>` column is added for each
         entry of `powers` (computed on the values produced by steps 1 and 3);
      5. the frame is passed through `pd.get_dummies(..., dummy_na=False)`.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to transform. It is not modified; a copy is returned.
    df2 : pandas.DataFrame
        Reference frame that supplies the imputation medians.
    log_cols : iterable of str, optional
        Columns to log1p-transform. Defaults to ['MEDV', 'NOX', 'DIS', 'RM', 'ZN'].
    powers : iterable of numbers, optional
        Exponents of the polynomial features. Defaults to (0.5, 2, 3).

    Returns
    -------
    pandas.DataFrame
        The transformed copy of `df1`.
    """
    df1 = df1.copy() #work on a copy

    #set column names
    cont_col = df1.drop(columns=['CHAS']).columns
    col_lg = ['MEDV', 'NOX', 'DIS', 'RM', 'ZN'] if log_cols is None else list(log_cols)

    # Impute by assigning the filled column back. `df1[c].fillna(..., inplace=True)` is
    # chained in-place assignment: under pandas copy-on-write it fills a temporary object
    # and leaves df1 untouched.
    for c in cont_col:
        df1[c] = df1[c].fillna(df2[c].median())

    #Feature engineering
    # Flags are taken before the log transform so the thresholds always apply to the
    # original scale, whichever columns end up in `col_lg`.
    df1['ZN_binary'] = (df1['ZN'] > 0).astype('int64')
    df1['RAD_binary'] = (df1['RAD'] > 20).astype('int64')
    df1['TAX_binary'] = (df1['TAX'] > 600).astype('int64')

    #compute log transform
    for c in col_lg:
        df1[c] = np.log1p(df1[c])

    #Polynomial features
    for c in cont_col:
        for d in powers:
            df1['{}**{}'.format(c, d)] = df1[c] ** d

    #One-Hot Encoding
    df1 = pd.get_dummies(df1, dummy_na=False)

    return df1

## 5.0 - Models

### 5.1 Preprocess the data

In [None]:
# Both frames are preprocessed with train_df as the reference frame (second argument).
train_df_preprocessed = preprocess(train_df, train_df)

# Align the test columns to the training layout: same columns in the same order,
# with any column absent from the test frame created and filled with 0.
test_df_preprocessed = preprocess(test_df, train_df).reindex(
    columns=train_df_preprocessed.columns, fill_value=0
)

### 5.2 Linear Regression Model

In [None]:
# Fit the scaler on the training features only, then apply it to both sets.
scaler = StandardScaler()
scaler.fit(train_df_preprocessed)
train_df_preproc_scaled = scaler.transform(train_df_preprocessed)
test_df_preproc_scaled = scaler.transform(test_df_preprocessed)

In [None]:
linreg = LinearRegression() #creates the object
linreg.fit(train_df_preproc_scaled, y_train) #fit the model using the train data rescaled

# The metric computed here is the mean squared error on the test set, so it is stored
# under an MSE name.
mse_model1 = MSE(y_test, linreg.predict(test_df_preproc_scaled))
mae_model1 = mse_model1  # legacy alias: the old name said "mae" although the value is an MSE
print('MSE Linear Regression: {:.6f}'.format(mse_model1))

Or alternatively:

In [None]:
#Create pipeline object
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('linreg', LinearRegression())  # step renamed from 'logreg': this is linear, not logistic, regression
])

# Nothing to tune for plain linear regression: with an empty grid the search
# cross-validates the single pipeline configuration.
grid = {}

# 5-fold cross-validation on shuffled rows. random_state is fixed so the folds,
# and therefore the cross-validation scores, are the same on every run.
grid_cv = GridSearchCV(
    pipe,
    grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
# Fit the cross-validated pipeline on the preprocessed training features.
grid_cv.fit(train_df_preprocessed, y_train)

In [None]:
# Predict the held-out test set with the fitted search object and report the MSE.
linreg_predictions = grid_cv.predict(test_df_preprocessed)
linreg_test_mse = MSE(y_test, linreg_predictions)
print('MSE on test set using Linear Regression: {:.2f}'.format(linreg_test_mse))

### 5.3 Ridge Regression
#### 5.3.1 Combine Grid search and Cross validation

In [None]:
# Standardise the features, then fit a ridge regression. alpha=1 is only the
# constructor value; the grid below supplies the alphas that are actually evaluated.
ridge_steps = [
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1)),
]
pipe = Pipeline(ridge_steps)

# Candidate regularisation strengths: 100 values log-spaced between 1e-1 and 1e5.
alphas = np.logspace(-1, 5, num=100)
grid = {'ridge__alpha': alphas}

# 5-fold cross-validation; no shuffle argument is passed to KFold here.
grid_cv = GridSearchCV(
    pipe,
    grid,
    cv=KFold(n_splits=5),
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
# Run the grid search over the ridge alphas on the preprocessed training features.
grid_cv.fit(train_df_preprocessed, y_train)

In [None]:
# Compute predictions on the test set with the fitted search object and report the MSE.
ridge_predictions = grid_cv.predict(test_df_preprocessed)
ridge_test_mse = MSE(y_test, ridge_predictions)
print('MSE on test set using Ridge Regression: {:.2f}'.format(ridge_test_mse))

#### 5.3.2 Extract best parameters and compute predictions on full dataset

In [None]:
# `best_alpha` holds the whole best-parameter mapping of the search, not a number;
# the selected alpha is read from it under the 'ridge__alpha' key.
best_alpha = grid_cv.best_params_
best_alpha['ridge__alpha']

#### 5.3.3 Retrain the model using the full dataset

In [None]:
# Rebuild features and target from the complete dataset (no train/test split).
X = all_data.drop(columns='CRIM')
target = all_data['CRIM'].values

# The full frame serves as its own reference frame for preprocessing.
X_proc = preprocess(X, X)

# Fresh scaler fitted on the full preprocessed feature matrix.
scaler = StandardScaler()
X_proc_scaled = scaler.fit_transform(X_proc.values)

In [None]:
# Refit ridge on the complete dataset with the alpha selected by the grid search.
ridge = Ridge(alpha=best_alpha['ridge__alpha'])
ridge.fit(X_proc_scaled, target)

# The predictions are made on the very rows used for fitting, so the score printed
# below is an in-sample error on the full dataset, not a test-set error.
ridge_predictions = ridge.predict(X_proc_scaled)
print('MSE on full dataset (in-sample) using Ridge Regression: {:.2f}'.format(MSE(target, ridge_predictions)))

In [None]:
# Put the true CRIM values next to the absolute error of the ridge predictions.
# Note: the predicted values themselves are not stored as a column.
true_crim = all_data["CRIM"]
pred_df = pd.DataFrame({
    "CRIM": true_crim,
    "abs_err": (true_crim - ridge_predictions).abs(),
})
pred_df.head()

In [None]:
# Save the table as a CSV file; the relative path resolves against the current working directory.
pred_df.to_csv('Predictions_ridge.csv')