In [289]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [290]:
df_train = pd.read_csv('train.csv', sep=',')
df_train.head()

Unnamed: 0,house_id,dt,n_bedrooms,n_bathrooms,n_floors,S_above,S_basement,S_living,S_lot,lat,long,year_built,status,price_target
0,101826,2014-07-17,2,1.75,1.5,1740,0,1740,6620,47.526,-121.828,2002,3,350000.0
1,105715,2015-04-21,2,1.0,1.5,1090,0,1090,5265,47.6638,-122.292,1947,4,577000.0
2,118631,2014-09-26,3,2.0,1.0,1310,0,1310,7000,47.303,-122.383,1979,4,196500.0
3,116653,2014-05-02,3,2.5,3.0,1600,170,1770,1235,47.6965,-122.342,2007,3,436110.0
4,119014,2015-03-12,3,3.25,2.0,1090,190,1280,1730,47.7032,-122.36,2005,3,375000.0


In [291]:
df_test = pd.read_csv('test.csv', sep=',')
df_test.head()

Unnamed: 0,house_id,dt,n_bedrooms,n_bathrooms,n_floors,S_above,S_basement,S_living,S_lot,lat,long,year_built,status
0,121076,2015-04-01,3,2.25,1.0,1930,440,2370,38639,47.771,-122.099,1978,3
1,107763,2015-03-30,3,2.5,2.0,2420,920,3340,70131,47.2666,-122.015,1994,3
2,115852,2014-12-12,3,1.0,1.0,1090,0,1090,10296,47.7743,-122.26,1950,4
3,107325,2014-10-15,4,2.5,1.0,1560,880,2440,9350,47.5614,-122.13,1976,4
4,119391,2014-11-13,2,1.5,2.0,840,140,980,1296,47.7075,-122.336,2001,3


In [292]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
house_id        10000 non-null int64
dt              10000 non-null object
n_bedrooms      10000 non-null int64
n_bathrooms     10000 non-null float64
n_floors        10000 non-null float64
S_above         10000 non-null int64
S_basement      10000 non-null int64
S_living        10000 non-null int64
S_lot           10000 non-null int64
lat             10000 non-null float64
long            10000 non-null float64
year_built      10000 non-null int64
status          10000 non-null int64
price_target    10000 non-null float64
dtypes: float64(5), int64(8), object(1)
memory usage: 1.1+ MB


In [293]:
for i in df_train.columns:
    if df_train[i].dtype == object:
        print('\n')
        print(i, " | ",df_train[i].dtype)
        print(df_train[i].value_counts())   
    if df_train[i].dtype == np.int64 or df_train[i].dtype == np.float64: 
        print('\n') 
        print(i, " | ",df_train[i].dtype)
        print('Pearson Correlation and Zero correlation hypothesis:',pearsonr(df_train['price_target'],df_train[i]))  
        if len(set(df_train[i])) < 10:    
            print('====')
            print('Discrete variable (less than 10 distinct values)')            
            print(df_train[i].value_counts())    
            print('====')



house_id  |  int64
Pearson Correlation and Zero correlation hypothesis: (0.0010835210965460971, 0.9137271070778473)


dt  |  object
2014-06-26    72
2014-06-23    70
2015-04-21    67
2015-04-27    67
2014-07-09    62
              ..
2014-11-30     1
2014-12-14     1
2014-08-24     1
2014-07-13     1
2015-02-07     1
Name: dt, Length: 358, dtype: int64


n_bedrooms  |  int64
Pearson Correlation and Zero correlation hypothesis: (0.2935817129836846, 5.648487400945433e-198)


n_bathrooms  |  float64
Pearson Correlation and Zero correlation hypothesis: (0.5108393995926445, 0.0)


n_floors  |  float64
Pearson Correlation and Zero correlation hypothesis: (0.2604509387267565, 9.56841354307679e-155)
====
Discrete variable (less than 10 distinct values)
1.0    4913
2.0    3822
1.5     902
3.0     283
2.5      78
3.5       2
Name: n_floors, dtype: int64
====


S_above  |  int64
Pearson Correlation and Zero correlation hypothesis: (0.6100431794980192, 0.0)


S_basement  |  int64
Pearson Correla

In [294]:
from sklearn.linear_model import Ridge, Lasso
from sklearn import metrics
from sklearn.metrics import mean_squared_log_error
from datetime import datetime

In [295]:
def date_range(start, end, intv):
    from datetime import datetime
    start = datetime.strptime(start,"%Y-%m-%d")
    end = datetime.strptime(end,"%Y-%m-%d")
    diff = (end  - start ) / intv
    for i in range(intv):
        yield (start + diff * i).strftime("%Y-%m-%d")
    yield end.strftime("%Y-%m-%d")

In [296]:
time_split = list(date_range(df_train['dt'].min(), df_train['dt'].max(), 3))
time_split

['2014-05-02', '2014-09-09', '2015-01-17', '2015-05-27']

In [297]:
def normalization(df, col):
    if df_train[col].dtype == np.int64 or df_train[col].dtype == np.float64:         
        avg = df[col].mean()
        stdev = df[col].std()
        df[col] = (df[col]-avg)/stdev   
    #if df_train[col].dtype == object:
    #dt = np.where(datetime(df[col]) <= datetime(time_split[1]), 0, 
    #              np.where(datetime(df[col]) <= datetime(time_split[2]), 1, 2 ))
    
    return df[col]

In [298]:
def clean_df(df_in):
    df_out = df_in.copy()
    for i in df_out.columns:
            df_out[i] = normalization(df_out, i)  
    df_out = df_out.drop('house_id', axis = 1)
    df_out = df_out.drop('dt', axis = 1)    

    return df_out  

In [299]:
df_clean = df_train.pipe(clean_df)

In [300]:
df_clean

Unnamed: 0,n_bedrooms,n_bathrooms,n_floors,S_above,S_basement,S_living,S_lot,lat,long,year_built,status,price_target
0,-1.471264,-0.489556,0.007234,-0.074930,-0.666051,-0.389281,-0.209426,-0.237560,2.733898,1.053898,-0.633285,-0.532913
1,-1.471264,-1.465284,0.007234,-0.852183,-0.666051,-1.096928,-0.241976,0.761407,-0.564338,-0.826799,0.905308,0.090239
2,-0.411657,-0.164313,-0.920182,-0.589113,-0.666051,-0.857417,-0.200298,-1.854176,-1.211190,0.267425,0.905308,-0.954296
3,-0.411657,0.486173,2.789481,-0.242338,-0.282104,-0.356621,-0.338784,0.998463,-0.919751,1.224870,-0.633285,-0.296527
4,-0.411657,1.461902,0.934650,-0.852183,-0.236934,-0.890077,-0.326893,1.047034,-1.047700,1.156481,-0.633285,-0.464284
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.411657,0.160930,0.934650,0.020732,-0.666051,-0.302186,-0.165562,1.174623,-0.166275,0.233230,-0.633285,-0.206238
9996,-0.411657,0.160930,0.934650,0.582746,-0.666051,0.209497,-0.018092,0.207553,0.985264,0.335814,-0.633285,0.002394
9997,-0.411657,-0.164313,0.007234,0.654492,-0.666051,0.274818,6.025010,0.740384,2.278969,0.404203,-0.633285,-0.093687
9998,1.707557,-1.465284,0.934650,-0.409747,-0.666051,-0.694114,-0.223551,-0.520287,-0.493255,-1.031966,0.905308,-0.835018


In [301]:
from sklearn.model_selection import train_test_split, cross_val_score
X = df_clean.loc[:, df_clean.columns != 'price_target']
y = df_clean.loc[:, df_clean.columns == 'price_target']
#X_train, X_test, y_train, y_test = train_test_split(
#    X,
#    y, test_size=0.3, random_state=42)

In [302]:
sorted(metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [303]:
model = Ridge(alpha=1)
np.mean(cross_val_score(model, X, y, scoring = 'neg_mean_squared_error', cv = 15))

-0.38996641301502766

In [304]:
model.fit(X,y)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [305]:
df_clean_test = df_test.pipe(clean_df)
model_output = model.predict(df_clean_test)
#df_clean_test

In [306]:
model_output

array([[ 0.51876606],
       [ 0.30766779],
       [-0.32463054],
       ...,
       [ 0.70363919],
       [-0.46383231],
       [-0.65654307]])

In [307]:
model_output = model_output*df_train['price_target'].std()+df_train['price_target'].mean()

In [221]:
pd.DataFrame(
    zip(
        df_test['house_id'],
        model_output
        ), 
    columns = ['house_id', 'price_target']).to_csv('linear_pred.csv', sep = ',', index = False)

In [227]:
## Without target normalization

In [279]:
def normalization(df, col):
    if (df_train[col].dtype == np.int64 or df_train[col].dtype == np.float64) and df_train[col].name != 'price_target':         
        avg = df[col].mean()
        stdev = df[col].std()
        df[col] = (df[col]-avg)/stdev   
    #if df_train[col].dtype == object:
    #dt = np.where(datetime(df[col]) <= datetime(time_split[1]), 0, 
    #              np.where(datetime(df[col]) <= datetime(time_split[2]), 1, 2 ))
    
    return df[col]

In [280]:
def clean_df(df_in):
    df_out = df_in.copy()
    for i in df_out.columns:
            df_out[i] = normalization(df_out, i)  
    df_out = df_out.drop('house_id', axis = 1)
    df_out = df_out.drop('dt', axis = 1)    

    return df_out  


In [281]:
df_clean = df_train.pipe(clean_df)

In [282]:
from sklearn.model_selection import train_test_split, cross_val_score
X = df_clean.loc[:, df_clean.columns != 'price_target']
y = df_clean.loc[:, df_clean.columns == 'price_target']
#X_train, X_test, y_train, y_test = train_test_split(
#    X,
#    y, test_size=0.3, random_state=42)

In [283]:
model = Lasso(alpha=1)
np.mean(cross_val_score(model, X, y, scoring = 'neg_mean_squared_error', cv = 15))

-51747782625.88517

In [284]:
model.fit(X,y)


Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)

In [285]:
df_clean_test = df_test.pipe(clean_df)
model_output = model.predict(df_clean_test)
#df_clean_test

In [249]:
pd.DataFrame(
    zip(
        df_test['house_id'],
        model_output
        ), 
    columns = ['house_id', 'price_target']).to_csv('linear_pred.csv', sep = ',', index = False)

In [288]:
model_output.min()

-90283.26722590032