In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

#### Submission 1

In [2]:
# Fresh, unfitted baseline estimator for the first submission.
lr = LinearRegression()

# Training frame (contains the SalePrice target) and the frame we predict on.
df_test = pd.read_csv('../data/roman_df_test.csv')
df_train = pd.read_csv('../data/roman_df.csv')

In [3]:
# Target first, then the two predictors used by the baseline model.
y = df_train['SalePrice']
baseline_features = ['Overall Qual', 'Gr Liv Area']
X = df_train[baseline_features]

In [4]:
# Hold out a validation split (scikit-learn's default test_size is 0.25).
# random_state pins the shuffle so the split -- and every score computed from
# it below -- is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Fit the baseline linear model on the training split only.
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [6]:
# Score on the data the model was fitted on (reference point for overfitting).
lr.score(X=X_train, y=y_train)

0.7630773279547887

In [7]:
# Score on the held-out split; compare with the training score above.
lr.score(X=X_test, y=y_test)

0.7076116494890882

In [8]:
# Same two columns, in the same order, as the model was trained on.
submission_features = df_test[['Overall Qual', 'Gr Liv Area']]
preds = lr.predict(submission_features)

In [9]:
# Attach the predictions as a SalePrice column (appended, or overwritten if present).
df_test = df_test.assign(SalePrice=preds)

In [10]:
# Keep only the identifier and the predicted price for the submission file.
submission_columns = ['Id', 'SalePrice']
lr_submission = df_test[submission_columns]

In [11]:
# Index by Id so to_csv() writes Id as the first column without an extra
# positional index. Rebinding the result (instead of inplace=True) avoids
# mutating a frame that was derived by selecting columns from df_test.
lr_submission = lr_submission.set_index('Id')

In [12]:
# Write submission 1; the Id index becomes the first CSV column.
lr_submission.to_csv(path_or_buf='../data/sub_1.csv')

#### Submission 2

In [13]:
# Start Submission 2 from a new estimator and freshly loaded frames, so nothing
# added to df_train / df_test during Submission 1 carries over.
lr = LinearRegression()
df_test = pd.read_csv('../data/roman_df_test.csv')
df_train = pd.read_csv('../data/roman_df.csv')

In [14]:
# Add the same engineered column to both frames so train and test stay aligned.
# NOTE(review): despite its name, 'total_area' is the *product* of the
# 'Gr Liv Area' and 'Total Bsmt SF' columns (an interaction term), not their
# sum -- confirm this is intended.
for frame in (df_train, df_test):
    frame['total_area'] = frame['Gr Liv Area'] * frame['Total Bsmt SF']

In [15]:
# Submission 2 predictors: overall quality plus the engineered 'total_area' column.
X = df_train[['Overall Qual', 'total_area']]
y = df_train['SalePrice']

# random_state pins the shuffle so the train/validation scores are the same on
# every Restart & Run All instead of changing with each execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr.fit(X_train, y_train)

# Last expression: training-split score, shown as the cell output.
lr.score(X_train, y_train)

0.8083633851264382

In [16]:
# Held-out score for the Submission 2 model.
lr.score(X=X_test, y=y_test)

0.8122382898786129

In [17]:
# Predict with the same two columns, in the same order, used for fitting.
preds = lr.predict(df_test[['Overall Qual', 'total_area']])
df_test['SalePrice'] = preds

# Build the submission frame in one expression: select, then index by Id.
# Returning a new frame from set_index() (rather than inplace=True) avoids
# mutating an object derived from a column selection of df_test.
lr_submission = df_test[['Id', 'SalePrice']].set_index('Id')
lr_submission.to_csv('../data/sub_2.csv')

#### Ridge

In [18]:
# Reload both frames from disk for the Ridge experiment.
df_test = pd.read_csv('../data/roman_df_test.csv')
df_train = pd.read_csv('../data/roman_df.csv')

In [19]:
# Build X / y from the frame loaded in this section rather than silently reusing
# the X / y left in the kernel by Submission 2. The columns are the same ones
# that earlier X held ('Overall Qual', 'total_area'); .assign() adds the
# engineered column to the selection only, so df_train itself is not modified.
X = df_train[['Overall Qual']].assign(
    total_area=df_train['Gr Liv Area'] * df_train['Total Bsmt SF']
)
y = df_train['SalePrice']

# random_state pins the shuffle so the scores below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the scaler on the training split only, then reuse its statistics for the
# validation split so no information from held-out rows leaks into scaling.
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [20]:
# Candidate regularisation strengths: 100 log-spaced values from 1e0 to 1e5.
ridge_alphas = np.logspace(0, 5, num=100)
ridge_cv = RidgeCV(alphas=ridge_alphas)

In [21]:
# Fit on the standardised training split; RidgeCV picks alpha from the grid.
ridge_cv.fit(X=Z_train, y=y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.14975700e+0...
       6.89261210e+03, 7.74263683e+03, 8.69749003e+03, 9.77009957e+03,
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]))

In [22]:
# Regularisation strength chosen by RidgeCV from the np.logspace(0, 5, 100) grid.
ridge_cv.alpha_

7.220809018385464

In [23]:
# Print the score for the training split, then for the held-out split.
for Z_part, y_part in ((Z_train, y_train), (Z_test, y_test)):
    print(ridge_cv.score(Z_part, y_part))

0.802528829943782
0.8289467097471821


#### Submission 3: Lasso

In [24]:
# Display the training frame as currently loaded (pandas truncates the rendering
# to the first/last rows and columns) to look for candidate features.
df_train

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice,Bldg Type_2fmCon,Bldg Type_Duplex,Bldg Type_Twnhs,Bldg Type_TwnhsE
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,,0,3,2010,WD,130500.0,0,0,0,0
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,,0,4,2009,WD,220000.0,0,0,0,0
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,,0,1,2010,WD,109000.0,0,0,0,0
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,,0,4,2010,WD,174000.0,0,0,0,0
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,,0,3,2010,WD,138500.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2042,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,,0,1,2008,WD,298751.0,0,0,0,0
2043,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,,0,3,2009,WD,82500.0,0,0,0,0
2044,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,,0,3,2009,WD,177000.0,0,0,0,0
2045,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,,0,11,2009,WD,144000.0,0,0,0,0


In [25]:
# Reload both frames for the Lasso submission.
df_test = pd.read_csv('../data/roman_df_test.csv')
df_train = pd.read_csv('../data/roman_df.csv')

# Encode the test frame's 'Central Air' flag as 1/0; values other than 'Y'/'N'
# become NaN under Series.map.
# NOTE(review): df_train is not re-encoded here -- presumably the saved training
# CSV already stores this column numerically; confirm.
central_air_codes = {'Y': 1, 'N': 0}
df_test['Central Air'] = df_test['Central Air'].map(central_air_codes)

In [26]:
# Recreate the engineered column on both freshly loaded frames.
# NOTE(review): 'total_area' is the product of 'Gr Liv Area' and
# 'Total Bsmt SF', not their sum -- confirm the interaction term is intended.
for frame in (df_train, df_test):
    frame['total_area'] = frame['Gr Liv Area'] * frame['Total Bsmt SF']

In [27]:
# Predictors for the Lasso model, in the column order the scaler will be fitted on.
# NOTE(review): 'Lot Area' is listed twice, so X carries that column in
# duplicate; any test-set selection passed to the fitted scaler must keep this
# same 21-column layout. Remove the duplicate everywhere at once if unintended.
lasso_features = [
    'total_area', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built',
    'Garage Area', 'Garage Cars', 'Gr Liv Area', 'Total Bsmt SF', 'Fireplaces',
    'Central Air', 'Year Remod/Add', 'Full Bath',
    'BsmtFin SF 1', 'Lot Area', 'Bldg Type_2fmCon', 'Bldg Type_Duplex',
    'Bldg Type_Twnhs', 'Bldg Type_TwnhsE', '1st Flr SF', 'TotRms AbvGrd',
]
y = df_train['SalePrice']
X = df_train[lasso_features]

In [28]:
# random_state pins the shuffle so the scores below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the scaler on the training split only; reuse its statistics everywhere else.
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

# Select the submission features from X's own column labels rather than a
# hand-copied list, so the test frame always has exactly the columns (and
# order) the scaler was fitted on and cannot drift when X is edited.
df_test_sc = sc.transform(df_test[list(X.columns)])

In [29]:
# Candidate penalties: 100 log-spaced values from 1e-3 to 1e0.
lasso_alphas = np.logspace(-3, 0, num=100)
lasso_cv = LassoCV(alphas=lasso_alphas)

# Fit on the standardised training split; the fitted estimator is the cell output.
lasso_cv.fit(X=Z_train, y=y_train)

LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]))

In [30]:
# Print the score for the training split, then for the held-out split.
for Z_part, y_part in ((Z_train, y_train), (Z_test, y_test)):
    print(lasso_cv.score(Z_part, y_part))

0.88563080539815
0.877514998976516


In [31]:
# Predict on the test features that were scaled with the training-fitted scaler.
preds = lasso_cv.predict(df_test_sc)
df_test['SalePrice'] = preds

# Select and index in one expression; set_index() returning a new frame
# (rather than inplace=True) avoids mutating an object derived from a column
# selection of df_test.
la_submission = df_test[['Id', 'SalePrice']].set_index('Id')
la_submission.to_csv('../data/sub_3.csv')

In [32]:
# TODO: the last submission has not been submitted yet!