In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Load the two input files: `df` is the training data (it carries the
# SalePrice column) and `kaggle` is the set we must predict for (no SalePrice).
df, kaggle = (
    pd.read_csv(csv_path)
    for csv_path in ('./Data/train (2).csv', './Data/test (2).csv')
)

In [3]:
#set(df.columns ^ kaggle.columns)

In [4]:
# Preview the first five rows of the training frame.
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
# Preview the first five rows of the Kaggle frame (same layout, no SalePrice).
kaggle.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


## Obtaining the Numerical Features

In [6]:
# Names of the numeric predictor columns in each frame.
# `SalePrice` is excluded because it is the target (used as `y` further down);
# `Id` and `PID` are excluded as well (presumably row identifiers -- confirm).
#
# Uses the public `select_dtypes(include='number')` API rather than the
# private `DataFrame._get_numeric_data()`, which is not part of pandas'
# documented interface and may change without notice.
# NOTE(review): unlike the private helper, 'number' does not select boolean
# columns; the previews of these CSVs do not show any, but confirm if the
# input data changes.
df_numerical_features = list(
    df.select_dtypes(include='number')
      .drop(columns=['Id', 'PID', 'SalePrice'])
      .columns
)
kaggle_numerical_features = list(
    kaggle.select_dtypes(include='number')
          .drop(columns=['Id', 'PID'])
          .columns
)

In [7]:
# Preview the numeric feature columns of both frames.
# Only the last expression of a cell is rendered automatically, so the
# training preview has to be shown explicitly -- as a bare mid-cell expression
# it was computed and then silently discarded.
display(df[df_numerical_features].head())
kaggle[kaggle_numerical_features].head()

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,190,69.0,9142,6,8,1910,1950,0.0,0.0,0.0,...,440.0,0,60,112,0,0,0,0,4,2006
1,90,,9662,5,4,1977,1977,0.0,0.0,0.0,...,580.0,170,0,0,0,0,0,0,8,2006
2,60,58.0,17104,7,5,2006,2006,0.0,554.0,0.0,...,426.0,100,24,0,0,0,0,0,9,2006
3,30,60.0,8520,5,6,1923,2006,0.0,0.0,0.0,...,480.0,0,0,184,0,0,0,0,7,2007
4,20,,9500,6,5,1963,1963,247.0,609.0,0.0,...,514.0,0,76,0,0,185,0,0,7,2009


In [8]:
# Replace every missing value in the training frame's numeric features with 0.
df[df_numerical_features] = df.loc[:, df_numerical_features].fillna(value=0)

In [9]:
# Same treatment for the Kaggle frame: missing numeric values become 0.
kaggle[kaggle_numerical_features] = kaggle.loc[:, kaggle_numerical_features].fillna(value=0)

In [10]:
#df[df_numerical_features].shape
#kaggle[kaggle_numerical_features].shape
#X_num.isnull().sum()

## Obtaining Categorical Features

In [11]:
#df.columns

In [12]:
# Columns stored with object dtype are treated as the categorical features.
df_categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
kaggle_categorical_columns = kaggle.select_dtypes(include=['object']).columns.tolist()

In [13]:
# Missing categorical entries are replaced with the literal string 'N/A', so
# "missing" becomes an ordinary level of its own in each column.
df[df_categorical_columns] = df.loc[:, df_categorical_columns].fillna(value='N/A')
kaggle[kaggle_categorical_columns] = kaggle.loc[:, kaggle_categorical_columns].fillna(value='N/A')

In [14]:
# Give each categorical column the same explicit category list in BOTH frames,
# so that pd.get_dummies later produces identical dummy columns for the
# training and Kaggle data even when a level appears in only one file.
for col in df_categorical_columns:
    df_values = sorted(df[col].unique())
    kaggle_values = sorted(kaggle[col].unique())
    # Sorted union instead of a bare set: the iteration order of a set of
    # strings can differ from one interpreter run to the next, which made the
    # category order (and therefore the dummy-column order of the feature
    # matrix) non-reproducible across kernel restarts.
    categories = sorted(set(df_values) | set(kaggle_values))
    df[col] = pd.Categorical(df[col], categories=categories)
    kaggle[col] = pd.Categorical(kaggle[col], categories=categories)

In [15]:
# One-hot encode the categorical columns of each frame.
df_dummies, kaggle_dummies = (
    pd.get_dummies(frame[columns])
    for frame, columns in (
        (df, df_categorical_columns),
        (kaggle, kaggle_categorical_columns),
    )
)

In [16]:
# (rows, dummy columns) for the training frame.
df_dummies.shape

(2051, 278)

In [17]:
# (rows, dummy columns) for the Kaggle frame; the column count should match
# the training frame's.
kaggle_dummies.shape

(879, 278)

## Merging both categorical and numerical features

In [18]:
# Assemble the full design matrices: dummy columns first, numeric columns
# appended to their right.
X_df = pd.concat(
    [df_dummies, df[df_numerical_features]],
    axis='columns',
)
X_kaggle = pd.concat(
    [kaggle_dummies, kaggle[kaggle_numerical_features]],
    axis='columns',
)

In [19]:
# Feature matrix and target vector used for modelling.
X, y = X_df, df['SalePrice']
X.shape

(2051, 314)

## Splitting the training set

In [20]:
# Hold out part of the labelled data for evaluation; the fixed random_state
# keeps the split the same on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,
)

In [21]:
# (rows, columns) of the training portion of the split.
X_train.shape

(1538, 314)

In [22]:
# The target vector should have the same number of rows as X_train.
y_train.shape

(1538,)

## Creating Models

In [23]:
# pipe = Pipeline([
#     ('ss', StandardScaler()),
#     ('lr', LinearRegression())
# ])

In [24]:
# Modelling pipeline: polynomial feature expansion -> standardisation ->
# cross-validated Lasso.
# NOTE(review): PolynomialFeatures is applied to the full feature matrix
# (314 columns, per the X.shape output), so the expanded matrix is very wide
# and this pipeline is likely slow to fit -- confirm this is intended.
pipe = Pipeline(
    steps=[
        ('pf', PolynomialFeatures()),
        ('ss', StandardScaler()),
        ('lcv', LassoCV(n_alphas=500, max_iter=1000)),
    ]
)

In [25]:
# Hyper-parameter grid for the search: every combination of the scaler's
# centring (with_mean) and scaling (with_std) switches. Keys use the
# '<step name>__<parameter>' form to address the 'ss' pipeline step.
params_grid = dict(
    ss__with_mean=[True, False],
    ss__with_std=[True, False],
    # 'lr__fit_intercept': [True, False]
)

In [None]:
# Grid search over params_grid with 3-fold cross-validation, using all
# available cores (n_jobs=-1) and verbose progress output.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=params_grid,
    cv=3,
    n_jobs=-1,
    verbose=5,
)

In [None]:
# Run the grid search on the training split.
gs.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [None]:
# using only numerical values, StandardScaler and Lasso


# print(gs.score(X_train, y_train))
# print(gs.score(X_test, y_test))
# 0.8116566871090924
# 0.8553893937154086

# gs.best_score_
#0.7580631750172739

# gs.best_params_
#{'ss__with_mean': False, 'ss__with_std': True}

# pred = gs.predict(kaggle[numerical_features])

# kaggle['SalePrice'] = pred

# kaggle[['Id', 'SalePrice']].to_csv('kaggleNum', index=False)

In [None]:
# using only numerical values, StandardScaler and LinearRegression
# print(gs.score(X_train, y_train))
# print(gs.score(X_test, y_test))
# # 0.8304063761949465
# # 0.8642925481095182

# # gs.best_score_
# # 0.7555992236039646

# gs.best_params_
# {'lr__fit_intercept': True, 'ss__with_mean': False, 'ss__with_std': True}

# pred = gs.predict(kaggle[numerical_features])

# kaggle['SalePrice'] = pred

# kaggle[['Id', 'SalePrice']].to_csv('kaggleNum2', index=False)

In [None]:
# using both numerical and categorical values, StandardScaler and Lasso
# Score of the refit best estimator on the training split, then on the
# held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(gs.score(features, target))

In [None]:
# Best mean cross-validated score found by the grid search.
gs.best_score_

In [None]:
# Training matrix shape, for comparison with the Kaggle matrix below.
X_train.shape

In [None]:
# Show the Kaggle matrix shape. As a bare expression in the middle of a cell
# it was never rendered, so the intended comparison with X_train.shape was
# not actually visible.
display(X_kaggle.shape)

# Fail fast if the Kaggle feature matrix does not have exactly the training
# columns in the same order: the two column lists are built independently
# above, and a mismatch would otherwise surface as a confusing predict error
# or as silently wrong predictions.
assert list(X_kaggle.columns) == list(X_train.columns), (
    "X_kaggle columns do not match the columns the model was trained on"
)

pred = gs.predict(X_kaggle)

# NOTE: this adds a numeric SalePrice column to `kaggle`. Re-running the
# numeric-feature cell after this one would pick it up as a feature, because
# only Id and PID are dropped there.
kaggle['SalePrice'] = pred

# Write the submission file (Id + predicted SalePrice, no index column).
kaggle[['Id', 'SalePrice']].to_csv('kaggleNum6', index=False)