In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [3]:
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

In [4]:
for column in train_df.columns:
    if column in test_df.columns or column=='SalePrice':
        train_df[column] = train_df[column].fillna(train_df[column].median())
    else:
        train_df = train_df.drop(column, axis=1)
for column in test_df.columns:
    if column in train_df.columns:
        test_df[column] = test_df[column].fillna(test_df[column].median())
    else:
        test_df = test_df.drop(column, axis=1)

In [5]:
X_train = train_df.drop("SalePrice", axis=1)
Y_train = train_df["SalePrice"].values
X_test = test_df.copy()
X_train.shape, Y_train.shape, X_test.shape

((1460, 271), (1460,), (1459, 271))

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_cross, y_train, y_cross = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

In [7]:
X_train.shape, Y_train.shape, X_test.shape, X_cross.shape, y_cross.shape

((1168, 271), (1460,), (1459, 271), (292, 271), (292,))

In [8]:
from sklearn.model_selection import cross_val_score

In [9]:
from sklearn.metrics import mean_squared_error, make_scorer
# Define error measure for official scoring : RMSE
scorer = make_scorer(mean_squared_error, greater_is_better = False)

def rmse_cv_train(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring = scorer, cv = 10))
    return(rmse)

def rmse_cv_test(model):
    rmse= np.sqrt(-cross_val_score(model, X_cross, y_cross, scoring = scorer, cv = 10))
    return(rmse)

In [10]:
#Linear Regression

linreg = LinearRegression()
linreg.fit(X_train, y_train)
#Y_pred = linreg.predict(X_cross)
#acc_log = round(linreg.score(X_train, Y_train) * 100, 2)
print("RMSE on Training set :", rmse_cv_train(linreg).mean())
print("RMSE on Test set :", rmse_cv_test(linreg).mean())
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_cross)

RMSE on Training set : 41441.2281523
RMSE on Test set : 77009.1702948


In [11]:
from sklearn.linear_model import LinearRegression, RidgeCV
ridge = RidgeCV(alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)

print("Try again for more precision with alphas centered around " + str(alpha))
ridge = RidgeCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, 
                          alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                          alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4], 
                cv = 10)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)

print("Ridge RMSE on Training set :", rmse_cv_train(ridge).mean())
print("Ridge RMSE on Test set :", rmse_cv_test(ridge).mean())
y_train_rdg = ridge.predict(X_train)
y_test_rdg = ridge.predict(X_cross)

Best alpha : 10.0
Try again for more precision with alphas centered around 10.0
Best alpha : 14.0
Ridge RMSE on Training set : 31146.6975844
Ridge RMSE on Test set : 32791.8047278


In [12]:
y_final = ridge.predict(X_test)

In [13]:
y_final

array([ 102557.62915583,  141693.24917772,  171443.73665521, ...,
        152043.85777631,  100729.49694577,  224364.16786446])

In [None]:
submission = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": y_final
    })
submission.to_csv('./submission2.csv', index=False)

In [None]:
from sklearn.linear_model import LinearRegression, LassoCV
lasso = LassoCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 
                          0.3, 0.6, 1], 
                max_iter = 50000, cv = 10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)

print("Try again for more precision with alphas centered around " + str(alpha))
lasso = LassoCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, 
                          alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, 
                          alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, 
                          alpha * 1.4], 
                max_iter = 50000, cv = 10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)

print("Lasso RMSE on Training set :", rmse_cv_train(lasso).mean())
print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean())
y_train_las = lasso.predict(X_train)
y_test_las = lasso.predict(X_cross)

Best alpha : 1.0
Try again for more precision with alphas centered around 1.0
Best alpha : 1.4
Lasso RMSE on Training set : 40928.685166




In [None]:
y_final2 = lasso.predict(X_test)
y_final2

In [None]:
submission = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": y_final2
    })
submission.to_csv('./submission3.csv', index=False)