In [1]:
# Importing cupy, cudf and cuml
import cupy as cupy
import cudf as cudf
import cuml as cuml

In [2]:
# Reading the Kaggle House Prices training and test CSV files into cuDF DataFrames
train_csv='../input/house-prices-advanced-regression-techniques/train.csv'
test_csv='../input/house-prices-advanced-regression-techniques/test.csv'
df_train=cudf.read_csv(train_csv)
df_test=cudf.read_csv(test_csv)

In [3]:
# Splitting df_train into the feature frame (X_train) and the target series (Y_train).
# The target is selected by name rather than by position so the split does not
# silently pick the wrong column if the column order of the CSV ever changes.
# 'SalePrice' is the column this notebook predicts and writes to the submission file.
X_train=df_train.drop(['SalePrice'],axis=1)
Y_train=df_train['SalePrice']

In [4]:
# Printing the summary that DataFrame.info() reports for the feature frame
X_train.info()

In [5]:
# Number of missing values per column, showing the 40 columns with the most NaNs first
null_counts=X_train.isnull().sum()
null_counts.sort_values(ascending=False).head(40)

In [6]:
# Dropping the 'Id' column together with five other columns
# (the original note says these have a large number of NaN values)
unused_cols=['Id','PoolQC','MiscFeature','Alley','Fence','FireplaceQu']
X_train=X_train.drop(unused_cols,axis=1)

In [7]:
# Filling the NaN values: object (categorical) columns get the column mode,
# every other column gets the column mean.
# The filled column is assigned back explicitly instead of calling
# fillna(inplace=True) on the X_train[col] selection: an in-place fill on a
# selected column is only reflected in X_train if that selection shares state
# with the frame, which is not something this cell should depend on.
for col in X_train.columns:
    if X_train[col].dtype=='O': # if col contains object values
        fill_value=X_train[col].mode()[0]
    else:
        fill_value=X_train[col].mean()
    X_train[col]=X_train[col].fillna(fill_value)
# Show only the first rows instead of dumping the whole frame
X_train.head()

In [8]:
# Re-counting NaNs per column after imputation (top 50 columns, largest count first)
remaining_nulls=X_train.isnull().sum()
remaining_nulls.sort_values(ascending=False).head(50)

In [9]:
# Replacing every object (categorical) column of X_train with integer codes.
# A single LabelEncoder instance is re-fitted for each column, so after the loop
# `le` only holds the fit of the last column in obj_cols.
from cuml.preprocessing import LabelEncoder
le=LabelEncoder()
obj_cols=list(X_train.select_dtypes(['object']).columns)
for col in obj_cols:
    encoded=le.fit_transform(X_train[col])
    X_train[col]=encoded
X_train

In [10]:
# Test Data Set
# NOTE: this binds X_test to the same object as df_test (no copy is made here).
X_test=df_test
# Displaying the shape of the test frame
X_test.shape

In [11]:
# Removing from the test set the same columns that were removed from X_train
unused_cols=['Id','PoolQC','MiscFeature','Alley','Fence','FireplaceQu']
X_test=X_test.drop(unused_cols,axis=1)

In [12]:
# Replacing NaN values in the test set: mode for object (categorical) columns,
# mean for every other column (both computed on the test column itself).
# The filled column is assigned back explicitly instead of calling
# fillna(inplace=True) on the X_test[col] selection, so the result does not
# depend on whether that selection shares state with the frame.
for col in X_test.columns:
    if X_test[col].dtype=='O':
        fill_value=X_test[col].mode()[0]
    else:
        fill_value=X_test[col].mean()
    X_test[col]=X_test[col].fillna(fill_value)
# Show only the first rows instead of dumping the whole frame
X_test.head()

In [13]:
# Changing categorical variables to numerical using the TRAINING categories.
# Calling fit_transform on the test column would build a new category -> code
# mapping from the test data alone, so the same category could receive a
# different integer than it did in X_train and the model would see shifted codes.
# X_train already holds integer codes at this point, so each encoder is fitted
# on the raw training column from df_train. Its non-null categories are the same
# set that was encoded for X_train, because NaNs there were filled with the mode,
# which is an existing category. This assumes LabelEncoder derives its codes
# deterministically from the set of fitted categories.
# NOTE: with the default settings, a test category never seen in training makes
# transform() raise instead of silently receiving an arbitrary code.
for col in obj_cols:
    le=LabelEncoder()
    le.fit(df_train[col].dropna())
    X_test[col]=le.transform(X_test[col])
# Show only the first rows instead of dumping the whole frame
X_test.head()

In [14]:
# Standardizing the X_train and X_test values.
# The scaler is fitted on the training features only and the fitted statistics
# are reused for the test features. Re-fitting on X_test would scale the two
# sets with different parameters, so the model trained on X_train_scaled would
# receive test inputs on a different scale.
from cuml.preprocessing import StandardScaler
ss=StandardScaler()
X_train_scaled=ss.fit_transform(X_train)
X_test_scaled=ss.transform(X_test)

In [15]:
# Displaying the scaled training features
X_train_scaled

In [16]:
# Heatmap of the pairwise correlations between the scaled training features.
# The correlation frame is converted with to_pandas() before plotting:
# as_matrix() is a legacy cuDF call, and a pandas DataFrame also keeps the
# column names so seaborn can label both axes.
import seaborn as sns
plot=sns.heatmap(X_train_scaled.corr().to_pandas())

In [18]:
# Holding out part of the scaled training data for evaluation:
# train_size=0.7 with a fixed random_state so the split is repeatable
from cuml.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_train_scaled,
    Y_train,
    train_size=0.7,
    random_state=42,
)

In [19]:
# Linear Regression: fit one model per solver algorithm and print its
# MSE, R2 and MAE on the held-out split
from cuml.linear_model import LinearRegression
algorithms = ['svd', 'eig', 'qr', 'svd-qr', 'svd-jacobi']
y_true = y_test1.astype('float64')  # cast the hold-out target once for all metrics
reg_metrics = cuml.metrics.regression
for algo in algorithms:
    print("Algorithm:",algo)
    lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = algo)
    reg = lr.fit(X_train1,y_train1)
    preds = lr.predict(X_test1)
    for label, metric in (
        ("MSE:", reg_metrics.mean_squared_error),
        ("R2 Score:", reg_metrics.r2_score),
        ("MAE:", reg_metrics.mean_absolute_error),
    ):
        print(label,metric(y_true,preds))
    print("\n")

In [20]:
# Ridge Regression: fit one model per solver ('svd' and 'eig') with alpha=10
# and print its MSE, R2 and MAE on the held-out split
from cuml import Ridge
alpha=cupy.array([10])
Ridge_algos=['svd','eig']
y_true = y_test1.astype('float64')  # cast the hold-out target once for all metrics
reg_metrics = cuml.metrics.regression
for algo in Ridge_algos:
    print("Algorithm:",algo)
    ridge = Ridge(alpha=alpha, fit_intercept=True, normalize=False, solver=algo)
    ridge_res = ridge.fit(X_train1,y_train1)
    preds=ridge_res.predict(X_test1)
    for label, metric in (
        ("MSE:", reg_metrics.mean_squared_error),
        ("R2 Score:", reg_metrics.r2_score),
        ("MAE:", reg_metrics.mean_absolute_error),
    ):
        print(label,metric(y_true,preds))
    print("\n")

In [21]:
# Lasso regression with alpha=0.1, evaluated on the held-out split
# (note: this rebinds the notebook-level name `alpha` to a plain float)
from cuml.linear_model import Lasso
alpha=0.1
lasso = Lasso(alpha=alpha)
lasso_res = lasso.fit(X_train1,y_train1)
preds = lasso_res.predict(X_test1)
y_true = y_test1.astype('float64')  # cast the hold-out target once for all metrics
reg_metrics = cuml.metrics.regression
for label, metric in (
    ("MSE:", reg_metrics.mean_squared_error),
    ("R2 Score:", reg_metrics.r2_score),
    ("MAE:", reg_metrics.mean_absolute_error),
):
    print(label,metric(y_true,preds))
print("\n")

In [27]:
# Final model: Linear Regression ('svd' algorithm) fitted on the full scaled
# training set, then used to predict SalePrice for the scaled test set
lr = LinearRegression(
    fit_intercept = True,
    normalize = False,
    algorithm = 'svd',
)
reg_res = lr.fit(X_train_scaled,Y_train)
reg_pred = reg_res.predict(X_test_scaled)

In [28]:
# Assembling the submission frame: the test 'Id' column next to the predicted 'SalePrice'
df_ans=cudf.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': reg_pred,
})
df_ans

In [26]:
# Writing the submission frame to a CSV file in the working directory,
# without the index column
filename='./submission.csv'
df_ans.to_csv(filename,index=False)