## Load in Data

In [None]:
import cudf

# All three competition files live in the same Kaggle input directory. Build every
# path from one absolute root so the loads do not depend on the kernel's working
# directory (the sample-submission path used to be relative: '../input/...').
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2021"

train = cudf.read_csv(f"{DATA_DIR}/train.csv")
test = cudf.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = cudf.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Peek at three randomly sampled rows of the training frame.
train.sample(3)

In [None]:
# Dimensions of the training frame.
train.shape

In [None]:
# Peek at three randomly sampled rows of the test frame.
test.sample(3)

In [None]:
# Dimensions of the test frame.
test.shape

In [None]:
# Peek at three randomly sampled rows of the sample submission to see the expected output format.
sample_submission.sample(3)

In [None]:
# Dimensions of the sample submission frame.
sample_submission.shape

### About the training data

In [None]:
# Print the frame-level summary (info()) of the training data.
train.info()

In [None]:
# Summary statistics of the training frame using describe()'s default column selection.
train.describe()

In [None]:
# Same summary, but with numeric columns excluded so only the non-numeric columns are described.
train.describe(exclude='number')

## Baseline Model

In [None]:
from cuml.preprocessing import train_test_split

# Separate the label from the features, then hold out part of the rows for evaluation.
y = train['target']
X = train.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,  # fixed seed so the split is repeatable
)

In [None]:
import numpy as np


def baseline_model(n_preds, pred):
    """Build a constant-prediction baseline.

    Parameters
    ----------
    n_preds : int
        Number of predictions to produce.
    pred : scalar
        The single value repeated for every prediction.

    Returns
    -------
    cudf.Series
        A series of ``n_preds`` elements, each equal to ``pred``.
    """
    constant_values = [pred] * n_preds
    return cudf.Series(constant_values)

# Constant baseline: repeat the mean of y_train once per row of y_test.
baseline_preds = baseline_model(
    n_preds=len(y_test),
    pred=np.mean(y_train),
)

In [None]:
# Display the baseline predictions (every entry is the same constant).
baseline_preds

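Score the baseline with root mean squared error (`squared=False`). The call below uses `cuml.metrics.mean_squared_error` with the same arguments as scikit-learn's function, documented here: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html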

In [None]:
from cuml.metrics import mean_squared_error

# Error of the constant baseline on the held-out split; squared=False is passed to
# request the root of the MSE rather than the MSE itself.
mean_squared_error(y_true=y_test, y_pred=baseline_preds, squared=False)

## Define a simple model...

In [None]:
# List the column names of the training frame.
train.columns

In [None]:
%%time
pd.get_dummies(t)

In [None]:
%%time
# Time cudf's get_dummies on the full training frame, for comparison with the pandas timing above.
cudf.get_dummies(train)

In [None]:
from cuml.preprocessing import train_test_split

# Rebuild the split, this time expanding the features with get_dummies first.
y = train['target']
X = cudf.get_dummies(train.drop('target', axis=1))

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,  # same seed as the baseline split
)

In [None]:
from cuml.linear_model import LinearRegression

# Linear regression with default settings, fitted on the training split only.
lr = LinearRegression()

lr.fit(X_train, y_train)

In [None]:
# Predict on the held-out split, then show the last five predictions as a sanity check.
simple_preds = lr.predict(X_test)

simple_preds.tail(5)

In [None]:
from cuml.metrics import mean_squared_error

# Error of the linear model on the held-out split, computed with the same settings
# (squared=False) as the baseline score so the two numbers are comparable.
mean_squared_error(y_true=y_test, y_pred=simple_preds, squared=False)

My model is better than the baseline!! I'm lazy so let's submit and call it a day :)

In [None]:
%%time
import cudf
from cuml.linear_model import LinearRegression

# data load
train = cudf.read_csv("/kaggle/input/tabular-playground-series-feb-2021/train.csv")
test = cudf.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv")
sample_submission = cudf.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv')

# data prep
X = train.drop('target', axis=1)
X = cudf.get_dummies(X)

y = train.target

test = cudf.get_dummies(test)

# modeling
lr = LinearRegression()

lr.fit(X, y)

simple_preds = lr.predict(test)

# save results & submit
sample_submission['target'] = simple_preds

sample_submission.to_csv('submission.csv', index=False)

How to solve `ValueError: Expected 71 columns but got 70 columns.`...

- "Expected 71" <- model was trained on 71 columns
- "but got 70" <- test data only has 70 columns

In [None]:
# Print every encoded training column that has no counterpart in the encoded test frame.
for column_name in X.columns:
    if column_name in test.columns:
        continue
    print(column_name)

In [None]:
# Value counts of the cat6_G dummy column in the encoded training features.
X.cat6_G.value_counts()

Instead of deleting `cat6_G` from the training features, I'm going to add it to `test` as an all-zero column...

In [None]:
# Frequency of each raw cat6 level in the training data.
train.cat6.value_counts()

In [None]:
# The encoded test frame has no cat6_G column, so add it as all zeros.
test['cat6_G'] = 0

# Assigning a new column appends it at the end, whereas X has cat6_G among the other
# cat6 dummies. Put test's columns in exactly X's order so that every feature sits at
# the position the model was trained on.
# NOTE(review): assumes predict() consumes features by position rather than by name.
test = test[list(X.columns)]

In [None]:
%%time
import cudf
from cuml.linear_model import LinearRegression

# data load
train = cudf.read_csv("/kaggle/input/tabular-playground-series-feb-2021/train.csv")
test = cudf.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv")
sample_submission = cudf.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv')

# data prep
X = train.drop('target', axis=1)
X = cudf.get_dummies(X)

y = train.target

test = cudf.get_dummies(test)
test['cat6_G'] = 0  # fix lack of Gs in test data

# modeling
lr = LinearRegression()

lr.fit(X, y)

simple_preds = lr.predict(test)

# save results & submit
sample_submission['target'] = simple_preds

sample_submission.to_csv('submission.csv', index=False)