In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Load the tabular training data

path = "../input/petfinder-pawpularity-score/"

# os.path.join avoids the doubled separator that path + "/train.csv" produced,
# since `path` already ends with a slash.
data = pd.read_csv(os.path.join(path, "train.csv"))
X, y = data.drop(["Id", "Pawpularity"], axis=1), data["Pawpularity"]

# Create training and test sets (80% / 20%) with stratified sampling on the target.
# A fixed random_state makes the split -- and every metric computed from it --
# reproducible across kernel restarts.
sssplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 yields exactly one (train, test) pair of positional index arrays,
# so take it directly instead of looping.
train_index, test_index = next(sssplit.split(X, y))
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    
    
# Overlay the target histograms of both splits to eyeball that they look alike.
# Series.hist returns the Axes it drew on; reuse it so both histograms and all
# labels go to the same plot.
ax = y_train.hist(label="Training set")
y_test.hist(ax=ax, label="Test set")
ax.set_title("Pawpularity score distribution in training and test set")
ax.set_xlabel("Pawpularity score")
ax.set_ylabel("Count")
ax.legend(loc="upper right")
plt.show()

In [None]:
# Bare-bones baseline on the tabular features: treat the task as an ordinary
# regression problem on structured data.

# fit() returns the estimator itself, so construction and training can be chained.
linreg = LinearRegression().fit(X_train, y_train)
# Keep the fitted estimator as the cell's last expression so it is still displayed.
linreg

In [None]:
# Measure the LinearRegression model's RMSE on the held-out test split
# (X_test / y_test), i.e. on rows the model was not fitted on.

pawpularity_predictions = linreg.predict(X_test)
# RMSE is the square root of the mean squared error, in the target's own units.
linreg_rmse = np.sqrt(mean_squared_error(y_test, pawpularity_predictions))
linreg_rmse

In [None]:
# Build the submission table: one predicted score per row of test.csv
submission = pd.read_csv(path+"/test.csv")
# "Id" is an identifier, not a feature -- it was dropped from the training features too.
submission_features = submission.drop("Id", axis=1)
# Predictions are rounded to two decimals and paired with their Ids in a single frame.
submission_output = pd.DataFrame(
    {
        "Id": submission["Id"],
        "Pawpularity": np.round(linreg.predict(submission_features), 2),
    }
)

In [None]:
# Write the submission to disk; index=False keeps the row index out of the CSV.
submission_output.to_csv(path_or_buf="submission.csv", index=False)