In [None]:
import gc
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer


# Pandas display options: show more rows/columns and never truncate cell text
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", None)
# Render every float with exactly five decimal places
pd.set_option("display.float_format", "{:.5f}".format)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

In [None]:
# Both extracted files are tab-separated. For a quick experiment, pass nrows=100000 to each call.
train = pd.read_csv("train.tsv", sep="\t", low_memory=False)
test = pd.read_csv("test_stg2.tsv", sep="\t", low_memory=False)
# "deep" also counts the memory held by the string columns
train.info(memory_usage="deep")

In [None]:
# Same deep memory report for the test dataset
test.info(memory_usage="deep")

In [None]:
# Preview the first rows of the raw training data
train.head()

* train_id or test_id - the id of the listing
* name - the title of the listing. Note that we have cleaned the data to remove text that look like prices (e.g. 20 USD) to avoid leakage. These removed prices are represented as [rm]
* item_condition_id - the condition of the items provided by the seller
* category_name - category of the listing
* brand_name - the brand of the item (may be missing)
* price - the price that the item was sold for. This is the target variable that you will predict. The unit is USD. This column doesn't exist in test.tsv since that is what you will predict.
* shipping - 1 if shipping fee is paid by seller and 0 by buyer
* item_description - the full description of the item. Note that we have cleaned the data to remove text that look like prices (e.g. 20 USD) to avoid leakage. These removed prices are represented as [rm]

# **EDA**

In [None]:
# Share of missing values per feature column, one pie per column:
# train dataset on the top row, test dataset on the bottom row
feature_missing = pd.concat(
    [train.drop(columns=["train_id", "price"]).isna().sum(),
     test.drop(columns=["test_id"]).isna().sum()],
    axis=1, keys=["Train", "Test"])

fig, axs = plt.subplots(nrows=2, ncols=6, figsize=(16,6))
fig.set_facecolor('white')

# Styling shared by every pie; the "missing" wedge is pulled out slightly
pie_style = dict(labels=["Filled\nvalues", "Missing\nvalues"],
                 explode=(0, 0.1), autopct="%1.1f%%", labeldistance=1.2,
                 colors=["teal", "salmon"], startangle=55)

for col_idx in range(6):
    feature = feature_missing.index[col_idx]
    for row_idx, (set_name, n_rows) in enumerate([("Train", len(train)), ("Test", len(test))]):
        n_missing = feature_missing.iloc[col_idx][set_name]
        axs[row_idx, col_idx].pie([n_rows - n_missing, n_missing], **pie_style)
        axs[row_idx, col_idx].set_title(set_name + ": " + feature, pad=10)

fig.suptitle("Amount of missing values in train and test datasets")
plt.show();

In [None]:
# Number of distinct values per feature column in the train and test datasets
unique_counts = pd.concat(
    [train.drop(columns=["train_id", "price"]).nunique(),
     test.drop(columns=["test_id"]).nunique()],
    axis=1, keys=["Train", "Test"])
# Transposed so that each dataset is a row and each feature a column
unique_counts.T

In [None]:
# Five most frequent listing names in train and in test
train["name"].value_counts().head(5), test["name"].value_counts().head(5)

In [None]:
# Five most frequent brand names in train and in test
train["brand_name"].value_counts().head(5), test["brand_name"].value_counts().head(5)

In [None]:
# Five most frequent category names in train and in test
train["category_name"].value_counts().head(5), test["category_name"].value_counts().head(5)

In [None]:
# Five most frequent item descriptions in train and in test
train["item_description"].value_counts().head(5), test["item_description"].value_counts().head(5)

In [None]:
# Item condition distribution

fig, ax = plt.subplots(figsize=(12, 7))
x = np.arange(1, 6, 1)
# value_counts() orders its result by frequency, not by condition id, so the shares are
# re-aligned to the ids on the x axis; otherwise each bar shows another condition's share.
y1 = train["item_condition_id"].value_counts(normalize=True).reindex(x, fill_value=0).values * 100
y2 = test["item_condition_id"].value_counts(normalize=True).reindex(x, fill_value=0).values * 100
bars1 = ax.bar(x-0.2,
               y1,
               width=0.4, color="cornflowerblue", label="Train dataset", edgecolor="black")
bars2 = ax.bar(x+0.2,
               y2,
               width=0.4, color="palevioletred", label="Test dataset", edgecolor="black")
ax.set_title("Item condition values distribution in both datasets", fontsize=20, pad=15)
ax.set_ylabel("Fraction of all values", fontsize=15, labelpad=10)
ax.set_xlabel("Value", fontsize=15, labelpad=10)
ax.set_xticks(x)
# Pin the current tick positions before replacing their labels with percentages
ax.set_yticks(ax.get_yticks())
ax.set_yticklabels([str(int(tick))+"%" for tick in ax.get_yticks()])
ax.tick_params(axis="x", labelsize=12)
ax.tick_params(axis="y", labelsize=12)
ax.bar_label(bars1, [str(round(share, 2))+"%" for share in y1], padding=3, fontsize=12)
ax.bar_label(bars2, [str(round(share, 2))+"%" for share in y2], padding=3, fontsize=12)
ax.grid(axis="y")
ax.margins(0.05, 0.1)
ax.legend(fontsize=12)
plt.show();

In [None]:
# Shipping distribution

fig, ax = plt.subplots(figsize=(7, 7))
x = np.arange(0, 2, 1)
# value_counts() orders its result by frequency, so the shares are re-aligned to the
# shipping flag values (0, 1) on the x axis.
y1 = train["shipping"].value_counts(normalize=True).reindex(x, fill_value=0).values * 100
y2 = test["shipping"].value_counts(normalize=True).reindex(x, fill_value=0).values * 100
bars1 = ax.bar(x-0.15,
               y1,
               width=0.3, color="cornflowerblue", label="Train dataset", edgecolor="black")
bars2 = ax.bar(x+0.15,
               y2,
               width=0.3, color="palevioletred", label="Test dataset", edgecolor="black")
ax.set_title("Shipping values distribution in both datasets", fontsize=20, pad=15)
ax.set_ylabel("Fraction of all values", fontsize=15, labelpad=15)
ax.set_xlabel("Value", fontsize=15, labelpad=15)
# Pin the current tick positions before replacing their labels with percentages
ax.set_yticks(ax.get_yticks())
ax.set_yticklabels([str(int(tick))+"%" for tick in ax.get_yticks()])
ax.tick_params(axis="x", labelsize=12)
ax.tick_params(axis="y", labelsize=12)
ax.set_xticks([0, 1])
# Per the data dictionary: shipping is 1 when the seller pays the fee and 0 when the buyer does
ax.set_xticklabels(["0: buyer pays shipping", "1: seller pays shipping"])
ax.bar_label(bars1, [str(round(share, 2))+"%" for share in y1], padding=3, fontsize=13)
ax.bar_label(bars2, [str(round(share, 2))+"%" for share in y2], padding=3, fontsize=13)
ax.grid(axis="y")
ax.margins(0.05, 0.1)
ax.legend(fontsize=12)
plt.show();

As the last two plots show, these categorical features are distributed very similarly in the train and test datasets.

In [None]:
# Price distribution: full range on top, zoom on prices below 150 underneath
fig, axs = plt.subplots(ncols=1, nrows=2, figsize=(14, 12))

plt.subplots_adjust(hspace = 0.4)

axs[0].hist(train["price"], bins=100, color="sandybrown", edgecolor="black")
axs[0].set_title("Price distribution", fontsize=20, pad=5)
axs[0].set_xlabel("Price", fontsize=15, labelpad=5)
axs[0].set_ylabel("Amount of items", fontsize=15, labelpad=5)
axs[0].set_yticks(np.arange(0, 1000000, 100000))
axs[0].tick_params(axis="y", labelsize=13)
axs[0].tick_params(axis="x", labelsize=13)
axs[0].grid(axis="y")
axs[0].margins(0.05, 0.05)

# The bin edges must reach 150: with a last edge of 147.5 every price in (147.5, 150)
# falls outside the bins and is silently left out of the histogram.
zoom_bins = np.arange(0, 152.5, 2.5)
axs[1].hist(train.loc[train["price"] < 150, "price"], bins=zoom_bins, color="rosybrown", edgecolor="black")
axs[1].set_title("Price distribution from 0 to 150", fontsize=20, pad=5)
axs[1].set_xlabel("Price", fontsize=15, labelpad=5)
axs[1].set_ylabel("Amount of items", fontsize=15, labelpad=5)
axs[1].set_xticks(np.arange(0, 180, 10))
axs[1].set_yticks(np.arange(0, 270000, 20000))
axs[1].tick_params(axis="y", labelsize=13)
axs[1].tick_params(axis="x", labelsize=13)
axs[1].grid(axis="y")
axs[1].margins(0.05, 0.05)
plt.show();

In [None]:
# Price dependence on item condition: mean and median price per condition id

fig, ax = plt.subplots(figsize=(14, 7))
x = np.arange(1, 6, 1)
# One pass over the groups yields both statistics, indexed by condition id
condition_stats = train.groupby("item_condition_id")["price"].agg(["mean", "median"])
y1 = condition_stats["mean"]
y2 = condition_stats["median"]

bar_width = 0.4
bars1 = ax.bar(x - bar_width / 2, y1, width=bar_width,
               color="mediumorchid", label="Mean price", edgecolor="black")
bars2 = ax.bar(x + bar_width / 2, y2, width=bar_width,
               color="lightseagreen", label="Median price", edgecolor="black")

ax.set_title("Mean and median price for different item condition", fontsize=20, pad=15)
ax.set_ylabel("Price", fontsize=15, labelpad=15)
ax.set_xlabel("Item condition id", fontsize=15, labelpad=10)
for axis_name in ("x", "y"):
    ax.tick_params(axis=axis_name, labelsize=13)
ax.bar_label(bars1, [str(round(value, 2)) for value in y1], padding=3, fontsize=13)
ax.bar_label(bars2, [str(round(value, 2)) for value in y2], padding=3, fontsize=13)
ax.grid(axis="y")
ax.margins(0.05, 0.1)
ax.legend(fontsize=13)
plt.show();

In [None]:
# Mean and median price for the 20 most frequent brands

fig, ax = plt.subplots(figsize=(14, 7))

brands = train["brand_name"].value_counts().index[:20]
# Both statistics in one groupby, restricted to (and ordered like) the top brands
brand_stats = train.groupby("brand_name")["price"].agg(["mean", "median"]).loc[brands]
x = np.arange(0, len(brands), 1)

bar_width = 0.4
bars1 = ax.bar(x - bar_width / 2, brand_stats["mean"].values, width=bar_width,
               color="cornflowerblue", label="Mean price", edgecolor="black")
bars2 = ax.bar(x + bar_width / 2, brand_stats["median"].values, width=bar_width,
               color="darkorange", label="Median price", edgecolor="black")

ax.set_title("Mean and median price for items of the top 20 brands", fontsize=25, pad=15)
ax.set_ylabel("Price", fontsize=15, labelpad=15)
ax.set_xlabel("Brand name", fontsize=15, labelpad=10)
ax.set_xticks(x)
ax.set_xticklabels(brands, rotation = 60, ha="right", rotation_mode='anchor')
for axis_name in ("x", "y"):
    ax.tick_params(axis=axis_name, labelsize=15)
ax.grid(axis="y")
ax.margins(0.025, 0.05)
ax.legend(fontsize=15)
plt.show();

Let's check the item description length in both datasets.

In [None]:
# Summary statistics of the description length (in characters) for each dataset
length_percentiles = [.1, .25, .5, .75, .90, .99]
for heading, dataset in (("Train", train), ("\nTest", test)):
    # Renamed so the printed summary carries the "descr_len" series name
    descr_len = dataset["item_description"].str.len().rename("descr_len")
    print(f"{heading} dataset desription length info:\n{descr_len.describe(percentiles=length_percentiles)}")

In [None]:
# Mean and median price for every distinct description length (in characters)
descr_len = train["item_description"].str.len()
# Group once and derive both statistics instead of repeating the groupby per plotted series
price_by_len = train["price"].groupby(descr_len).agg(["mean", "median"])

fig, ax = plt.subplots(figsize=(14, 7))

ax.plot(price_by_len.index, price_by_len["mean"].values,
        color="mediumvioletred", label="Mean price")
# This series is the median; it was previously mislabelled "Mean price" in the legend
ax.plot(price_by_len.index, price_by_len["median"].values,
        color="dodgerblue", label="Median price")

ax.set_title("Price by item description length", fontsize=20, pad=5)
ax.set_xlabel("Item description length", fontsize=15, labelpad=5)
ax.set_ylabel("Price", fontsize=15, labelpad=5)
ax.tick_params(axis="y", labelsize=13)
ax.tick_params(axis="x", labelsize=13)
ax.grid(axis="y")
ax.legend(fontsize=15)
ax.margins(0.05, 0.05)
plt.show();

In [None]:
# Same plot, but with description lengths grouped into 5-character-wide bins
length_bins = np.arange(0, 1055, 5)
# Left-closed bins [0, 5), [5, 10), ...; lengths of 1050 or more fall outside and are ignored
descr_len_bin = pd.cut(train["item_description"].str.len(), length_bins, right=False)
# observed=False keeps empty bins in the result, which the fixed-length index below relies on
# (the implicit default for categorical groupers is deprecated in recent pandas).
price_by_bin = train["price"].groupby(descr_len_bin, observed=False).agg(["mean", "median"])
# Label each bin by its upper edge: 5, 10, ..., 1050
price_by_bin.index = length_bins[1:]

fig, ax = plt.subplots(figsize=(14, 7))

plot1 = ax.plot(price_by_bin.index, price_by_bin["mean"].values,
                color="mediumvioletred", label="Mean price")
plot2 = ax.plot(price_by_bin.index, price_by_bin["median"].values,
                color="dodgerblue", label="Median price")

ax.set_title("Price by item description length (5-character bins)", fontsize=20, pad=5)
ax.set_xlabel("Item description length", fontsize=15, labelpad=5)
ax.set_ylabel("Price", fontsize=15, labelpad=5)
ax.tick_params(axis="y", labelsize=13)
ax.tick_params(axis="x", labelsize=13)
ax.grid(axis="y")
ax.legend(fontsize=15)
ax.margins(0.05, 0.05)
plt.show();

# **Data management**

In [None]:
# Remove listings whose price is zero or negative, then renumber the rows from zero
non_positive_price = train["price"] <= 0
train = train.loc[~non_positive_price].reset_index(drop=True)

In [None]:
def process_data(df):
    """
    Clean a raw listings frame and derive the extra features used for modelling.

    The frame is modified in place and also returned. Steps performed:
      * split "category_name" on "/" into First/Second/Third_category and drop the original
        column (so the function cannot be applied twice to the same frame);
      * treat the "No description yet" placeholder as a missing description;
      * add boolean flags for missing category, brand and description;
      * fill missing brand/category/description values with the string "missing";
      * add boolean flags for a removed price tag ("[rm]") in the description or name;
      * add "descr_len": the description length bucketed into 5-character intervals, as strings;
      * normalise the text of description, brand, name and the three category columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "name", "category_name", "brand_name" and "item_description".

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, after processing.
    """
    # `n` is passed by keyword: pandas 2.0 no longer accepts it positionally.
    # reindex guarantees three columns even if no category has three levels.
    categories = (df["category_name"]
                  .str.split("/", n=2, expand=True)
                  .reindex(columns=[0, 1, 2]))
    df[["First_category", "Second_category", "Third_category"]] = categories

    # Deleting unneeded column
    df.drop("category_name", axis=1, inplace=True)

    # Adding new features indicating missing data
    df["item_description"] = df["item_description"].replace({"No description yet": np.nan})
    df["Category_was_missing"] = df["First_category"].isna()
    df["Brand_was_missing"] = df["brand_name"].isna()
    df["Description_was_missing"] = df["item_description"].isna()

    # Replacing NaN values with "missing" string
    for column in ["brand_name", "First_category", "Second_category", "Third_category", "item_description"]:
        df[column] = df[column].fillna("missing")
    # A missing name becomes an empty string so the text cleaning below cannot fail on NaN
    df["name"] = df["name"].fillna("")

    # Adding features indicating that there was a price tag in item description or name.
    # Literal (non-regex) search: no escaping of the brackets is needed.
    df["Price_was_in_description"] = df["item_description"].str.contains("[rm]", regex=False)
    df["Price_was_in_name"] = df["name"].str.contains("[rm]", regex=False)

    # Description length bucketed into left-closed 5-character intervals, stored as strings.
    # Lengths of 1050 characters or more fall outside the bins and become missing values.
    df["descr_len"] = pd.cut(df["item_description"].str.len(),
                             np.arange(0, 1055, 5), right=False).astype("string")

    # Stopwords import from nltk
    stop_words = set(stopwords.words("english"))

    # Free-text columns: drop the "[rm]" marker and collapse every run of
    # non-alphanumeric characters into a single space.
    non_alphanumeric = re.compile('[^A-Za-z0-9]+')
    for column in ["item_description", "brand_name", "name"]:
        cleaned = []
        for text in df[column]:
            text = non_alphanumeric.sub(' ', text.replace('[rm]', ''))
            # Brand names keep their case and all of their words
            if column != "brand_name":
                text = ' '.join(word for word in text.lower().split() if word not in stop_words)
            cleaned.append(text.strip())
        df[column] = cleaned

    # Category columns: each value becomes one lower-case token
    # (spaces removed, "&" turned into "_", any other symbol run into a space).
    non_token = re.compile('[^A-Za-z0-9_]+')
    for column in ["First_category", "Second_category", "Third_category"]:
        df[column] = [
            non_token.sub(' ', text.replace(' ', '').replace('&', '_')).lower().strip()
            for text in df[column]
        ]

    return df

In [None]:
train = process_data(train)

In [None]:
def make_tokens_count_plot(count_vercorizer, matrix, column_name, set_name="train"):
    """
    Draw a bar chart of the 50 most frequent tokens in a vectorized text column.

    Parameters
    ----------
    count_vercorizer : fitted vectorizer
        Supplies the token for each column of ``matrix``.
    matrix : sparse matrix
        Token counts with one column per vocabulary entry, as produced by the vectorizer.
    column_name : str
        Name of the source column, used in the plot title.
    set_name : str, default "train"
        Dataset label used in the title; "train" is drawn in one colour, anything else in another.
    """
    # Using one color for train set plots and the other for test/valid set
    color = "lightcoral" if set_name == "train" else "steelblue"

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
    if hasattr(count_vercorizer, "get_feature_names_out"):
        tokens = count_vercorizer.get_feature_names_out()
    else:
        tokens = count_vercorizer.get_feature_names()

    # Column sums give the total count per token; ravel() flattens a 1 x n result
    # and leaves an already one-dimensional result unchanged.
    counts = np.asarray(matrix.sum(axis=0)).ravel()

    # Sort once and reuse the result for both the labels and the bar heights
    top_tokens = (pd.DataFrame({"tokens": tokens, "counts": counts})
                  .sort_values("counts", axis=0, ascending=False)
                  .head(50))
    positions = np.arange(len(top_tokens))

    fig, ax = plt.subplots(figsize=(15, 6))

    ax.bar(positions, top_tokens["counts"], color=color, edgecolor="black")
    ax.set_title(f"Most popular words in the {column_name} column of the {set_name} dataset", fontsize=20, pad=15)
    ax.set_ylabel("Count", fontsize=14, labelpad=15)
    ax.set_xlabel("Word", fontsize=14, labelpad=10)
    ax.set_xticks(positions)
    ax.set_xticklabels(top_tokens["tokens"], rotation = 60, ha="right", rotation_mode='anchor')
    ax.tick_params(axis="x", labelsize=14)
    ax.tick_params(axis="y", labelsize=14)
    ax.grid(axis="y")
    ax.margins(0.025, 0.05)
    plt.show();

In [None]:
# Text feature blocks in stacking order:
# (source column, CountVectorizer keyword arguments, label used in the shape print-out)
_TEXT_VECTORIZER_SPECS = (
    ("name",
     {"min_df": 7, "max_features": 20000},
     "Name columns vectorized"),
    ("item_description",
     {"min_df": 15, "ngram_range": (1, 2), "max_features": 60000},
     "Item_description columns vectorized"),
    ("item_description",
     {"min_df": 30, "ngram_range": (3, 3), "max_features": 10000},
     "Item_description columns vectorized (3,3)"),
)


def _fit_transform_features(df, cat_features):
    """Fit all encoders on df; return (stacked CSR matrix, label binarizers, count vectorizers)."""
    label_binarizers = []
    count_vercorizers = []
    feature_blocks = []

    # One sparse one-hot block per categorical column, in cat_features order
    for column in cat_features:
        binarizer = LabelBinarizer(sparse_output=True)
        feature_blocks.append(binarizer.fit_transform(df[column]))
        label_binarizers.append(binarizer)

    # One token-count block per text specification
    shape_report = []
    for column, vectorizer_params, label in _TEXT_VECTORIZER_SPECS:
        vectorizer = CountVectorizer(**vectorizer_params)
        token_counts = vectorizer.fit_transform(df[column])
        count_vercorizers.append(vectorizer)
        feature_blocks.append(token_counts)
        make_tokens_count_plot(vectorizer, token_counts, column, "train")
        shape_report.append(f"{label} shape is {token_counts.shape}")
    print("\n".join(shape_report))

    # Stacking the whole list keeps every categorical column, however many there are
    return hstack(feature_blocks).tocsr(), label_binarizers, count_vercorizers


def get_transformed_train_valid_data(df, y, cat_features):
    """
    Split the data 80/20 into train and validation parts and vectorize both.

    The encoders are fitted on the train part only; the validation part is transformed
    with those fitted encoders through get_transformed_test_data().

    Returns
    -------
    tuple
        (X_train_stack, X_valid_stack, y_train, y_valid, label_binarizers, count_vercorizers)
    """
    X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2, random_state=42)

    X_train_stack, label_binarizers, count_vercorizers = _fit_transform_features(X_train, cat_features)
    X_valid_stack = get_transformed_test_data(X_valid, cat_features, label_binarizers, count_vercorizers)

    return X_train_stack, X_valid_stack, y_train, y_valid, label_binarizers, count_vercorizers


def get_transformed_test_data(df, cat_features, label_binarizers, count_vercorizers):
    """
    Vectorize test data with already fitted encoders. Can be used for validation data as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed frame containing the columns in ``cat_features`` plus "name" and
        "item_description".
    cat_features : list of str
        Categorical columns, in the same order that was used when fitting.
    label_binarizers : list
        Fitted binarizers, one per entry of ``cat_features``.
    count_vercorizers : list
        Fitted vectorizers for name, description (1-2 grams) and description (3-grams).

    Returns
    -------
    scipy.sparse.csr_matrix
        Feature blocks stacked column-wise in the same order as for the training data.

    Raises
    ------
    ValueError
        If the number of fitted encoders does not match the features to transform.
    """
    if len(label_binarizers) != len(cat_features):
        raise ValueError(f"Expected {len(cat_features)} label binarizers, got {len(label_binarizers)}")
    if len(count_vercorizers) != len(_TEXT_VECTORIZER_SPECS):
        raise ValueError(f"Expected {len(_TEXT_VECTORIZER_SPECS)} count vectorizers, got {len(count_vercorizers)}")

    feature_blocks = [binarizer.transform(df[column])
                      for column, binarizer in zip(cat_features, label_binarizers)]

    shape_report = []
    for (column, _params, label), vectorizer in zip(_TEXT_VECTORIZER_SPECS, count_vercorizers):
        token_counts = vectorizer.transform(df[column])
        feature_blocks.append(token_counts)
        make_tokens_count_plot(vectorizer, token_counts, column, "valid/test")
        shape_report.append(f"{label} shape is {token_counts.shape}")
    print("\n".join(shape_report))

    return hstack(feature_blocks).tocsr()


def get_transformed_train_data(df, cat_features):
    """
    Fit the encoders on the whole frame and vectorize it (no validation split).

    Returns
    -------
    tuple
        (X_train_stack, label_binarizers, count_vercorizers)
    """
    return _fit_transform_features(df, cat_features)

In [None]:
# Columns that are one-hot encoded; "name" and "item_description" are vectorized as text instead
cat_features = [
    "item_condition_id",
    "First_category",
    "Second_category",
    "Third_category",
    "shipping",
    "brand_name",
    "Description_was_missing",
    "Price_was_in_name",
    "Price_was_in_description",
    "Brand_was_missing",
    "Category_was_missing",
    "descr_len",
]

# To measure model performance on a hold-out set, replace the call below with
# get_transformed_train_valid_data(train.drop(["train_id", "price"], axis=1),
#                                  np.log1p(train["price"]), cat_features),
# which additionally returns X_valid and y_valid.

# The regression target is log1p(price)
y_train = np.log1p(train["price"])
X_train, label_binarizers, count_vercorizers = get_transformed_train_data(
    train.drop(columns=["train_id", "price"]), cat_features)

In [None]:
# Empty the train frame and remove the name so its memory can be reclaimed
train.drop(columns=list(train.columns), inplace=True)

del train
gc.collect()

# **Metrics**

In [None]:
def rmsle(y_true, y_preds):
    """Root mean squared logarithmic error between true and predicted values."""
    msle = mean_squared_log_error(y_true, y_preds)
    return np.sqrt(msle)


def get_scores(model, X_train, X_valid, y_train, y_valid, log_target=False):
    """
    Evaluate a fitted model on the training and validation data.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict(X)`` and ``score(X, y)``.
    X_train, X_valid : feature matrices
    y_train, y_valid : targets matching the feature matrices
    log_target : bool, default False
        Set to True when the targets are log1p-transformed (as ``y_train`` is in this
        notebook). Targets and predictions are then mapped back with expm1 before MAE and
        RMSLE are computed, so MAE is in the original units and RMSLE applies the logarithm
        only once. With the default, the values are used exactly as given.

    Returns
    -------
    dict
        Training and validation MAE, RMSLE and R^2. R^2 comes from ``model.score`` and is
        always computed on the targets as passed in.
    """
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)

    if log_target:
        def to_metric_scale(values):
            # Floor at 0 before inverting: a negative log-price would map to a
            # negative price, for which RMSLE is not defined.
            return np.expm1(np.maximum(np.asarray(values, dtype=float), 0))
    else:
        def to_metric_scale(values):
            return values

    y_train_m, train_preds_m = to_metric_scale(y_train), to_metric_scale(train_preds)
    y_valid_m, val_preds_m = to_metric_scale(y_valid), to_metric_scale(val_preds)

    scores = {"Training MAE": mean_absolute_error(y_train_m, train_preds_m),
              "Validation MAE": mean_absolute_error(y_valid_m, val_preds_m),
              "Training RMSLE": rmsle(y_train_m, train_preds_m),
              "Validation RMSLE": rmsle(y_valid_m, val_preds_m),
              "Training R^2": model.score(X_train, y_train),
              "Validation R^2": model.score(X_valid, y_valid)}
    return scores

# **Model training**

In [None]:
%%time

# Linear regressor trained with SGD on the sparse features; the target is log1p(price)
model = SGDRegressor(
    loss="epsilon_insensitive",
    penalty="l2",
    alpha=1e-6,
    max_iter=50000,
    random_state=42,
)

model.fit(X_train, y_train)

# **Test data predictions and submission**

In [None]:
# The model is fitted, so the training matrix and target can be released
del X_train
del y_train
gc.collect()

In [None]:
# Predictions dataframe initialization: the ids are copied here, before the next cell
# drops "test_id" from the test frame; "price" is filled in after inference
preds = pd.DataFrame(columns = ["test_id", "price"])
preds["test_id"] = test["test_id"]

In [None]:
# Clean the test rows exactly like the training rows (the id column is not a feature)
test = process_data(test.drop(columns="test_id"))

# Encode with the binarizers/vectorizers fitted on the training data
X_test = get_transformed_test_data(test, cat_features, label_binarizers, count_vercorizers)

In [None]:
# The model predicts log1p(price). A linear model can output negative values, which would
# turn into negative prices (invalid for RMSLE), so predictions are floored at 0 first.
log_price_preds = np.maximum(model.predict(X_test), 0)
preds["price"] = np.expm1(log_price_preds)
preds.to_csv('submission.csv', index=False)
preds.head()