In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for folder, _, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the tab-separated train and stage-2 test files into pandas DataFrames.
df_train = pd.read_csv(
    '/kaggle/input/mercari-price-suggestion-challenge/train.tsv',
    sep='\t',
)
df_test = pd.read_csv(
    '/kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv',
    sep='\t',
)

In [None]:
# Stack df_train on top of df_test (row-wise) so the feature encoders below
# are fitted on the combined data.
# ignore_index=True gives the result a fresh 0..n-1 index. Both frames are
# read with a default index, so without it every row label of df_test would
# duplicate a label of df_train, making label-based selection on df ambiguous.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Preview the first rows of the combined frame.
df.head()

In [None]:
# Store the number of training rows; it is the split point between the
# training and test rows of the combined feature matrix.
nrow_train = len(df_train)

In [None]:
# Regression target: log(1 + price) of the training rows.
y_train = df_train["price"].pipe(np.log1p)

In [None]:
import gc

# Drop the df_train name; its rows were already copied into df and the
# target was already extracted into y_train.
del df_train

# Run a garbage collection pass right away.
gc.collect()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count features built from the "name" column.
# min_df=10 discards tokens that appear in fewer than 10 rows.
name_text = df["name"]
count = CountVectorizer(min_df=10)
X_name = count.fit_transform(name_text)

In [None]:
X_name

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Missing categories are replaced by the literal "Other", then the column is
# stored as a pandas categorical.
df["category_name"] = df["category_name"].fillna("Other").astype("category")

# Token-count features built from the category_name strings.
count_category = CountVectorizer()
X_category = count_category.fit_transform(df["category_name"])

In [None]:
X_category

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Rows without a description get the placeholder text "None".
df["item_description"] = df["item_description"].fillna("None")

# TF-IDF features over 1- to 3-grams with English stop words removed,
# capped at 50,000 features.
count_descp = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    max_features=50000,
)
X_descp = count_descp.fit_transform(df["item_description"])

In [None]:
X_descp

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Missing brands become "unknown"; every brand that is not among the 2500
# most frequent values is then replaced by "Other".
df["brand_name"] = df["brand_name"].fillna("unknown")
pop_brands = df["brand_name"].value_counts().head(2500).index
is_rare_brand = ~df["brand_name"].isin(pop_brands)
df.loc[is_rare_brand, "brand_name"] = "Other"
df["brand_name"] = df["brand_name"].astype("category")

# One-hot encode brand_name into a sparse indicator matrix.
vect_brand = LabelBinarizer(sparse_output=True)
X_brand = vect_brand.fit_transform(df["brand_name"])

In [None]:
X_brand

In [None]:
import scipy

In [None]:
from scipy.sparse import csr_matrix

# Build indicator (dummy) features from item_condition_id and shipping.
# item_condition_id is cast to categorical so get_dummies one-hot encodes it.
# shipping is left as-is and is only passed through unencoded if it is a
# numeric column (assumed here -- TODO confirm against the data).
df["item_condition_id"] = df["item_condition_id"].astype("category")

# dtype=np.uint8 pins the indicator columns to a numeric dtype. pandas >= 2.0
# emits bool indicators by default; combined with a numeric pass-through
# column, .values then becomes an object array, which csr_matrix cannot
# convert (TypeError). uint8 matches the pre-2.0 default, so results are
# unchanged on older pandas.
X_dummies = csr_matrix(
    pd.get_dummies(df[["item_condition_id", "shipping"]], dtype=np.uint8).values
)

In [None]:
X_dummies

In [None]:
from scipy.sparse import hstack

# Join all feature matrices side by side (column-wise) and convert the
# result to CSR format.
feature_blocks = (X_dummies, X_category, X_descp, X_brand, X_name)
X = hstack(feature_blocks).tocsr()

In [None]:
from sklearn.linear_model import Ridge

# The first nrow_train rows of X are the training rows (train was stacked
# before test when df was built).
X_train = X[:nrow_train]

# Create and fit a ridge regression without an intercept term, using the
# "lsqr" solver.
model = Ridge(fit_intercept=False, solver="lsqr").fit(X_train, y_train)

# Display the model's score on the training data itself.
model.score(X_train, y_train)

In [None]:
# Everything after the first nrow_train rows of X belongs to the test set.
X_test = X[nrow_train:]

# Predict log1p(price) for the test rows with the fitted ridge model `model`.
preds = model.predict(X_test)

# Undo the log1p transform applied to the target. The linear model's output
# is unconstrained, so a prediction below 0 in log space would turn into a
# negative price; floor such values at 0 before storing them.
df_test["price"] = np.maximum(np.expm1(preds), 0)

# Write test_id and the predicted price to the submission CSV (no index column).
df_test[["test_id", "price"]].to_csv("submission_ridge.csv", index=False)