In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then echo the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by pandas / scikit-learn.
warnings.filterwarnings('ignore')

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

In [None]:
# Both extracted files are tab-separated; parse them with identical settings,
# training file first.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('train.tsv', 'test_stg2.tsv')
)

In [None]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Preview the first five training rows.
train.head(5)

In [None]:
# Preview the first five test rows.
test.head(5)

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# How often each distinct value of `price` occurs.
train["price"].value_counts()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of the training frame's columns, 50 bins each, on a 20x15 inch figure.
train.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
import numpy as np

def split_train_test(data, test_ratio, random_state=None):
    """Randomly partition ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object supporting ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of rows, within ``[0, 1]``, to place in the test part. The
        test size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    random_state : int, optional
        Seed for a dedicated random generator, making the split reproducible.
        When omitted, NumPy's global random state is used exactly as before,
        so repeated calls give different splits unless that state was seeded.

    Returns
    -------
    tuple
        ``(train_part, test_part)``; together they contain every row once.

    Raises
    ------
    ValueError
        If ``test_ratio`` lies outside ``[0, 1]``.
    """
    # A ratio outside [0, 1] would otherwise slice with a negative or oversized
    # count and silently return a nonsensical split.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            "test_ratio must be between 0 and 1, got {!r}".format(test_ratio)
        )

    n_rows = len(data)
    if random_state is None:
        # Legacy path: keep drawing from the global NumPy state.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows with the purely random splitter and report both sizes.
# NOTE: no random seed is set in the cells above, so this split changes on every run.
train_set, test_set = split_train_test(train, 0.2)
print(len(train_set), "train +", len(test_set), "test")

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into (train, test) according to a hash of each row's id.

    Every value in ``data[id_column]`` is passed to ``test_set_check`` together
    with ``test_ratio``; rows for which it answers True form the test part,
    all remaining rows form the training part.
    """
    def _goes_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(_goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

In [None]:
def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio

In [None]:
# Expose the current row index as a regular column (named "index") to use as the row id.
train_with_id = train.reset_index()   
# Hash-based split keyed on that id column, with a 0.2 test ratio.
train_set, test_set = split_train_test_by_id(train_with_id, 0.2, "index")

In [None]:
test_set.head()

In [None]:
from sklearn.model_selection import train_test_split

# scikit-learn's splitter with a fixed seed; rebinds train_set / test_set from the cells above.
train_set, test_set = train_test_split(train, test_size=0.2, random_state=42)

In [None]:
test_set.head()

# 1. Feature extraction

In [None]:
# Histogram of the `price` column as loaded.
train["price"].hist()

In [None]:
# Coarse numeric buckets: scale `train_id` by 1/1.5, round up, then cap at 5.
# NOTE(review): the buckets are derived from `train_id` and written over the
# `price` column, so the listed prices are no longer in `train` after this
# cell — confirm that this is intended.
train["price"] = np.ceil(train["train_id"] / 1.5)
# Assign the capped Series back explicitly. A chained
# `train["price"].where(..., inplace=True)` mutates a temporary column object,
# which does not update `train` under pandas Copy-on-Write.
train["price"] = train["price"].where(train["price"] < 5, 5.0)

In [None]:
# Re-bucket with explicit bin edges: (0, 1.5] -> 1, (1.5, 3] -> 2, (3, 4.5] -> 3,
# (4.5, 6] -> 4, (6, inf) -> 5. Values outside the bins — including exactly 0,
# because the first interval is left-open by default — become NaN.
# NOTE(review): the bins are applied to `train_id` and the categorical result
# replaces the `price` column — confirm that this is intended.
train["price"] = pd.cut(train["train_id"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [None]:
# Number of rows per distinct value now stored in the `price` column.
train["price"].value_counts()

In [None]:
# Histogram of the same column.
train["price"].hist()

In [None]:
# Pairwise correlations between the numeric columns.
# Since pandas 2.0 `DataFrame.corr()` no longer drops non-numeric columns
# silently and raises on text columns (e.g. `name`, `item_description`), so
# restrict the frame to numeric/bool columns explicitly.
train_corr = train.select_dtypes(include=["number", "bool"]).corr()

In [None]:
import seaborn as sns

# Heat map of the correlation matrix, labelling both axes with its column names.
sns.heatmap(train_corr, 
            xticklabels=train_corr.columns.values,
            yticklabels=train_corr.columns.values)

In [None]:
# Correlation of every column with `shipping`, largest value first.
train_corr["shipping"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the three selected columns.
attributes = ["train_id", "item_condition_id", "shipping"]
scatter_matrix(train[attributes], figsize=(12, 8))


In [None]:
# `price` against `train_id`, drawn with alpha=0.1.
train.plot(kind="scatter", x="price", y="train_id",
             alpha=0.1)
# NOTE(review): axis limits are hard-coded — confirm they cover the plotted data.
plt.axis([0, 16, 0, 550000])


In [None]:
# Recompute the correlation matrix on numeric/bool columns only:
# `DataFrame.corr()` raises on text columns since pandas 2.0.
train_corr = train.select_dtypes(include=["number", "bool"]).corr()
# Correlation of every column with `train_id`, largest value first.
train_corr["train_id"].sort_values(ascending=False)

In [None]:
# `shipping` against `train_id`, drawn with alpha=0.2.
train.plot(kind="scatter", x="shipping", y="train_id",
             alpha=0.2)
# NOTE(review): axis limits are hard-coded — confirm they cover the plotted data.
plt.axis([0, 5, 0, 520000])
plt.show()

In [None]:
# Summary statistics of the training frame in its current state.
train.describe()

# 2. Data cleaning

In [None]:
# Up to five rows that contain at least one missing value.
incomplete_data = train[train.isnull().any(axis=1)].head()
incomplete_data

In [None]:
# Fall back to the older `Imputer` class name when `sklearn.impute` is unavailable.
try:
    from sklearn.impute import SimpleImputer 
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Median-strategy imputer; it is only constructed here, not fitted.
imputer = SimpleImputer(strategy="median")

In [None]:
# Column medians as a NumPy array. `DataFrame.median()` raises on text columns
# since pandas 2.0, so compute it on the numeric/bool columns only.
train.select_dtypes(include=["number", "bool"]).median().values

In [None]:
# Single-column frame holding only `category_name`; show its first ten rows.
newtrain = train[['category_name']]
newtrain.head(10)

# 3. Handling text and categorical attributes


In [None]:
# Columns handed to the encoders below.
# NOTE(review): this selection includes `train_id` and the free-text columns
# `name` / `item_description`; presumably these have very many distinct values,
# which makes the encodings below very wide — confirm they belong here.
train_cat = train[['train_id','category_name','name', 'brand_name','item_description']]
train_cat.head(5)

In [None]:
try:
    from sklearn.preprocessing import OrdinalEncoder
except ImportError:
    from future_encoders import OrdinalEncoder

In [None]:
# Drop rows with any missing value first, then map every distinct value of
# each column to an ordinal code.
train_cat = train_cat.dropna()
ordinal_encoder = OrdinalEncoder()
train_cat_encoded = ordinal_encoder.fit_transform(train_cat)
# Show the first ten encoded rows.
train_cat_encoded[:10]

# 4. One-hot encoding


In [None]:
try:
    from sklearn.preprocessing import OrdinalEncoder 
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder 

# One-hot encode every column of train_cat with the encoder's default settings.
cat_encoder = OneHotEncoder()
train_cat_1hot = cat_encoder.fit_transform(train_cat)
train_cat_1hot

In [None]:
# Dense-output variant of the one-hot encoder.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)
# NOTE(review): a dense one-hot matrix has one column per distinct value of
# every column in train_cat (the id and free-text columns included); this may
# not fit in memory on the full data — confirm before running.
train_cat_1hot = cat_encoder.fit_transform(train_cat)
train_cat_1hot