In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = ""
#IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
IMAGES_PATH = PROJECT_ROOT_DIR

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to ``IMAGES_PATH``.

    Args:
        fig_id: File name without extension; also echoed in the progress message.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998): any warning whose message
# starts with "internal gelsd" is suppressed for the rest of the session
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

In [None]:
# Load the tab-separated training and test tables (paths are relative to the working directory)
df_train = pd.read_csv('train.tsv', sep = '\t')
df_test = pd.read_csv('test_stg2.tsv', sep='\t')

In [None]:
# Preview the first rows of the training table
df_train.head()

In [None]:
# Column names, non-null counts and dtypes
df_train.info()

In [None]:
# Number of rows for each distinct price
df_train["price"].value_counts()

In [None]:
# Summary statistics of the training table
df_train.describe()

In [None]:
# The inline magic and the pyplot import repeat the configuration cell above
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of the training table's columns, 50 bins each, saved to disk before display
df_train.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

# create a test set

In [None]:
import numpy as np

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a train part and a test part.

    The rows are shuffled with NumPy's global random generator; the first
    ``int(len(data) * test_ratio)`` shuffled rows become the test part and the
    remaining rows the train part.

    Returns:
        A ``(train, test)`` tuple of positional (``iloc``) selections of ``data``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_rows, train_rows = order[:n_test], order[n_test:]
    return data.iloc[train_rows], data.iloc[test_rows]

In [None]:
# Hold out 20% of the rows with the illustrative random splitter and report both sizes
train_set, test_set = split_train_test(df_train, 0.2)
print(f"{len(train_set)} train + {len(test_set)} test")

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` based on a per-row identifier.

    Every value of ``data[id_column]`` is passed to ``test_set_check`` together
    with ``test_ratio``; rows for which it returns true form the test part and
    all remaining rows form the train part.
    """
    def belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(belongs_to_test)
    return data.loc[~test_mask], data.loc[test_mask]

In [None]:
import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

In [None]:
# Expose the existing row index as a regular column so it can serve as the identifier
df_train_with_id = df_train.reset_index()   # adds an `index` column
# Send 20% of the rows to the test set, decided per row from its `index` value
train_set, test_set = split_train_test_by_id(df_train_with_id, 0.2, "index")

In [None]:
# Use the dataset's own `train_id` column as the row identifier.
# An id computed from `price` alone is shared by every row with the same price,
# so whole price levels would be sent to one side of the split.
df_train_with_id["id"] = df_train["train_id"]
train_set, test_set = split_train_test_by_id(df_train_with_id, 0.2, "id")

In [None]:
# First rows of the test part produced by the id-based split
test_set.head()

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 random split using Scikit-Learn; random_state fixes the shuffle
train_set, test_set = train_test_split(df_train, test_size=0.2, random_state=42)

In [None]:
# First rows of the test part produced by train_test_split
test_set.head()

In [None]:
# to make this notebook's output identical at every run
np.random.seed(42)

In [None]:
# Histogram of the price column
df_train["price"].hist()

# Stratified sampling

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on binned prices instead of the raw price column.
# StratifiedShuffleSplit treats each distinct value of `y` as a class and
# requires at least two rows per class, which a continuous price cannot
# guarantee. Quantile bins give a handful of well-populated strata.
# Duplicate bin edges are dropped rather than raising.
price_strata = pd.qcut(df_train["price"], q=5, labels=False, duplicates="drop")

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the loop body runs once and leaves the single split in place
for train_index, test_index in split.split(df_train, price_strata):
    strat_train_set = df_train.iloc[train_index]
    strat_test_set = df_train.iloc[test_index]

In [None]:
# Share of each price value within the stratified test set
strat_test_set["price"].value_counts() / len(strat_test_set)

In [None]:
# Share of each price value within the full training table, for comparison
df_train["price"].value_counts() / len(df_train)

In [None]:
def income_cat_proportions(data):
    """Return, for each distinct ``price`` in ``data``, its share of all rows.

    The count of each price value is divided by the total number of rows in
    ``data``.
    """
    price_counts = data["price"].value_counts()
    return price_counts / len(data)

# Redo the purely random split so it can be compared with the stratified one
train_set, test_set = train_test_split(df_train, test_size=0.2, random_state=42)

# Price proportions in the full table, the stratified test set and the random test set
proportions_by_sample = {
    "Overall": income_cat_proportions(df_train),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}
compare_props = pd.DataFrame(proportions_by_sample).sort_index()

# Relative deviation (in percent) of each test set from the overall proportions
overall = compare_props["Overall"]
compare_props["Rand. %error"] = 100 * compare_props["Random"] / overall - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / overall - 100

In [None]:
# Display the table comparing overall, stratified and random price proportions
compare_props

# Looking for Correlations

In [None]:
# Correlate the numeric columns only. df_train also holds text columns
# (category_name, name, brand_name, item_description), and DataFrame.corr()
# raises on non-numeric data in pandas 2.x instead of silently skipping it.
corr_matrix = df_train.select_dtypes(include=[np.number]).corr()

In [None]:
import seaborn as sns

# Heatmap of the correlation matrix with the column names as tick labels on both axes
tick_labels = corr_matrix.columns.values
sns.heatmap(corr_matrix, xticklabels=tick_labels, yticklabels=tick_labels)

In [None]:
# Correlation of every column with price, strongest positive first
corr_matrix["price"].sort_values(ascending=False)

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the selected columns, saved to disk
attributes = ["price", "shipping", "train_id",
              "item_condition_id"]
scatter_matrix(df_train[attributes], figsize=(12, 8))
save_fig("scatter_matrix_plot")

In [None]:
# Scatter plot of shipping against price. The axis limits are left to
# matplotlib's autoscaling so they follow the plotted data; a fixed window such
# as [0, 16] x [0, 550000] is not derived from these two columns and would
# crop or squash the points.
df_train.plot(kind="scatter", x="price", y="shipping",
             alpha=0.1)
save_fig("price_vs_shipping_value_scatterplot")

# Data cleaning

In [None]:
# Take the first few rows that have a missing value in at least one column
sample_incomplete_rows = df_train[df_train.isnull().any(axis=1)].head()
sample_incomplete_rows

In [None]:
# Of those sample rows, keep the ones where category_name is present
sample_incomplete_rows.dropna(subset=['category_name'])

In [None]:
# Same check for name
sample_incomplete_rows.dropna(subset=['name'])

In [None]:
# Same check for brand_name
sample_incomplete_rows.dropna(subset=['brand_name'])

In [None]:
# Same check for item_description
sample_incomplete_rows.dropna(subset=['item_description'])

# Scikit-Learn

In [None]:
# Import the imputer under one name whichever Scikit-Learn version is installed
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Imputer configured to fill missing values with the column median
imputer = SimpleImputer(strategy="median")

In [None]:
# Drop the four text columns; the remaining columns are what the median imputer is fitted on
# alternatively: df_train_num = df_train.select_dtypes(include=[np.number])
text_columns = ['category_name', 'name', 'brand_name', 'item_description']
df_train_num = df_train.drop(columns=text_columns)

In [None]:
# Fit the imputer on the numeric columns
imputer.fit(df_train_num)

In [None]:
# Per-column statistics stored by the fitted imputer
imputer.statistics_

In [None]:
# Column medians computed directly with pandas, for comparison with the values above
df_train_num.median().values

In [None]:
# Apply the fitted imputer to the numeric columns
X = imputer.transform(df_train_num)

In [None]:
# Wrap the transformed array in a DataFrame with the original column names and row index
df_train_tr = pd.DataFrame(X, columns=df_train_num.columns,
                          index=df_train.index)

In [None]:
# Look up the previously sampled incomplete rows in the transformed table
df_train_tr.loc[sample_incomplete_rows.index.values]

In [None]:
# Strategy the imputer was configured with
imputer.strategy

In [None]:
# Rebuild the transformed DataFrame, this time taking the index from df_train_num
df_train_tr = pd.DataFrame(X, columns=df_train_num.columns,
                          index=df_train_num.index)
df_train_tr.head()

# Handling Text and Categorical Attributes

In [None]:
# Select the four text columns and preview the first ten rows
df_train_cat = df_train[['category_name','name', 'brand_name','item_description']]
df_train_cat.head(10)

In [None]:
try:
    from sklearn.preprocessing import OrdinalEncoder
except ImportError:
    from future_encoders import OrdinalEncoder # Scikit-Learn < 0.20

In [None]:
# Remove rows with a missing value in any of the text columns, then encode
# each column's values as ordinal codes and show the first ten encoded rows
df_train_cat = df_train_cat.dropna()
ordinal_encoder = OrdinalEncoder()
df_train_cat_encoded = ordinal_encoder.fit_transform(df_train_cat)
df_train_cat_encoded[:10]

In [None]:
# Categories learned by the fitted ordinal encoder
ordinal_encoder.categories_

# One Hot Encoder

In [None]:
try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20

# One-hot encode the text columns with the encoder's default settings
cat_encoder = OneHotEncoder()
df_train_cat_1hot = cat_encoder.fit_transform(df_train_cat)
df_train_cat_1hot

In [None]:
# Densify only the first few rows for display. The encoded columns include
# free-text fields (name, item_description), so the full matrix has one column
# per distinct value and converting all of it to a dense array can exhaust memory.
df_train_cat_1hot[:5].toarray()

In [None]:
# Ask the encoder for a dense array. The keyword was renamed from `sparse` to
# `sparse_output` in Scikit-Learn 1.2 and the old name was later removed, so
# try the new spelling first and fall back for older releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)  # Scikit-Learn 1.2+
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)  # older Scikit-Learn
# NOTE(review): dense output allocates rows x categories values; with free-text
# columns in df_train_cat this may not fit in memory — confirm on the full data.
df_train_cat_1hot = cat_encoder.fit_transform(df_train_cat)
df_train_cat_1hot

In [None]:
# Categories learned by the fitted one-hot encoder
cat_encoder.categories_