In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
from sklearn.linear_model import LinearRegression,SGDRegressor,ElasticNet,Ridge
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied to every figure drawn afterwards.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures
PROJECT_ROOT_DIR = "."  # the kernel's current working directory
CHAPTER_ID = ""  # only used by the commented-out IMAGES_PATH variant below
#IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Figures are written straight into PROJECT_ROOT_DIR; save_fig() joins file names onto this path.
IMAGES_PATH = PROJECT_ROOT_DIR

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: File name without extension; also echoed in the progress message.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
# Load the tab-separated training file, using train_id as the row index.
df = pd.read_csv("train.tsv", sep="\t", index_col=["train_id"])

In [None]:
df.head()

In [None]:
df.info()

# Data Analytics

In [None]:
df["name"].value_counts()

In [None]:
df["brand_name"].value_counts()

In [None]:
df["category_name"].value_counts()

In [None]:
# Bar chart of the 20 most frequent full category strings.
df['category_name'].value_counts().head(20).plot.bar()

**The categories are hierarchical: each main category contains sub-categories, which may in turn contain further sub-categories. They will need to be split into separate columns.**

In [None]:
df["item_description"].value_counts()

In [None]:
df.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# One 50-bin histogram per numeric column of df, drawn on a 20x15 inch figure.
df.hist(bins=50, figsize=(20,15))
# Must run before plt.show(): save_fig() saves whatever figure is current in pyplot.
save_fig("attribute_histogram_plots")
plt.show()

**1. The item condition column has five categories: New, Like New, Good, Fair, Poor. The most frequent is category 1 and the least frequent is category 5.**

**2. The mean price is 26.7 while the maximum is around 2000, which means that the price distribution is heavily skewed.**

**3. The shipping column is binary: 0 if not paid, 1 if paid.**

In [None]:
np.random.seed(42)

**fixing the price skewness**

In [None]:
df['price'].skew()

In [None]:
# Replace the target with log(1 + price). log1p is the dedicated NumPy routine for
# this and stays accurate for prices at or near zero.
# NOTE: this overwrites df['price'] in place, so running the cell a second time
# applies the transform twice. Re-load df before re-running.
df['price'] = np.log1p(df['price'])

**using log transformation to remove the skewness from the target value**

In [None]:
df['price'].skew()

**the skewness went from 11.3 to 0.6. It is not normally distributed but it is in a better condition**

In [None]:
# Histogram of the log-transformed price.
df['price'].hist(bins=50, figsize=(10,10))
# Use a file name of its own: "attribute_histogram_plots" is already used for the
# all-columns histogram, and saving under that name again would overwrite that figure.
save_fig("log_price_histogram")
plt.show()

**Splitting the categories**

In [None]:
# Split "main/sub1/sub2" into three columns. At most two splits are made, so any
# further "/" stays inside sub_category2. `n` is passed by keyword and
# `expand=True` returns the parts as columns; unpacking the `.str` accessor and
# the positional `n` argument are no longer accepted by recent pandas releases.
# Rows with a missing category_name get missing values in all three columns.
df[['main_category', 'sub_category1', 'sub_category2']] = (
    df['category_name'].str.split("/", n=2, expand=True)
)
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
import seaborn as sns
# Frequency of each main category, most frequent first.
num = df['main_category'].value_counts()
print("Number of unique main categories",len(num))
# head(20) keeps the 20 most frequent entries so the plot matches its title
# (a [0:19] slice yields only 19).
top_counts = num.head(20)
plt.figure(figsize=(16,5))
# x/y must be passed by keyword: current seaborn releases reject positional data arguments.
sns.barplot(x=top_counts.values, y=top_counts.index.values)
plt.title("Bar Plot Of top 20 main categories")
plt.xlabel('num')
plt.ylabel('main category')
plt.show()

In [None]:
import seaborn as sns
# Frequency of each first-level sub-category, most frequent first.
num = df['sub_category1'].value_counts()
print("Number of unique Sub Category1",len(num))
# head(20) keeps the 20 most frequent entries so the plot matches its title
# (a [0:19] slice yields only 19).
top_counts = num.head(20)
plt.figure(figsize=(16,5))
# x/y must be passed by keyword: current seaborn releases reject positional data arguments.
sns.barplot(x=top_counts.values, y=top_counts.index.values)
plt.title("Bar Plot Of top 20 Sub Category1")
plt.xlabel('num')
plt.ylabel('sub category1')
plt.show()

In [None]:
import seaborn as sns
# Frequency of each second-level sub-category, most frequent first.
num = df['sub_category2'].value_counts()
print("Number of unique Sub Category2",len(num))
# head(20) keeps the 20 most frequent entries so the plot matches its title
# (a [0:19] slice yields only 19).
top_counts = num.head(20)
plt.figure(figsize=(16,5))
# x/y must be passed by keyword: current seaborn releases reject positional data arguments.
sns.barplot(x=top_counts.values, y=top_counts.index.values)
plt.title("Bar Plot Of top 20 Sub Category2")
plt.xlabel('num')
plt.ylabel('sub category2')
plt.show()

In [None]:
# Pairwise correlation of the numeric columns only. df still holds text columns at
# this point, and pandas >= 2.0 raises on them instead of silently dropping them,
# so they are excluded explicitly. select_dtypes works on old and new pandas alike.
corr_matrix = df.select_dtypes(include="number").corr()
corr_matrix

In [None]:
import seaborn as sns

# Heatmap of the correlation matrix; both axes are labelled with its column names.
axis_labels = corr_matrix.columns.values
sns.heatmap(corr_matrix, xticklabels=axis_labels, yticklabels=axis_labels)


# Data Cleaning

In [None]:
# Fill missing values by assigning the filled column back to df.
# `df[col].fillna(..., inplace=True)` acts on an intermediate Series; under pandas
# copy-on-write that no longer updates df, so plain assignment is used instead.
df["category_name"] = df["category_name"].fillna("missing")
print("Number of Missing Values in Category Name : ", df['category_name'].isnull().sum())

# Re-split now that category_name has no gaps. A filled "missing" entry has no "/",
# so it yields main_category == "missing" and empty sub-categories, filled below.
# `n` by keyword plus `expand=True` replaces the positional argument / `.str`
# unpacking form, which recent pandas releases no longer accept.
df[['main_category', 'sub_category1', 'sub_category2']] = (
    df['category_name'].str.split("/", n=2, expand=True)
)

# (column, placeholder) pairs for the remaining columns that can hold gaps.
fill_values = [
    ("sub_category1", "missing"),
    ("sub_category2", "missing"),
    ("brand_name", "missing"),
    ("item_description", "No description yet"),
]
for column, placeholder in fill_values:
    df[column] = df[column].fillna(placeholder)
    print("Number of Missing Values in " + column + " : ", df[column].isnull().sum())

In [None]:
# Names of the columns stored with object dtype; these are the columns that get label-encoded afterwards.
categorical = df.select_dtypes(include = "object").columns
# Preview the first 20 rows of those columns.
df[categorical].head(20)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn import utils
from sklearn import preprocessing

# A single encoder is re-fitted for each column, so after the loop it only holds
# the classes of the last column processed.
encoder = preprocessing.LabelEncoder()

# Replace every object-dtype column with integer codes.
for column in categorical:
    df[column] = encoder.fit_transform(df[column])
# df.info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
df.info()


In [None]:
# Cast every column to 64-bit floats and show the resulting dtypes.
df = df.astype("float64")
df.info()

In [None]:
from sklearn import utils
# Print the target type scikit-learn infers for the values in df.
# NOTE(review): the whole frame is passed here, not just the price column — confirm this is intended.
print(utils.multiclass.type_of_target(df))

In [None]:
# Same check on a copy of df cast to integers (the cast result is not stored).
print(utils.multiclass.type_of_target(df.astype('int')))

In [None]:
# Target: the price column. Features: every other column of df.
Y = df['price']
X = df.drop(columns=['price'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split is the same every time this cell runs, regardless of how much of NumPy's
# global random stream earlier cells consumed.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Show the size of each piece. These are the names actually defined above; the
# train_data / test_data / train_target / test_target names do not exist and
# raised a NameError.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Models

**Linear Regression**

In [None]:
# Fit ordinary least squares on the training split and predict the held-out split.
# No .score() call here: its result was never displayed, and R^2 on the test
# split is computed with r2_score where the metrics are printed.
linerRegression = LinearRegression()
linerRegression.fit(X_train, y_train)
y_pred_linear = linerRegression.predict(X_test)

In [None]:
# RMSE is the square root of the mean squared error, so it is labelled as such.
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
# MAE is already in the target's units; taking its square root gives a number that is not a standard metric.
mae_linear = mean_absolute_error(y_test, y_pred_linear)
print("Root mean squared error: "+ str(rmse_linear))
print("Mean absolute error : "+ str(mae_linear))
R2_score_linear = metrics.r2_score(y_test, y_pred_linear)
print(R2_score_linear)

**Lasso Regression**

In [None]:
# Fit a Lasso model (alpha=0.01) on the training split and predict the held-out split.
# No .score() call here: its result was never displayed, and R^2 on the test
# split is computed with r2_score where the metrics are printed.
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

In [None]:
# RMSE is the square root of the mean squared error, so it is labelled as such.
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
# MAE is already in the target's units; taking its square root gives a number that is not a standard metric.
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
print("Root mean squared error: "+ str(rmse_lasso))
print("Mean absolute error : "+ str(mae_lasso))
R2_score_lasso = metrics.r2_score(y_test, y_pred_lasso)
print(R2_score_lasso)

**ElasticNet**

In [None]:
# Fit an ElasticNet model (alpha=0.01) on the training split.
# No .score() call here: its result was never displayed, and R^2 on the test
# split is computed with r2_score where the metrics are printed.
elasticNet = ElasticNet(alpha = 0.01)
elasticNet.fit(X_train, y_train)
# Despite the "train" in its name, this holds predictions for the held-out test split.
pred_train_enet = elasticNet.predict(X_test)


In [None]:
# RMSE is the square root of the mean squared error, so it is labelled as such.
rmse_elastic = np.sqrt(mean_squared_error(y_test, pred_train_enet))
# MAE is already in the target's units; taking its square root gives a number that is not a standard metric.
mae_elastic = mean_absolute_error(y_test, pred_train_enet)
print("Root mean squared error: "+ str(rmse_elastic))
print("Mean absolute error : "+ str(mae_elastic))
R2_score_elastic = metrics.r2_score(y_test, pred_train_enet)
print(R2_score_elastic)

**Linear Lasso**

In [None]:
# Second Lasso fit, this time with a stronger penalty (alpha=0.1 instead of 0.01).
# It gets a name of its own so the fitted LinearRegression held in
# `linerRegression` is not overwritten by a different kind of model.
# No .score() call here: its result was never displayed, and R^2 on the test
# split is computed with r2_score where the metrics are printed.
linear_lasso = linear_model.Lasso(alpha=0.1)
linear_lasso.fit(X_train, y_train)
y_pred_linear_lasso = linear_lasso.predict(X_test)

In [None]:
# RMSE is the square root of the mean squared error, so it is labelled as such.
rmse_linear_lasso = np.sqrt(mean_squared_error(y_test, y_pred_linear_lasso))
# MAE is already in the target's units; taking its square root gives a number that is not a standard metric.
mae_linear_lasso = mean_absolute_error(y_test, y_pred_linear_lasso)
print("Root mean squared error: "+ str(rmse_linear_lasso))
print("Mean absolute error : "+ str(mae_linear_lasso))
R2_score_linear_lasso = metrics.r2_score(y_test, y_pred_linear_lasso)
print(R2_score_linear_lasso)

**Ridge**

In [None]:
# Fit a Ridge model (alpha=0.01) on the training split.
# No .score() call here: its result was never displayed, and R^2 on the test
# split is computed with r2_score where the metrics are printed.
ridge_regression = Ridge(alpha=0.01)
ridge_regression.fit(X_train, y_train)
# Despite the "train" in its name, this holds predictions for the held-out test split.
pred_train_ridge_regression = ridge_regression.predict(X_test)

In [None]:
# RMSE is the square root of the mean squared error, so it is labelled as such.
rmse_ridge = np.sqrt(mean_squared_error(y_test, pred_train_ridge_regression))
# MAE is already in the target's units; taking its square root gives a number that is not a standard metric.
mae_ridge = mean_absolute_error(y_test, pred_train_ridge_regression)
print("Root mean squared error: "+ str(rmse_ridge))
print("Mean absolute error : "+ str(mae_ridge))
R2_score_ridge = metrics.r2_score(y_test, pred_train_ridge_regression)
print(R2_score_ridge)

In [None]:
# One row per model, with its test-split R^2 expressed as a percentage.
m = {
    'Model': ['Linear Regression', 'Ridge', 'Lasso', 'Elastic', 'LinearLasso'],
    'R-squared Score': [
        R2_score_linear * 100,
        R2_score_ridge * 100,
        R2_score_lasso * 100,
        R2_score_elastic * 100,
        R2_score_linear_lasso * 100,
    ],
}
# Build the frame column-wise so the score column keeps a float dtype. Going
# through orient='index' and a transpose mixes strings and floats in every
# column and leaves both columns as generic object dtype.
models = pd.DataFrame(m)

# Best model first.
models.sort_values(by='R-squared Score', ascending=False)