In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
plt.rcParams['figure.figsize'] = 12,6
import warnings
warnings.filterwarnings('ignore')
seq_col_brew = sns.color_palette("YlGnBu_r", 4)
sns.set_palette(seq_col_brew)

In [None]:
# Read the tab-separated training file, keeping only the five columns that the
# rest of the notebook uses.
dataset = pd.read_csv(
    '../input/train.tsv',
    sep='\t',
    usecols=[
        'item_condition_id',
        'shipping',
        'brand_name',
        'price',
        'category_name',
    ],
)

In [None]:
# Preview the first rows of the loaded columns.
dataset.head()

In [None]:
# Target: the listing price.
y = dataset['price']

# Features: every loaded column except the target.
x = dataset[[
    'item_condition_id',
    'shipping',
    'brand_name',
    'category_name',
]]

In [None]:
# Summarise the frame (dtypes, non-null counts), requesting deep memory-usage accounting.
dataset.info(memory_usage='deep')

In [None]:
# Count missing values per feature column before any imputation.
x.isnull().sum()

In [None]:
# (rows, columns) of the feature frame.
x.shape

In [None]:
# Give listings without a brand an explicit 'Other' label.
#
# The result is bound back to `x` rather than calling
# `x['brand_name'].fillna(..., inplace=True)`: that form mutates an intermediate
# Series selected from the frame, which under pandas Copy-on-Write leaves `x`
# unchanged. Rebinding also makes the cell safe to re-run.
x = x.fillna({'brand_name': 'Other'})

In [None]:
# Re-check missing values per column after filling brand_name.
x.isnull().sum()

In [None]:
# Show the rows of the raw frame whose category_name is missing.
dataset.loc[dataset['category_name'].isnull()]

In [None]:
# Missing values per feature column (category_name has not been imputed at this point).
x.isnull().sum()

In [None]:
# (rows, columns) of the feature frame after the brand fill.
x.shape

In [None]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold-out split with a fixed seed so the partition is reproducible.
# NOTE(review): test_size=0.80 puts 80% of the rows in the *test* partition and
# leaves only 20% for training. Kept as in the original; confirm this is
# intended (0.20 is the more usual choice).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.80, random_state=0
)

In [None]:
# Dtypes and non-null counts of the training features.
x_train.info()

In [None]:
# (rows, columns) of the training partition.
x_train.shape

In [None]:
# Missing values per column in the training features.
x_train.isnull().sum()

In [None]:
# Preview the training features.
x_train.head()

## Data Analysis

In [None]:
# Plot the distribution of the raw price target to check whether it looks Gaussian.
sns.distplot(y)

In [None]:
# Distribution of log(price + 1); the +1 keeps a price of 0 finite under the log.
sns.distplot(np.log(y + 1))

In [None]:
# Distribution restricted to prices below 100.
sns.distplot(y.loc[y < 100])

In [None]:
# Most frequent price value(s).
y.mode()

In [None]:
# Mean price over the full training file.
y.mean()

In [None]:
# Median price over the full training file.
y.median()

In [None]:
# Standard deviation of the price target.
# Open question from the author: how do you know what counts as a high standard deviation?
# https://www.researchgate.net/post/What_do_you_consider_a_good_standard_deviation
y.std()

In [None]:
# Histogram of one independent variable: item_condition_id in the training features.
x_train.item_condition_id.hist()

In [None]:
# Histogram of the shipping flag.
# Per the author's note: 1 if the shipping fee is paid by the seller, 0 if paid by the buyer.
x_train.shipping.hist()

In [None]:
# Bar plot of price grouped by the shipping flag.
# Author's reading: the seller takes a hit by paying for the shipping him/herself.
sns.barplot(x='shipping', y='price', data=dataset)

In [None]:
# Summary statistics restricted to object-dtype (string) columns.
dataset.describe(include=['O']) 

In [None]:
# Summary statistics with describe()'s default column selection.
dataset.describe()

In [None]:
# Shaded kernel-density estimate of price with the bandwidth set to 0.85.
# NOTE(review): `shade` and `bw` may have been renamed in newer seaborn releases — verify against the installed version.
sns.kdeplot(data=y, shade=True, bw=.85)

In [None]:
# Price by item condition, with bars split by the shipping flag.
sns.barplot(x='item_condition_id', y='price', hue='shipping', data=dataset)

In [None]:
# Price by item condition alone.
sns.barplot(x='item_condition_id', y='price', data=dataset)

## Summary Data Analysis
1. There are outliers in the price data (could I apply feature scaling without having the y?)
2. Items with condition id 5 are more expensive - meaning 5 is higher quality
3. The majority of items are in condition 1 or 2

In [None]:
# Number of training rows per brand.
x_train["brand_name"].value_counts()

In [None]:
# Missing values per column before the category path is split.
x_train.isnull().sum()

In [None]:
# Split the category path into sub-categories

In [None]:
# Column names for the three category levels, outermost first.
category_columns = [
    'Top_Level_Category',
    'Second_Level_Category',
    'Third_Level_Category',
]

In [None]:
# Display the level names defined above.
category_columns

In [None]:
# Preview the training features before the category split.
x_train.head()

In [None]:
# Split each 'Top/Second/Third' category path into its first three levels.
#
# Each level is captured as "any run of characters other than '/'". The previous
# pattern used `\w*`, which matches only letters, digits and underscores, so a
# level containing a space, '&' or '-' either failed to match (the whole row
# became NaN) or was cut short (e.g. 'T-shirts' -> 'T').
# The pattern is a raw string so the regex is not subject to Python's own
# string-escape processing.
# Rows whose category_name is missing or has fewer than two '/' yield NaN in
# all three columns.
new_categories = x_train['category_name'].str.extract(
    r'([^/]*)/([^/]*)/([^/]*)', expand=True
)

In [None]:
# Column labels produced by the extraction, before they are renamed.
new_categories.columns

In [None]:
# Label the three extracted groups with the level names from category_columns.
new_categories.columns = category_columns

In [None]:
# Append the three level columns to the training features, side by side.
# NOTE(review): re-running this cell concatenates onto the already-extended frame — confirm it is only run once per kernel.
x_train = pd.concat([x_train, new_categories], axis=1)

In [None]:
# Preview the training features with the new level columns attached.
x_train.head()

In [None]:
# Level columns to impute below (same names as category_columns).
columns = [
    'Top_Level_Category',
    'Second_Level_Category',
    'Third_Level_Category',
]

In [None]:
# Missing values per column after attaching the level columns.
x_train.isnull().sum()

In [None]:
# Missing values per extracted level (rows where the pattern did not match).
new_categories.isnull().sum()

In [None]:
# Give rows with no extracted category an explicit 'Other' label in every level.
#
# One dict-based fillna replaces the per-column `fillna(..., inplace=True)` loop:
# that form mutates an intermediate Series selected from the frame, which under
# pandas Copy-on-Write leaves `x_train` unchanged. Only the listed level columns
# are filled; other columns are left as they are.
x_train = x_train.fillna({col: 'Other' for col in columns})

In [None]:
# Preview the training features after filling the level columns.
x_train.head()

In [None]:
# The raw category path is no longer needed once it has been split into levels.
x_train = x_train.drop(labels=['category_name'], axis='columns')

In [None]:
# Replace each string-valued feature with its integer category code.
# NOTE(review): the codes come from the training partition only; x_test is not
# encoded anywhere in the visible cells.
for feature in (
    'brand_name',
    'Top_Level_Category',
    'Second_Level_Category',
    'Third_Level_Category',
):
    x_train[feature] = x_train[feature].astype('category').cat.codes

In [None]:
# Preview the fully numeric training features.
x_train.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Optional experiment: 10-fold cross-validated linear-regression baseline,
# scored with negative mean squared error.
#
# This used to be kept as a bare triple-quoted string, which (a) was echoed as
# the cell's output and (b) hid the fact that the `cross_val_score` call itself
# was missing, so the snippet would not even parse if re-enabled. It is now real
# code behind a switch that is off by default, so a normal run still skips it.
RUN_LINREG_CV = False

if RUN_LINREG_CV:
    from sklearn.model_selection import cross_val_score

    linreg = LinearRegression(n_jobs=-1)
    cv_scores = cross_val_score(
        linreg, x_train, y_train, cv=10, scoring='neg_mean_squared_error'
    )
    print(cv_scores.mean(), cv_scores.std())

In [None]:
# Load the submission template and start from a vector of ones, one per row.
submission = pd.read_csv('../input/sample_submission.csv')
pred = np.ones(len(submission))

In [None]:
# Inspect the placeholder prediction vector.
pred

In [None]:
# Preview the prediction vector scaled by the mean training price (result is not stored here).
pred * y_train.mean()

In [None]:
# Constant baseline: predict the mean training price for every submission row.
#
# The vector is rebuilt from scratch instead of `pred = pred * y_train.mean()`,
# which multiplied by the mean again each time the cell was re-run.
pred = np.ones(submission.shape[0]) * y_train.mean()

In [None]:
# (rows, columns) of the submission template.
submission.shape

In [None]:
# Write the baseline predictions into the template's price column.
submission['price'] = pred

In [None]:
# Save the submission to the working directory without the index column.
submission.to_csv('sample_submission.csv', index=False)