In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Global plotting defaults: seaborn "ticks" theme, 8x5 figures rendered at 150 dpi.
sns.set(style="ticks", color_codes=True)
plt.rcParams.update({
    'figure.figsize': (8, 5),
    'figure.dpi': 150,
})

# StockX Yeezy Market Analysis

In this notebook, I'll do EDA to analyze which features most affect the profit you can get from reselling sneakers. I'll also do a regression analysis to see what we can predict from this dataset. I'll focus on Yeezy sneakers to find more specific details that influence the shoe price, but the same analysis could be done for the Off-White sneakers, which might lead to different results.

# Cleaning the Data

In [None]:
# Load the StockX contest sales file from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/stockx-data-contest/StockX-Data-Contest-2019-3.csv'
)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Count the missing values in every column.
df.isna().sum()

Since there aren't any NA values, we can go on.

In [None]:
# Inspect the column dtypes to see which columns need converting.
df.dtypes

1st step: adjust data types
    
Release Date and Order Date -> datetime

Retail and Sale prices -> float

In [None]:
# Convert the currency-formatted price columns to floats (presumably values
# such as "$1,097" -- the original code dropped the first character and the
# thousands separators). The conversion is vectorised and goes through
# astype(str), so re-running this cell on already-converted float columns does
# not raise, and a value without a leading currency sign no longer loses its
# first digit.
for price_col in ['Sale Price', 'Retail Price']:
    df[price_col] = (
        df[price_col]
        .astype(str)
        .str.replace(r'[$,]', '', regex=True)
        .astype(float)
    )

# Parse both date columns to datetime64 so they can be subtracted later on.
for date_col in ['Release Date', 'Order Date']:
    df[date_col] = pd.to_datetime(df[date_col])

I'll create two additional columns: Time Released, the number of months between the sneaker's release date and the order date, and Profit, which is (Sale Price - Retail Price).

In [None]:
# Whole months between release and order, truncated toward zero.
# Month-sized timedelta units such as np.timedelta64(1, 'M') are ambiguous and
# are rejected by recent pandas versions, so an explicit average Gregorian
# month (365.2425 / 12 days) is used instead; this is the same length that the
# 'M' unit used to stand for.
DAYS_PER_MONTH = 30.436875
df['Time Released'] = (
    (df['Order Date'] - df['Release Date']) / pd.Timedelta(days=DAYS_PER_MONTH)
).astype(int)  # in months

In [None]:
# Resale profit per order: what the buyer paid minus the retail price.
df['Profit'] = df['Sale Price'].sub(df['Retail Price'])

In [None]:
# Check the converted columns and the two new ones (Time Released, Profit).
df.head()

# EDA

The first step of my exploratory data analysis is to see which sneakers are more popular (i.e. had more sales on StockX) and which factors influence their price.

## Yeezy

In [None]:
# Keep only the Yeezy sales. Brand labels are compared after stripping
# surrounding whitespace: the previous filter matched the literal ' Yeezy'
# (with a leading space) and would silently select nothing if the labels were
# ever cleaned up.
yzy = df[df['Brand'].str.strip() == 'Yeezy']
# Non-null value counts of every column, one row per sneaker model.
yzy_count = yzy.groupby(['Sneaker Name']).count()
yzy_count.head()

In [None]:
# Bar plot of the number of Yeezy sales per sneaker model
figure, axis = plt.subplots()
sneaker_names = list(yzy_count.index)
plt.xticks(range(len(sneaker_names)), sneaker_names, rotation = 90)
plot = axis.bar(yzy_count.index, yzy_count['Order Date'])
# Write the integer count just above the top of every bar.
for bar in plot:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    axis.text(bar_center, 1.002 * bar_top, '%d' % int(bar_top),
              ha='center', va='bottom')

plt.title('Bar Plot of Yeezys sold in this period')
plt.show()

As you can see, the most sold Yeezy 350 was the Butter, followed by Beluga V2 and Zebra.

Which factors affect the Yeezy selling? First, I'll analyze if it's the price:

In [None]:
# Mean sale price per Yeezy model. The pivot is restricted to 'Sale Price':
# with no `values`, pivot_table tries to average every remaining column, and
# recent pandas versions raise a TypeError when asked to average text columns.
table = pd.pivot_table(
    yzy,
    index=['Sneaker Name'],
    values=['Sale Price'],
    aggfunc='mean',
)

table['Sale Price'].plot(kind='bar')
plt.show()

In [None]:
# Five models with the lowest mean sale price, cheapest first.
cheapest = table['Sale Price'].sort_values().head(5)
print('5 Cheapest Yeezys: ', cheapest)

It's clear that Zebras and Butter are among the cheapest Yeezy's. Also, Beluga V2 does not have a very high price, as you can see from the graph.

Now, I'll look at the influence of the release date for these sneakers. First, let's see which are the newest and the oldest on the market.

In [None]:
# Earliest release date recorded for each model, oldest first. The column is
# selected before aggregating so min() is computed only for 'Release Date'
# instead of for every column of the frame.
yzy.groupby('Sneaker Name')['Release Date'].min().sort_values()

Now, I'll see if there is any correlation between the Time Released and the Sale Price

In [None]:
from scipy import stats

# For each Yeezy model, in value_counts() order (most sales first), print the
# Pearson correlation between its sale price and its months since release.
sneaker_order = yzy['Sneaker Name'].value_counts().index.values
for snkr in sneaker_order:
    is_model = yzy['Sneaker Name'] == snkr
    yzy_s = yzy[is_model].copy()

    # pearsonr returns (coefficient, p-value); only the coefficient is reported.
    corr = stats.pearsonr(yzy_s['Sale Price'], yzy_s['Time Released'])
    coefficient = corr[0].round(2)

    print("Correlation of Sale Price and Time Released of the {}: {}\n".format(snkr, coefficient))

A lot of sneakers show a strong negative correlation between Sale Price and Time Released. It means that, as time passes, the sneaker loses value. However, some sneakers, like the V2 Beluga, one of the best sellers, hold their value over time. Let's see this below:

In [None]:
# Small-multiples grid with one panel per Yeezy model, most-sold first.
# The grid is sized from the number of models instead of a fixed 5x4, so that
# zip() can no longer silently drop models beyond the 20th.
sneaker_names = yzy['Sneaker Name'].value_counts().index.values
n_cols = 4
n_rows = max(1, -(-len(sneaker_names) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
fig.suptitle('Sale Price series for each Sneaker')

panels = axes.ravel()
for snkr, ax in zip(sneaker_names, panels):

    yzy_s = yzy[yzy['Sneaker Name'] == snkr].copy()
    # Plotted against the DataFrame row index, not against the order date.
    snkr_serie = yzy_s['Sale Price']
    ax.plot(snkr_serie)
    # Drop the first 23 characters of the name to keep titles short
    # (presumably the shared 'Adidas-Yeezy-Boost-350-' prefix -- TODO confirm).
    ax.set_title(snkr[23:], fontsize=5)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

# Hide the panels left over when the grid has more cells than models.
for unused_ax in panels[len(sneaker_names):]:
    unused_ax.set_visible(False)

plt.show()

Another metric that seems to affect sneaker resell price is the Shoe Size. I'll start analyzing it by seeing which sizes are more popular, then I'll see how much this affects in the price.

In [None]:
# Number of Yeezy sales for each shoe size
yzy_count = yzy.groupby(['Shoe Size']).count()

# Bar plot of Yeezy sales by shoe size, with one tick per size
figure, axis = plt.subplots()
shoe_sizes = yzy_count.index
plt.xticks(list(shoe_sizes), rotation = 45)
plot = axis.bar(shoe_sizes, yzy_count['Order Date'], width=.4)
# Write the integer count just above the top of every bar.
for bar in plot:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    axis.text(bar_center, 1.002 * bar_top, '%d' % int(bar_top),
              ha='center', va='bottom', fontsize=8)

plt.show()

The most sold sneakers are from 10, 9 and 9.5 size. Let's analyze the prices by the shoe size, now.

In [None]:
# Per-size means of the numeric columns. numeric_only=True is required because
# pandas >= 2.0 raises a TypeError when mean() meets text columns such as the
# sneaker name; only 'Sale Price' is used from this frame further down.
yzy_mean = yzy.groupby('Shoe Size').mean(numeric_only=True)

In [None]:
# Bar plot of the mean Yeezy sale price by shoe size
figure, axis = plt.subplots()
plt.xticks(list(yzy_mean.index),rotation = 45)
plot = axis.bar(yzy_mean.index, yzy_mean['Sale Price'], width=.4)
# Label every bar with its mean price, truncated to an integer.
for rectangle in plot:
    height = rectangle.get_height()
    axis.text(rectangle.get_x() + rectangle.get_width() /2., 1.002 *
        height, '%d' % int(height), ha='center', va = 'bottom', fontsize=8)

# Explicit show(), as in the other plotting cells of this notebook.
plt.show()

In [None]:
# Distribution of Yeezy sale prices for each shoe size.
ax = sns.boxplot(x='Shoe Size', y='Sale Price', data=yzy)
# Turn the size labels vertical.
size_labels = ax.get_xticklabels()
ax.set_xticklabels(size_labels, rotation=90)
plt.show()

Although shoe size is relevant to how many sneakers are sold, it doesn't seem to be relevant in determining the price of a sneaker. Most sizes have a similar mean price, although some of them have a higher median, which may be due to their larger number of sales.

I'll check how Profit, Time Released and Shoe Size relate to each other across all Yeezys taken together. The idea is to see whether the Yeezy profit is correlated with these variables at all.

In [None]:
# Pairwise scatter plots with regression fits for the three variables of interest.
sns.set(style='ticks', color_codes=True)
pair_vars = ['Profit', 'Time Released', 'Shoe Size']
sns.pairplot(data=yzy, vars=pair_vars, kind='reg')
plt.show()

It's clear that the Yeezys differ a lot individually. The Beluga's behavior in our analysis was completely different from the Cream White's, for example. Because of these differences, no correlation appears in the macro scenario, but when analyzing the sneakers individually, as I did before, we can find some interesting features to help the regressor.

Finally, I'll compare the Yeezys against the Off-Whites. Let's see if there is any clear information that can help.

In [None]:
# Number of recorded sales for each brand.
sales_per_brand = df.groupby('Brand').count()['Order Date']
sales_per_brand.plot.bar()
plt.show()

In [None]:
# Mean resale profit for each brand. 'Profit' is selected before aggregating:
# calling mean() on the whole grouped frame raises a TypeError in
# pandas >= 2.0 because of the text columns.
df.groupby('Brand')['Profit'].mean().plot(kind='bar')
plt.show()

In [None]:
# Sale price distribution of each brand, outliers included.
sns.boxplot(data=df, x='Brand', y='Sale Price')
plt.show()

The Off-Whites seem to have a higher profit, but they have fewer sales. This is probably because fewer pairs are sold at retail, which makes their demand higher relative to supply. Also, Off-Whites have more prices that can be considered outliers, i.e. much bigger than the mean, than Yeezys, as you can see in the boxplot.

# Regression

Finally, I'll apply a linear regression model to the Yeezy dataset, to predict which price is considered good to sell any given sneaker in this list.

For this, I'll use as features the ones that presented the highest correlation: Time Released and, of course, the name of the sneaker as a categorical one.

## Not using the Shoe Size as feature

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

In [None]:
# Inputs: the sneaker model and the months since release.
# Value to predict: the resale price.
features = ['Sneaker Name', 'Time Released']
target = 'Sale Price'

X, y = yzy[features], yzy[target]

In [None]:
# One-hot encode the feature frame with get_dummies' default column selection.
X = pd.get_dummies(data=X)

In [None]:
# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only and
# then reused unchanged on the test rows.
sc = preprocessing.StandardScaler()
sc.fit(X_train)

X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Fit a linear regression on the standardised training data.
regressor = LinearRegression()
regressor.fit(X=X_train_std, y=y_train)

In [None]:
# Score of the fitted model on the held-out test set; for LinearRegression
# this is the R^2 coefficient of determination.
regressor.score(X_test_std, y_test)

In [None]:
# Predict the test-set prices and put them next to the real ones.
y_pred = regressor.predict(X_test_std)

evaluate = pd.DataFrame({
    'Actual': y_test.values.flatten(),
    'Predicted': y_pred.flatten(),
})
evaluate.head(10)

In [None]:
# Actual vs. predicted price for the first ten test rows.
first_ten = evaluate.head(10)
first_ten.plot.bar()
plt.show()

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Test-set metrics for the model without Shoe Size.
# R2 score: coefficient of determination
print(f"R2 score: {r2_score(y_test, y_pred)}")
# Mean Absolute Error (MAE), in the same unit as Sale Price
print(f"MAE score: {mean_absolute_error(y_test, y_pred)}")
# Mean Squared Error (MSE), in squared Sale Price units
print(f"MSE score: {mean_squared_error(y_test, y_pred)}")

My regressor had an R² score of 0.78 (this is the coefficient of determination, not an accuracy), which I consider good, given that we are using only two variables. Now, I'll try to see the influence of the Shoe Size by training the model again with it as a feature. It's important to notice that Shoe Size is treated as a categorical variable!

## Using the Shoe Size as feature

In [None]:
# Same target as before, with Shoe Size added to the inputs.
features = ['Sneaker Name', 'Time Released', 'Shoe Size']
target = 'Sale Price'

X, y = yzy[features], yzy[target]

In [None]:
# One-hot encode both categorical inputs. Shoe Size is listed explicitly so it
# is encoded as categories instead of being used as a number.
categorical_cols = ['Sneaker Name', 'Shoe Size']
X = pd.get_dummies(X, columns=categorical_cols)

In [None]:
# Preview the encoded feature matrix.
X.head()

In [None]:
# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardise with statistics learned from the training rows only.
sc = preprocessing.StandardScaler()
sc.fit(X_train)

X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Fit the linear regression on the standardised training data.
regressor = LinearRegression()
regressor.fit(X=X_train_std, y=y_train)

In [None]:
# Score (R^2 for LinearRegression) on the held-out test set for the model that
# includes Shoe Size.
regressor.score(X_test_std, y_test)

In [None]:
# Predict the test-set prices and put them next to the real ones.
y_pred = regressor.predict(X_test_std)

evaluate = pd.DataFrame({
    'Actual': y_test.values.flatten(),
    'Predicted': y_pred.flatten(),
})
evaluate.head(10)

In [None]:
# Actual vs. predicted price for the first ten test rows.
evaluate.head(10).plot(kind = 'bar')
# Explicit show(), as in the matching cell of the first model; it also keeps
# the Axes repr out of the cell output.
plt.show()

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Test-set metrics for the model that includes Shoe Size.
# R2 score: coefficient of determination
print(f"R2 score: {r2_score(y_test, y_pred)}")
# Mean Absolute Error (MAE), in the same unit as Sale Price
print(f"MAE score: {mean_absolute_error(y_test, y_pred)}")
# Mean Squared Error (MSE), in squared Sale Price units
print(f"MSE score: {mean_squared_error(y_test, y_pred)}")

# Conclusion
In this notebook, we were able to see some of the main features that affect the price of Yeezy sneakers. A lot of sneakers start with a very high resell price that decreases over time; others, which I would consider a good investment, show high profit, high demand and low devaluation.

We can see that Yeezy sneakers are very popular, too. Even with some high prices, they sell almost 3x as much as Off-Whites, which probably have fewer pairs in stock when they are released at retail.

After that, I built a model to predict the price of the Yeezy sneakers. It performed well, considering that I expected it to be hard to predict the prices with such a small number of variables, but I think that I used, in this model, the 3 most important ones: time passed since release, shoe size and the sneaker model.