# Zadanie domowe 2

In [212]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [213]:
data = pd.read_csv('allegro-api-transactions.csv')

In [214]:
data.head()

Unnamed: 0,lp,date,item_id,categories,pay_option_on_delivery,pay_option_transfer,seller,price,it_is_allegro_standard,it_quantity,it_is_brand_zone,it_seller_rating,it_location,main_category
0,0,2016-04-03 21:21:08,4753602474,"['Komputery', 'Dyski i napędy', 'Nośniki', 'No...",1,1,radzioch666,59.99,1,997,0,50177,Warszawa,Komputery
1,1,2016-04-03 15:35:26,4773181874,"['Odzież, Obuwie, Dodatki', 'Bielizna damska',...",1,1,InwestycjeNET,4.9,1,9288,0,12428,Warszawa,"Odzież, Obuwie, Dodatki"
2,2,2016-04-03 14:14:31,4781627074,"['Dom i Ogród', 'Budownictwo i Akcesoria', 'Śc...",1,1,otostyl_com,109.9,1,895,0,7389,Leszno,Dom i Ogród
3,3,2016-04-03 19:55:44,4783971474,"['Książki i Komiksy', 'Poradniki i albumy', 'Z...",1,1,Matfel1,18.5,0,971,0,15006,Wola Krzysztoporska,Książki i Komiksy
4,4,2016-04-03 18:05:54,4787908274,"['Odzież, Obuwie, Dodatki', 'Ślub i wesele', '...",1,1,PPHU_RICO,19.9,1,950,0,32975,BIAŁYSTOK,"Odzież, Obuwie, Dodatki"


W zbiorze jest 6 zmiennych kategorycznych: pay_option_on_delivery, pay_option_transfer, it_is_allegro_standard, it_is_brand_zone, it_location, main_category. Zmiennej seller nie rozpatruję.

In [215]:
pd.unique(data.it_location).shape

(10056,)

In [216]:
data.groupby('it_location').size().nlargest(50)

it_location
Warszawa                   23244
Kraków                     15135
Łódź                       10935
Poznań                     10610
Wrocław                     8646
Białystok                   7356
Lublin                      6654
Rzeszów                     6054
Kielce                      5779
Częstochowa                 5601
Internet                    4514
INTERNET                    3956
Gdańsk                      3933
Szczecin                    3929
Katowice                    3720
Bydgoszcz                   3708
Koszalin                    3301
Zielona Góra                3100
Gdynia                      2851
WARSZAWA                    2672
internet                    2519
Dębica                      2417
Cała Polska                 2224
Bielsko-Biała               2199
Radom                       2097
Polska                      2086
Toruń                       2000
Chrzypsko Wielkie           1984
Tarnowskie Góry             1851
kobylnica                   182

W it_location jest bardzo wiele unikalnych wartości, może wystąpić problem, gdy w nieznanych danych pojawi się nowa miejscowość.

In [217]:
pd.unique(data.main_category).shape

(27,)

## Wstępne przygotowanie danych
Na podstawie pracy domowej 1.

In [218]:
data = data.drop(columns='lp')
data = data.drop(columns='seller')
data = data.drop(columns='date')
data = data.drop(columns='categories')

In [219]:
data.shape

(420020, 10)

In [220]:
((data.it_quantity<=0) | (data.it_seller_rating<=0) | (data.price<=0)).sum()

63778

In [221]:
ind = (data.it_quantity<=0) | (data.it_seller_rating<=0) | (data.price<=0)

In [222]:
data = data.loc[~ind,:]

In [223]:
data.shape

(356242, 10)

In [224]:
data.loc[:,'price'] = np.log(data.price)
data.loc[:,'it_quantity'] = np.log(data.it_quantity)
data.loc[:,'it_seller_rating'] = np.log(data.it_seller_rating)

## Podział danych

Jako zmienna objaśnianą przyjmuję it_quantity.

In [225]:
from sklearn.model_selection import train_test_split

In [226]:
X = data.drop(columns='it_quantity')
y = data.it_quantity

In [227]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [228]:
X_train.reset_index(inplace=True)
X_test.reset_index(inplace=True)

## Kodowania

### One Hot encoding
Dla zmiennej main_category zastosujemy One Hot encoding. Tu raczej nie musimy się martwić tym, że nagle pojawi się dodatkowa kategoria.

In [229]:
from sklearn.preprocessing import OneHotEncoder

In [230]:
onh = OneHotEncoder()
tmp = pd.DataFrame(onh.fit_transform(X_train.main_category.values.reshape(-1,1)).toarray())

In [231]:
X_train = X_train.drop(columns='main_category')
X_train = pd.concat((X_train, tmp), axis=1)

In [232]:
X_train.shape

(267181, 35)

In [233]:
X_train = X_train.drop(columns='index')

In [234]:
X_test = X_test.drop(columns='index')

In [235]:
tmp = pd.DataFrame(onh.transform(X_test.main_category.values.reshape(-1,1)).toarray())
X_test = X_test.drop(columns='main_category')
X_test = pd.concat((X_test, tmp), axis=1)

In [236]:
X_test.shape

(89061, 34)

### Działanie modelu bez kolumny it_location

In [237]:
from sklearn.linear_model import LinearRegression

In [240]:
X_train_2 = X_train.drop(columns='it_location')
X_test_2 = X_test.drop(columns='it_location')

lr = LinearRegression()
lr.fit(X_train_2, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [242]:
y_pred = lr.predict(X_test_2)

print(mean_squared_error(y_test, y_pred),
     mean_absolute_error(y_test, y_pred),
     r2_score(y_test, y_pred))

6.95724307951012 2.080567612841791 0.15370237869283976


### Target Encoding

In [243]:
from category_encoders.target_encoder import TargetEncoder

In [244]:
te = TargetEncoder(cols=['it_location'])

In [245]:
X_train_te = te.fit_transform(X_train, y_train)

#### Model

In [246]:
from sklearn.linear_model import LinearRegression

In [247]:
lr = LinearRegression()
lr.fit(X_train_te, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [248]:
X_test_te = te.transform(X_test)
y_pred = lr.predict(X_test_te)

print(mean_squared_error(y_test, y_pred),
     mean_absolute_error(y_test, y_pred),
     r2_score(y_test, y_pred))

6.957235480576751 2.0805588054199493 0.153703303047381


# Wnioski

Rozkodowanie kolumny it_location używając target encoding mało wniosło do modelu.