In [80]:
import pandas as pd
import numpy as np
from collections import Counter

## ML Libraries
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Plotting Libraries
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [2]:
!ls

1.baseline.ipynb      test.tsv              test_stg2.tsv.zip
sample_submission.csv test_stg2.tsv         train.tsv


### **"Objective"** --> predict sales price of an item  

In [3]:
submit = pd.read_csv('sample_submission.csv')

In [4]:
submit.tail()

Unnamed: 0,test_id,price
693354,693354,26.738
693355,693355,26.738
693356,693356,26.738
693357,693357,26.738
693358,693358,26.738


In [5]:
train = pd.read_csv('train.tsv', sep='\t').reset_index(drop=True)
print(len(train))
train.tail()

1482535


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."
1482531,1482531,Little mermaid handmade dress,2,Kids/Girls 2T-5T/Dresses,Disney,14.0,0,Little mermaid handmade dress never worn size 2t
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0,"Used once or twice, still in great shape."
1482533,1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...
1482534,1482534,Brand new lux de ville wallet,1,Women/Women's Accessories/Wallets,,22.0,0,"New with tag, red with sparkle. Firm price, no..."


## Description
- **item_condition_id** - the condition of the items provided by the seller
- **category_name** - category of the listing
- **shipping** - 1 if shipping fee is paid by seller and 0 by buyer

In [6]:
train = train.where((pd.notnull(train)), None)
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [7]:
for i in range(0,10):
    print(i, train.loc[i].item_description)

0 No description yet
1 This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.
2 Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!
3 New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage
4 Complete with certificate of authenticity
5 Banana republic bottoms, Candies skirt with matching blazer,Amy Byers suit, Loft bottoms and cami top.
6 Size small but straps slightly shortened to fit xs, besides that, perfect condition
7 You get three pairs of Sophie cheer shorts size small and medium girls and two sports bra/boy shorts spandex matching sets in small and medium girls. All items total retail for [rm] in store and you can take him today for less than the price of one

In [8]:
for i in range(0,10):
    print(i, train.loc[i].brand_name)

0 None
1 Razer
2 Target
3 None
4 None
5 None
6 Acacia Swimwear
7 Soffe
8 Nike
9 None


## Check most expensive brands

In [9]:
most_expensive_brands = list(set(train.brand_name[train.price >1500]))
print(len(most_expensive_brands))
most_expensive_brands[0:10]

10


['Saint Laurent',
 'David Yurman',
 'Celine',
 'Mary Kay',
 'Michael Kors',
 'Apple',
 'Urban Decay',
 'Chanel',
 'Louis Vuitton',
 None]

## Which products are above 1500 price

In [10]:
train[train.price > 1500][0:15]

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
1971,1971,Chanel Classic Flag Bag medium Caviar L,3,Women/Women's Handbags/Shoulder Bag,Chanel,1506.0,0,No description yet
145696,145696,Louis Vuitton Monogram Metis Bag,2,Women/Women's Handbags/Shoulder Bag,Louis Vuitton,1759.0,0,32 L x 36 H x 14 W cm Features: Open top; One ...
164527,164527,MONOGRAM SAINT LAURENT WALLET MATELASSÉ,1,Women/Women's Handbags/Shoulder Bag,Saint Laurent,1525.0,1,"$ 1,550.00 RETAIL MONOGRAM SAINT LAURENT CHAIN..."
180749,180749,NEW apple products,1,Electronics/Cell Phones & Accessories/Cell Pho...,Apple,1909.0,0,"bundle includes: ipad pro keyboard, iphone 6S ..."
182847,182847,Auth. chanel boy maxi lambskin France,3,Women/Women's Handbags/Shoulder Bag,Chanel,1850.0,1,Elegant authentic chanel boy bag made in Franc...
194639,194639,iPad Pro,1,Electronics/Computers & Tablets/iPad/Tablet/eB...,Apple,1900.0,1,"Great condition iPad Pro, biggest size with lo..."
229506,229506,Elegant authentic Chanel boy bag France,2,Women/Women's Handbags/Shoulder Bag,Chanel,1625.0,1,Ob hold. ...Beautiful and elegant authentic Ch...
244054,244054,LV Bag,2,Women/Women's Handbags/Shoulder Bag,Louis Vuitton,1815.0,0,9/10 condition like new
279398,279398,Louis Vuitton Kensington Damier Ebene,2,Vintage & Collectibles/Bags and Purses/Handbag,Louis Vuitton,1600.0,1,Brand new Bag 100% authentic Perfect condition...
339886,339886,Chanel Le Boy,3,Women/Women's Handbags/Shoulder Bag,Chanel,1609.0,0,Large. Authentic. Comes with dustbag and authe...


In [11]:
ultra_cheap_brands = list(set(train.brand_name[train.price <1]))
print(len(ultra_cheap_brands))
ultra_cheap_brands[0:10]

237


['Anne Klein',
 'Penn Fishing',
 'Army',
 'MARC BY MARC JACOBS',
 'Kendra Scott',
 'Free People',
 'Bath & Body Works',
 'Jovani',
 'Juicy Couture',
 'Younique']

## Which products are below 1 price - dropping 0 price products

In [12]:
print(len(train))
print(len(train[train.price == 0]))

1482535
874


In [13]:
train = train[train.price != 0].reset_index(drop=True)

In [14]:
print(len(train))
print(len(train[train.price == 0]))

1481661
0


## Count number of Brands 

In [15]:
total_number_brands = print(len(set(train.brand_name)))

4808


In [16]:
brand_counter = Counter(list(train.brand_name))

In [17]:
brand_counter.most_common(n=10)

[(None, 632336),
 ('PINK', 54072),
 ('Nike', 54006),
 ("Victoria's Secret", 48011),
 ('LuLaRoe', 30995),
 ('Apple', 17314),
 ('FOREVER 21', 15178),
 ('Nintendo', 14998),
 ('Lululemon', 14550),
 ('Michael Kors', 13916)]

### Feature #1 - Brand name encoded

In [18]:
train["brand_name_encoded"] = ""
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,


In [19]:
label_enc_brand = LabelEncoder()
train.brand_name_encoded = label_enc_brand.fit_transform(train["brand_name"].astype(str))
train.loc[0:5]

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,3076
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3556
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,4179
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3076
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3076
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,59.0,0,"Banana republic bottoms, Candies skirt with ma...",3076


### Feature #2 - Category name encoded

In [20]:
train["category_name_encoded"] = ""
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,3076,
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3556,
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,4179,
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3076,
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3076,


In [21]:
label_enc_categories = LabelEncoder()

In [22]:
train.category_name_encoded = label_enc_categories.fit_transform(train["category_name"].astype(str))
train.loc[0:5]

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,3076,808
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3556,86
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,4179,1255
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3076,485
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3076,1182
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,59.0,0,"Banana republic bottoms, Candies skirt with ma...",3076,1194


In [23]:
cat_counter = Counter(list(train.category_name))

In [24]:
cat_counter.most_common(n=10)

[('Women/Athletic Apparel/Pants, Tights, Leggings', 60152),
 ('Women/Tops & Blouses/T-Shirts', 46349),
 ('Beauty/Makeup/Face', 34320),
 ('Beauty/Makeup/Lips', 29901),
 ('Electronics/Video Games & Consoles/Games', 26547),
 ('Beauty/Makeup/Eyes', 25200),
 ('Electronics/Cell Phones & Accessories/Cases, Covers & Skins', 24668),
 ('Women/Underwear/Bras', 21254),
 ('Women/Tops & Blouses/Tank, Cami', 20270),
 ('Women/Tops & Blouses/Blouse', 20269)]

In [25]:
cat_counter['Beauty/Makeup/Face']

34320

In [26]:
a = "Check whether you have started the CoreNLP server e.g"

In [27]:
"CoreNLP server" in a

True

### Feature #3 - Most expensive price (Brand)

In [28]:
train["most_expensive_brand"] = ""
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded,most_expensive_brand
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,3076,808,
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3556,86,
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,4179,1255,
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3076,485,
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3076,1182,


In [29]:
list_brands = list(set(train.brand_name))
list_brands_avg_price = []

In [30]:
for brand in list_brands:
    avg_price = train.price[train.brand_name == brand].mean()
    list_brands_avg_price.append(avg_price)

In [31]:
prices_avg = list(zip(list_brands, list_brands_avg_price))
prices_avg[0:10]

[('Fox', 15.875),
 ('Gerber', 13.412234042553191),
 ('Billabong', 15.94979079497908),
 ('Valve', 30.0),
 ('Spyder', 36.34117647058824),
 ('KR3W', 16.0),
 ('Berek', 13.5),
 ('Melannco', 9.0),
 ('She and Sky', 16.333333333333332),
 ('ACCEL', 22.0)]

In [32]:
## 10 most expensive brands (AVG)
most_exp_avg = sorted(prices_avg, key=lambda x: x[1], reverse = True)[0:20]
most_exp_avg

[('Demdaco', 429.0),
 ('Proenza Schouler', 413.25),
 ('MCM Worldwide', 289.17391304347825),
 ('Longines', 254.0),
 ('Vitamix', 253.11111111111111),
 ('Blendtec', 251.8),
 ('David Yurman', 248.61157024793388),
 ('Celine', 238.78082191780823),
 ('Saint Laurent', 232.578125),
 ('Canada Goose', 230.25),
 ('Valentino Garavani', 227.0),
 ('Mackage', 221.0),
 ('MICHELE', 216.98305084745763),
 ('Alexander Wang', 205.7027027027027),
 ('Contours', 203.33333333333334),
 ('EVGA', 202.66666666666666),
 ('Louis Vuitton', 202.1876372764355),
 ('Stephen Webster', 200.0),
 ('Alyce Paris', 200.0),
 ('Mikimoto', 189.0)]

In [33]:
## 10 least expensive brands (AVG)
least_exp_avg = sorted(prices_avg, key=lambda x: x[1], reverse = False)[0:20]
least_exp_avg

[('Scunci', 3.0),
 ('Play MG', 3.0),
 ('Ask', 3.0),
 ('Old Glory', 3.0),
 ('Feetures!', 3.0),
 ('Peanut Shell', 3.0),
 ('Kids Only', 3.0),
 ('A.B.S. by Allen Schwartz', 3.0),
 ('Kae Argatherapie', 3.0),
 ('DUO', 3.0),
 ('Revlon Colorsilk', 3.0),
 ('Genica', 3.0),
 ('First Wave', 4.0),
 ('Grant Thomas', 5.0),
 ('Avon Cosmetics, Inc', 5.0),
 ('DenTek', 5.0),
 ('Deco Mesh', 5.0),
 ('Coed Sportwear', 5.0),
 ('Animal Adventure', 5.0),
 ('PEZ', 5.8)]

In [34]:
most_exp_avg

[('Demdaco', 429.0),
 ('Proenza Schouler', 413.25),
 ('MCM Worldwide', 289.17391304347825),
 ('Longines', 254.0),
 ('Vitamix', 253.11111111111111),
 ('Blendtec', 251.8),
 ('David Yurman', 248.61157024793388),
 ('Celine', 238.78082191780823),
 ('Saint Laurent', 232.578125),
 ('Canada Goose', 230.25),
 ('Valentino Garavani', 227.0),
 ('Mackage', 221.0),
 ('MICHELE', 216.98305084745763),
 ('Alexander Wang', 205.7027027027027),
 ('Contours', 203.33333333333334),
 ('EVGA', 202.66666666666666),
 ('Louis Vuitton', 202.1876372764355),
 ('Stephen Webster', 200.0),
 ('Alyce Paris', 200.0),
 ('Mikimoto', 189.0)]

In [35]:
company, price = zip(*most_exp_avg)
train['most_expensive_brand']=train['brand_name'].apply(lambda x: 1 if x in list(company) else 0)
print(len(train[train['most_expensive_brand']==1]))
train[train['most_expensive_brand']==1][0:10]

3852


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded,most_expensive_brand
766,766,Louis Vuitton wallet,2,Women/Women's Accessories/Wallets,Louis Vuitton,15.0,0,"Not used it was a gift, great condition",2591,1277,1
822,822,Louis Vuitton box include dust bag,1,Women/Women's Accessories/Wallets,Louis Vuitton,39.0,0,Louis Vuitton wallet box with dust bag No flaw...,2591,1277,1
1333,1334,Authentic Louis Vuitton Mini Lin,3,Vintage & Collectibles/Bags and Purses/Handbag,Louis Vuitton,306.0,0,Louis Vuitton Speedy 30 M.M. Lin Ebene. Good c...,2591,978,1
2205,2206,Louis Vuitton COMPACT CURIEUSE WALLET,2,Women/Women's Accessories/Wallets,Louis Vuitton,316.0,0,"Bought it 2013 - Excellent, condition - Includ...",2591,1277,1
2567,2569,Phantom Trapeze Handbag,2,Women/Women's Handbags/Totes & Shoppers,Celine,70.0,0,NWOT; black vegan leather Phantom Trapeze larg...,880,1287,1
2687,2689,David Yurman cable classics bracelet,3,Women/Jewelry/Bracelets,David Yurman,200.0,1,David Yurman cable classics bracelet with pear...,1197,1180,1
2804,2807,LV artsy,3,Women/Women's Handbags/Shoulder Bag,Louis Vuitton,180.0,1,No description yet,2591,1286,1
3306,3309,Louis Vuitton Tivoli GM,3,Women/Women's Handbags/Shoulder Bag,Louis Vuitton,749.0,0,The lather is even honey patina clean inside v...,2591,1286,1
4146,4151,Gold chain,2,Women/Women's Handbags/Messenger & Crossbody,Louis Vuitton,24.0,0,It's not branded not a Louis Vuitton chain but...,2591,1283,1
4420,4425,Louis Vuitton Clutch,3,Women/Women's Handbags/Messenger & Crossbody,Louis Vuitton,66.0,0,Genuine Leather. A cute clutch/wristlet. It's ...,2591,1283,1


In [36]:
train['least_expensive_brand']=""
company, price = zip(*least_exp_avg)
train['least_expensive_brand']=train['brand_name'].apply(lambda x: 1 if x in list(company) else 0)
print(len(train[train['least_expensive_brand']==1]))
train[train['least_expensive_brand']==1][0:10]

24


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded,most_expensive_brand,least_expensive_brand
83235,83291,RSVD@ALEXIS Duo WHITE clear strip glue,1,Beauty/Makeup/Eyes,DUO,3.0,1,√RESERVED@ALEXIS purchase Only. ########## PLE...,1166,26,0,1
128217,128293,Color Silk 54 Light Golden Brown,2,Women/Women's Accessories/Hair Accessories,Revlon Colorsilk,3.0,1,Never used! Comes with conditioner.,3619,1272,0,1
222222,222339,Feetures running socks,3,Women/Athletic Apparel/Socks,Feetures!,3.0,1,No description yet,1546,1143,0,1
251041,251170,Avengers pez candy set,1,Kids/Toys/Action Figures & Statues,PEZ,10.0,0,Avengers pez candy set. Iron man with light up...,3216,683,0,1
451291,451514,Grant Thomas black genuine leather belt,3,Men/Men's Accessories/Belts,Grant Thomas,5.0,1,This belt is super nice! It is made of black g...,1808,744,0,1
596492,596805,Red luxury men's Watch,1,Electronics/Cell Phones & Accessories/Cell Pho...,Genica,3.0,1,New* 100% working Red luxury watch Fast shipping,1718,80,0,1
627924,628259,Luxurious earrings,1,Women/Jewelry/Earrings,Kae Argatherapie,3.0,1,New never used,2283,1181,0,1
779679,780113,Bathroom Organizer,2,Home/Bath/Bathroom Accessories,"Avon Cosmetics, Inc",5.0,1,Avon Bathroom Organizer with 6 small pockets &...,359,418,0,1
853856,854325,☆10kt gold Diamond cut Cuban link chain☆,2,Men/Men's Accessories/Other,Ask,3.0,1,I am selling this 30 inch 10 karat gold neckla...,314,746,0,1
935615,936136,Plaid long sleeve,3,Men/Tops/Button-Front,Old Glory,3.0,1,Nothing wrong with it! It's a men's large,3141,802,0,1


### NLP Features

In [37]:
import gensim
from string import punctuation
import re
import unicodedata

In [38]:
def remove_latin_character(text):
    text = unicodedata.normalize('NFKD',text).encode('ASCII','ignore').decode("utf-8") # remove latin utf-8 characters
    return text

In [39]:
def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

In [40]:
def remove_link(text):
    text = text.split()
    links = []
    for item in text:
        try:
            result = re.search('http(.*)', item)
            name = result.group(1)
            links.append(item)
        except:
            pass
    text_filtered = [x for x in text if x not in links]
    text_filtered = " ".join(text_filtered)
    return text_filtered, links

In [41]:
text = 'All items total retail for [rm] in store and you can take hi'

In [42]:
def remove_rm(text):
    text = text.split()
    text_filtered = [x for x in text if x != "[rm]"]
    text_filtered = " ".join(text_filtered)
    return text_filtered

In [43]:
remove_rm(text)

'All items total retail for in store and you can take hi'

In [44]:
def preprocess(text):
    # Replace punctuation with tokens so we can use them in our model
    text = text.lower()
    text = text.replace('\n', ' ')
    
    # Remove links 
    text, _ = remove_link(text)

    # Remove punctuation
    text = strip_punctuation(text)
    
    # Remove latin character
    
    text = remove_latin_character(text)
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Remove "RM" string
    text = remove_rm(text)
    
    #text = re.sub(r'[^\w\s]','',text) # strip punctuation
    
    text = ' '.join(text.split()) ## replacing extra spaces
    text = text.replace(".","")
    return text

In [45]:
for x,item in enumerate(train['item_description'][0:5]):
    print(x)
    print(item)
    print(preprocess(item))

0
No description yet
no description yet
1
This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.
this keyboard is in great condition and works like it came out of the box all of the ports are tested and work perfectly the lights are customizable via the razer synapse app on your pc
2
Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!
adorable top with a hint of lace and a key hole in the back the pale pink is a 1x and i also have a 3x available in white
3
New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage
new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of s

In [46]:
train = train[train.item_description != None].reset_index(drop=True)

In [47]:
#for i in range(0,1000):
    #if (type(train['item_description'][i])) != 'str':
#    print(preprocess(train['item_description'][i]))

In [48]:
train['item_description_clean'] = ""
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded,most_expensive_brand,least_expensive_brand,item_description_clean
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,3076,808,0,0,
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3556,86,0,0,
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,4179,1255,0,0,
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3076,485,0,0,
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3076,1182,0,0,


In [49]:
train['item_description_clean'] = train['item_description'].apply(lambda x: preprocess(str(x)))

In [50]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded,most_expensive_brand,least_expensive_brand,item_description_clean
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,3076,808,0,0,no description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3556,86,0,0,this keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,4179,1255,0,0,adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3076,485,0,0,new with tags leather horses retail for rm eac...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3076,1182,0,0,complete with certificate of authenticity


In [51]:
train['name_clean'] = ""
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded,most_expensive_brand,least_expensive_brand,item_description_clean,name_clean
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,3076,808,0,0,no description yet,
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3556,86,0,0,this keyboard is in great condition and works ...,
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,4179,1255,0,0,adorable top with a hint of lace and a key hol...,
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3076,485,0,0,new with tags leather horses retail for rm eac...,
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3076,1182,0,0,complete with certificate of authenticity,


In [52]:
train['name_clean'] = train['name'].apply(lambda x: preprocess(str(x)))

In [53]:
train.tail()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_name_encoded,category_name_encoded,most_expensive_brand,least_expensive_brand,item_description_clean,name_clean
1481656,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl...",1640,1166,0,0,lace says size small but fits medium perfectly...,free people inspired dress
1481657,1482531,Little mermaid handmade dress,2,Kids/Girls 2T-5T/Dresses,Disney,14.0,0,Little mermaid handmade dress never worn size 2t,1264,633,0,0,little mermaid handmade dress never worn size 2t,little mermaid handmade dress
1481658,1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0,"Used once or twice, still in great shape.",3076,895,0,0,used once or twice still in great shape,21 day fix containers and eating plan
1481659,1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...,3076,485,0,0,there is 2 of each one that you see so 2 red 2...,world markets lanterns
1481660,1482534,Brand new lux de ville wallet,1,Women/Women's Accessories/Wallets,,22.0,0,"New with tag, red with sparkle. Firm price, no...",3076,1277,0,0,new with tag red with sparkle firm price no fr...,brand new lux de ville wallet


## NLP Features

In [104]:
features = TfidfVectorizer(max_df=0.9, stop_words=None, lowercase=True)

In [105]:
x_train_description_tfidf = features.fit_transform(train['item_description_clean'])

In [56]:
#print(x_train_phrases_tfidf.toarray().size())

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(list(zip( x_train_description_tfidf.toarray(),
                                                                train['shipping'],
                                                                train['item_condition_id'],
                                                                train['category_name_encoded'],
                                                                train['brand_name_encoded'],
                                                                train['most_expensive_brand'],
                                                                train['least_expensive_brand'])),
                                                                train['price'], 
                                                                test_size=0.2, random_state=2017)

In [58]:
model = MLPRegressor(activation = 'relu', 
                       max_iter=400, 
                       hidden_layer_sizes=(100,), 
                       solver='adam')

In [61]:
model.fit(x_train, y_train)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [83]:
preds = model.predict(x_valid)

In [84]:
preds[3]

21.717261229099478

In [85]:
mean_squared_error(y_valid, preds)

1253.785134139714

## Predict prices based on the model

In [101]:
brand = 'Beautees'
print(label_enc_brand.transform(brand))

ValueError: bad input shape ()

In [89]:
shipping = 1
item_condition = 1