In [102]:
# Imports
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re

# scikit-learn moved the cross-validation helpers to `sklearn.model_selection`
# (0.18) and later removed `sklearn.cross_validation` (0.20). Prefer the current
# location and fall back to the old one so this cell imports on either version.
try:
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier


In [148]:
# Set up file read/folder structure
# The data directory is named once so the notebook can be pointed at another
# location by editing a single line; the default is the original location.
DATA_DIR = "C:\\Users\\erroden\\Desktop\\whiskey_prediction\\data"

train = pd.read_csv(DATA_DIR + "\\train.csv")
test = pd.read_csv(DATA_DIR + "\\test.csv")
sample_submission = pd.read_csv(DATA_DIR + "\\sample_submission.csv")

In [149]:
# Preview the first rows of the sample submission (columns: id, category).
sample_submission.head()

Unnamed: 0,id,category
0,955,1
1,3532,3
2,1390,2
3,1024,4
4,1902,2


In [150]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


In [151]:
# Summary statistics for the numeric training columns.
train.describe()

Unnamed: 0,id,price,ratingValue,pert_alcohol,category
count,2874.0,2811.0,2874.0,2814.0,2586.0
mean,2075.814544,225.297937,86.361517,48.043019,1.637664
std,1177.805945,990.619608,4.511554,6.298527,0.963049
min,1.0,7.5,60.0,33.0,1.0
25%,1087.25,50.0,84.0,43.3,1.0
50%,2109.5,80.0,87.0,46.0,1.0
75%,3102.0,140.0,90.0,51.375,2.0
max,4157.0,26650.0,97.0,98.6,4.0


In [152]:
# Column dtypes and non-null counts for the training data (shows where values are missing).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2874 entries, 0 to 2873
Data columns (total 7 columns):
id              2874 non-null int64
author          2874 non-null object
description     2874 non-null object
price           2811 non-null float64
ratingValue     2874 non-null int64
pert_alcohol    2814 non-null float64
category        2586 non-null float64
dtypes: float64(3), int64(2), object(2)
memory usage: 157.2+ KB


In [153]:
# Column dtypes and non-null counts for the test data (shows where values are missing).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 6 columns):
id              288 non-null int64
author          288 non-null object
description     288 non-null object
price           279 non-null float64
ratingValue     288 non-null int64
pert_alcohol    284 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 13.6+ KB


In [154]:
# Impute the column mean for the numeric features, since relatively few values are missing.
# The filled Series is assigned back to the frame instead of calling
# `train.price.fillna(..., inplace=True)`: an in-place method on a selected
# column is chained assignment, which pandas warns about and which no longer
# modifies `train` once Copy-on-Write is in effect.
for column in ("price", "pert_alcohol"):
    train[column] = train[column].fillna(train[column].mean())

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2874 entries, 0 to 2873
Data columns (total 7 columns):
id              2874 non-null int64
author          2874 non-null object
description     2874 non-null object
price           2874 non-null float64
ratingValue     2874 non-null int64
pert_alcohol    2874 non-null float64
category        2586 non-null float64
dtypes: float64(3), int64(2), object(2)
memory usage: 157.2+ KB


In [155]:
# Drop training rows whose category (the target) is unavailable. The label could
# be imputed instead, but that could introduce unknown error.
# The drop is restricted to `category` so it matches that intent: a missing value
# in some other column no longer silently removes a labelled row.
train = train.dropna(subset=["category"])
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2586 entries, 0 to 2873
Data columns (total 7 columns):
id              2586 non-null int64
author          2586 non-null object
description     2586 non-null object
price           2586 non-null float64
ratingValue     2586 non-null int64
pert_alcohol    2586 non-null float64
category        2586 non-null float64
dtypes: float64(3), int64(2), object(2)
memory usage: 161.6+ KB


In [156]:
# Extra feature: the character length of each description.
# Idea from https://medium.com/tensorist/classifying-yelp-reviews-using-nltk-and-scikit-learn-c58e71e962d9
train["text_len"] = train["description"].map(len)

In [209]:
# --- Text preparation for the training descriptions ---

# Raw tokens: split every description on single spaces.
train['tokens'] = train['description'].map(lambda text: text.split(" "))

# English stop words (a set, for fast membership tests) and the WordNet lemmatizer.
stops = set(stopwords.words('english'))
lemmatizer = nltk.WordNetLemmatizer()

# One pass per row: drop stop words, keep only tokens that start with an
# ASCII letter, and lemmatize whatever is left.
train['clean'] = train['tokens'].map(
    lambda tokens: [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stops and re.search('^[a-zA-Z]+', token)
    ]
)

# Re-join the cleaned tokens into one space-separated string per row.
train['ready'] = train['clean'].map(' '.join)

# Vectorize the cleaned text with TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer

# The document-frequency bounds shrink the feature space to something that
# won't crash my computer: max_df drops terms found in more than 90% of the
# documents, min_df drops terms found in fewer than 1% of them.
tfidf = TfidfVectorizer(min_df=.01, max_df=.9)

# Learn the vocabulary and IDF weights from the training text and transform it.
vect = tfidf.fit_transform(train['ready'])

In [210]:
# Dimensions of the TF-IDF matrix produced by fit_transform.
vect.shape # records x tokens

(2586, 804)

In [211]:
# Renumber the rows 0..n-1 (discarding the old labels) so the join below works.
train = train.reset_index(drop=True)

In [212]:
# Output variable for the classification: the category column
# (the TF-IDF scores are among the inputs).
y = train['category']

In [213]:
# Turn the TF-IDF matrix into a dense DataFrame so it can be pushed together with `train`.
dense_tfidf = vect.toarray()
vect = pd.DataFrame(dense_tfidf)
vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,794,795,796,797,798,799,800,801,802,803
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.232574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.156458,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [214]:
# Inspect the training frame, now including the text_len, tokens, clean and ready columns.
train.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category,text_len,tokens,clean,ready
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0,361,"[A, marriage, of, 13, and, 18, year, old, bour...","[A, marriage, year, old, bourbons., A, mature,...",A marriage year old bourbons. A mature yet ele...
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0,503,"[There, have, been, some, legendary, Bowmores,...","[There, legendary, Bowmores, mid-60s, every, b...",There legendary Bowmores mid-60s every bit equ...
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0,824,"[This, bottling, celebrates, master, distiller...","[This, bottling, celebrates, master, distiller...",This bottling celebrates master distiller Park...
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0,495,"[What, impresses, me, most, is, how, this, whi...","[What, impress, whisky, evolves;, incredibly, ...",What impress whisky evolves; incredibly comple...
4,9,Fred Minnick,"A caramel-laden fruit bouquet, followed by une...",150.0,96,54.49,2.0,415,"[A, caramel-laden, fruit, bouquet,, followed, ...","[A, caramel-laden, fruit, bouquet,, followed, ...","A caramel-laden fruit bouquet, followed unendi..."


In [215]:
# Place the TF-IDF columns to the right of the training columns.
frames = [train, vect]
X = pd.concat(frames, axis=1)
X.shape

(2586, 815)

In [216]:
# Column labels for the combined frame: training column names, then the TF-IDF column numbers.
new_col = list(train.columns) + list(vect.columns)

In [217]:
# Label the combined frame with the training column names followed by the TF-IDF column numbers.
X.columns = new_col

In [218]:
# Check the last rows of the combined frame.
X.tail()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category,text_len,tokens,clean,...,794,795,796,797,798,799,800,801,802,803
2581,4146,John Hansell,"Earthy, fleshy notes with brooding grape notes...",80.0,70,57.1,1.0,252,"[Earthy,, fleshy, notes, with, brooding, grape...","[Earthy,, fleshy, note, brooding, grape, note,...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2582,4153,Davin de Kergommeaux,With its overt floral perfume notes and the sc...,17.0,65,44.9,4.0,442,"[With, its, overt, floral, perfume, notes, and...","[With, overt, floral, perfume, note, scent, ch...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2583,4154,Geoffrey Kleinman,"An unaged whiskey from Carroll County, Iowa, w...",35.0,65,40.0,3.0,431,"[An, unaged, whiskey, from, Carroll, County,, ...","[An, unaged, whiskey, Carroll, County,, Iowa,,...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2584,4155,John Hansell,"Fiery peat kiln smoke, tar, and ripe barley on...",60.0,63,45.0,1.0,327,"[Fiery, peat, kiln, smoke,, tar,, and, ripe, b...","[Fiery, peat, kiln, smoke,, tar,, ripe, barley...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2585,4157,Fred Minnick,"Although it’s not on the label, Cavalry uses t...",30.0,60,45.0,2.0,424,"[Although, it’s, not, on, the, label,, Cavalry...","[Although, it’s, label,, Cavalry, us, TerrePUR...",...,0.0,0.0,0.0,0.0,0.0,0.18236,0.0,0.0,0.0,0.0


In [219]:
# Now add in sentiment
from textblob import TextBlob, Word

# Wrap each raw description in a TextBlob ...
X['blob'] = X['description'].map(lambda text: TextBlob(text))

# ... and keep the polarity of its sentiment as a numeric feature.
X['polarity'] = X['blob'].map(lambda blob: blob.sentiment.polarity)

In [220]:
# Remove the text columns, the intermediate text-processing columns and the target,
# leaving the id plus the numeric inputs.
non_feature_cols = ['author', 'description', 'tokens', 'clean', 'ready', 'blob', 'category']
X = X.drop(non_feature_cols, axis=1)
X.head()

Unnamed: 0,id,price,ratingValue,pert_alcohol,text_len,0,1,2,3,4,...,795,796,797,798,799,800,801,802,803,polarity
0,1,85.0,97,51.5,361,0.0,0.0,0.0,0.0,0.0,...,0.232574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269167
1,2,13500.0,97,42.9,503,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.337917
2,3,150.0,97,50.0,824,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.156458,0.0,0.0,0.0,0.235417
3,4,4500.0,97,40.5,495,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.062037
4,9,150.0,96,54.49,415,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.095192


In [221]:
# Gradient-boosted trees wrapped in a one-vs-rest multiclass strategy.
# A fixed random_state makes repeated runs of the notebook build the same
# model, so the predictions and the submission file are reproducible.
GB = GradientBoostingClassifier(random_state=42)
# n_jobs=-1 lets the one-vs-rest wrapper use all available CPU cores.
model = OneVsRestClassifier(GB, n_jobs=-1)

In [222]:
# Fit on every column except the row identifier, then predict those same rows
# back (in-sample predictions, not a held-out evaluation).
train_features = X.drop(['id'], axis=1)
model.fit(train_features, y)
train_predictions = model.predict(train_features)

In [223]:
# Store the model's predictions for the training rows alongside the features.
X['predicted_category'] = train_predictions

In [224]:
# Pair each id with its predicted category.
preds = X.loc[:, ['id', 'predicted_category']]
type(train)

pandas.core.frame.DataFrame

In [225]:
# Join the predictions onto the training rows by id and show actual vs. predicted category.
train_pred = pd.merge(train, preds, on=['id'])
train_pred.loc[:, ['id', 'category', 'predicted_category']]

Unnamed: 0,id,category,predicted_category
0,1,2.0,2.0
1,2,1.0,1.0
2,3,2.0,2.0
3,4,1.0,1.0
4,9,2.0,2.0
5,12,2.0,2.0
6,14,2.0,2.0
7,15,4.0,4.0
8,17,2.0,2.0
9,21,2.0,2.0


In [226]:
# Predict on test data
# Impute the column mean for the numeric features, since relatively few values are missing.
# The filled Series is assigned back to the frame instead of calling
# `test.price.fillna(..., inplace=True)`: an in-place method on a selected
# column is chained assignment, which pandas warns about and which no longer
# modifies `test` once Copy-on-Write is in effect.
# NOTE(review): the fill value is the test-set mean, whereas the training rows
# were filled with the training-set mean — confirm that is intended.
for column in ("price", "pert_alcohol"):
    test[column] = test[column].fillna(test[column].mean())

# Extra feature: the character length of each description.
# Idea from https://medium.com/tensorist/classifying-yelp-reviews-using-nltk-and-scikit-learn-c58e71e962d9
test["text_len"] = test["description"].map(len)

# Raw tokens: split every description on single spaces.
test['tokens'] = test['description'].map(lambda text: text.split(" "))

# English stop words (a set, for fast membership tests) and the WordNet lemmatizer.
stops = set(stopwords.words('english'))
lemmatizer = nltk.WordNetLemmatizer()

# One pass per row: drop stop words, keep only tokens that start with an
# ASCII letter, and lemmatize whatever is left.
test['clean'] = test['tokens'].map(
    lambda tokens: [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stops and re.search('^[a-zA-Z]+', token)
    ]
)

# Re-join the cleaned tokens into one space-separated string per row.
test['ready'] = test['clean'].map(' '.join)

# Apply the original tfidf: transform only, so the vocabulary and weights are
# the ones learned from the training text.
vect_test = tfidf.transform(test['ready'])

# Renumber the test rows 0..n-1 so the column-wise join lines up.
test = test.reset_index(drop=True)

# Dense DataFrame of the TF-IDF scores for the test text.
vect_test = pd.DataFrame(vect_test.toarray())

# Labels for the combined frame: test column names, then the TF-IDF column numbers.
new_col = list(test.columns) + list(vect_test.columns)

# Push the tables together; concat renumbers the columns (ignore_index=True),
# so the labels are set explicitly right after.
X = pd.concat([test, vect_test], axis=1, ignore_index=True)
X.columns = new_col

# Wrap each raw description in a TextBlob ...
X['blob'] = X['description'].map(lambda text: TextBlob(text))

# ... and keep the polarity of its sentiment as a numeric feature.
X['polarity'] = X['blob'].map(lambda blob: blob.sentiment.polarity)

# Remove the text columns and the intermediate text-processing columns.
text_cols = ['author', 'description', 'tokens', 'clean', 'ready', 'blob']
X = X.drop(text_cols, axis=1)
X.head()

Unnamed: 0,id,price,ratingValue,pert_alcohol,text_len,0,1,2,3,4,...,795,796,797,798,799,800,801,802,803,polarity
0,955,36.0,90,50.0,423,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120833
1,3532,90.0,82,49.3,424,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2125
2,1390,48.0,89,45.0,440,0.0,0.0,0.182316,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.175758
3,1024,180.0,90,55.8,402,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153974
4,1902,71.0,87,45.9,423,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.079167


In [227]:
# Predict a category for every test row; the id column is not passed to the model.
test_features = X.drop(['id'], axis=1)
X['category'] = model.predict(test_features)

In [228]:
# Cast the predicted category to an integer dtype.
X = X.astype({'category': int})

In [229]:
# Write the id/category pairs to the submission file, without the row index.
submission = X[['id', 'category']]
submission.to_csv("C:\\Users\\erroden\\Desktop\\whiskey_prediction\\data\\submission_wider_tfidf.csv", index=False)

In [206]:
# Load a previously written submission file for comparison.
# NOTE(review): this reads submission.csv, whereas the cell above writes
# submission_wider_tfidf.csv, and this cell's execution count (206) is lower
# than that cell's (229) — presumably a leftover from an earlier run; confirm
# which file is meant.
submission_path = "C:\\Users\\erroden\\Desktop\\whiskey_prediction\\data\\submission.csv"
predictions = pd.read_csv(submission_path)

In [207]:
# Preview the first rows of the loaded predictions.
predictions.head()

Unnamed: 0,id,category
0,955,2
1,3532,3
2,1390,1
3,1024,1
4,1902,1


In [208]:
# Preview the sample submission again, for comparison with the predictions above.
sample_submission.head()

Unnamed: 0,id,category
0,955,1
1,3532,3
2,1390,2
3,1024,4
4,1902,2
