In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

import random
random.seed(42)

In [2]:
reviews = pd.read_csv('reviews.csv')

In [107]:
reviews

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
...,...,...,...,...,...,...,...,...,...,...,...
23481,23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses
23482,23482,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits
23483,23483,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses
23484,23484,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses


In [131]:
review_text = reviews['Review Text'].values.astype('U')
age_group = np.floor(reviews['Age'] / 10).values.astype('f')

X_train, X_test, y_train, y_test = train_test_split(review_text, age_group, test_size=0.03, random_state=42)

# Train model

In [134]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    'vect__stop_words': ('english',),
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 2),),  # unigrams or bigrams
    'tfidf__use_idf': (True, ),
    'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (100, ),
    'clf__alpha': (0.00001, ),
    'clf__penalty': ('l2', ),
    # 'clf__max_iter': (10, 50, 80),
}

In [135]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
estimator = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [136]:
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.333
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 100
	clf__penalty: 'l2'
	tfidf__norm: 'l1'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'


# Play

In [153]:
selected_review = random.randint(0, len(X_test))

In [154]:
X_test[selected_review]

"I'm a new mom, and am thrilled at the structure and quality of this dress. it gives me a great shape, very flattering. i can wear it to work with a jacket or dress it up with jewelry for a fancier event."

In [155]:
estimator.predict([ X_test[selected_review] ])[0]

2.0

In [156]:
y_test[selected_review]

2.0