In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import sklearn 

dataset = pd.read_csv('train.csv')

dataset.describe()

Unnamed: 0,index,beer/ABV,beer/beerId,beer/brewerId,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/timeUnix,user/ageInSeconds,user/birthdayUnix
count,37500.0,37500.0,37500.0,37500.0,37500.0,37500.0,37500.0,37500.0,37500.0,37500.0,7856.0,7856.0
mean,24951.887573,7.403725,21861.152027,3036.59512,3.900053,3.87324,3.88944,3.854867,3.92244,1232794000.0,1176705000.0,241630300.0
std,14434.009669,2.318145,18923.130832,5123.084675,0.588778,0.680865,0.70045,0.668068,0.716504,71909550.0,337551400.0,337551400.0
min,0.0,0.1,175.0,1.0,0.0,1.0,0.0,1.0,1.0,926294400.0,703436600.0,-2208960000.0
25%,12422.5,5.4,5441.0,395.0,3.5,3.5,3.5,3.5,3.5,1189194000.0,979481000.0,143362800.0
50%,24942.5,6.9,17538.0,1199.0,4.0,4.0,4.0,4.0,4.0,1248150000.0,1100009000.0,318326400.0
75%,37416.75,9.4,34146.0,1315.0,4.5,4.5,4.5,4.5,4.5,1291330000.0,1274973000.0,438854400.0
max,49999.0,57.7,77207.0,27797.0,5.0,5.0,5.0,5.0,5.0,1326267000.0,3627295000.0,714898800.0


In [3]:
dataset.drop(['beer/beerId', 'user/ageInSeconds', 'user/birthdayRaw', 'user/birthdayUnix', 'user/gender', 'user/profileName', 'review/timeStruct', 'review/timeUnix'], axis=1, inplace=True)

dataset.dropna( inplace=True)


Standardize ABV

In [4]:
dataset['standardized_abv'] = (dataset['beer/ABV']-dataset['beer/ABV'].mean())/dataset['beer/ABV'].std()

dataset.head()

Unnamed: 0,index,beer/ABV,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,standardized_abv
0,40163,5.0,14338,Chiostro,Herbed / Spiced Beer,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,-1.036809
1,8135,11.0,395,Bearded Pat's Barleywine,American Barleywine,4.0,3.5,3.5,3.5,3.0,12oz bottle into 8oz snifter.\t\tDeep ruby red...,1.551615
2,10529,4.7,365,Naughty Nellie's Ale,American Pale Ale (APA),3.5,4.0,3.5,3.5,3.5,First enjoyed at the brewpub about 2 years ago...,-1.16623
3,44610,4.4,1,Pilsner Urquell,Czech Pilsener,3.0,3.0,2.5,3.0,3.0,First thing I noticed after pouring from green...,-1.295651
4,37062,4.4,1417,Black Sheep Ale (Special),English Pale Ale,4.0,3.0,3.0,3.5,2.5,A: pours an amber with a one finger head but o...,-1.295651


In [5]:
import string
import re
from nltk.stem import SnowballStemmer, WordNetLemmatizer

from nltk import corpus
from num2words import num2words
from gensim.parsing.preprocessing import STOPWORDS


negators = {'not', 'no', 'never', 'nothing'}
stop = set(STOPWORDS).difference(negators)
stop = stop.union({'will', 'can', 'should', 'shall'})
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
english_words = set(corpus.words.words())


def text_process(review):
    def is_number(word):
        return word.replace('.','',1).isdigit()
    
    def convert_number(number):
        if is_number(number):
            return num2words(number)
        else:
            return number
    
    def reduce_lengthening(text):
      
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", text)
    
    
    
    def remove_y(word):
        # remove small words
        if len(word) == 1:
            return ' '
        if word[-1] == 'y' and word[:-1] in english_words:
            return word[:-1]
        elif word[-1] == 'y' and word[-2] == word[-3] and word[:-2] in english_words:
            return word[:-2]
        elif word[-1] == 'y' and word[:-2]+'e' in english_words:
            return word[:-1]
        else:
            return word
    
    def next_word_negation(review):
        is_negated = False
        output = []
        for word in review:
            if word in negators:
                is_negated = True
                continue
            if is_negated and len(word) > 1:
                output.append("neg_" + word)
                is_negated = False
            else:
                output.append(word)
        return output
    
    
    
    review = str(review).lower()
    
    
    
    review = [remove_y(word) for word in review]
    
    # remove stop words
    review = filter(lambda word : word not in stop, review)
    

    review = ' '.join([convert_number(word) for word in review.split()])
   
    review = ' '.join([stemmer.stem(lemmatizer.lemmatize(word, pos='a')) for word in review.split()])
    return review


dataset['processed_beer_style'] = dataset['beer/style'].map(text_process)


dataset['processed_beer_name'] = dataset['beer/name'].map(text_process)


dataset['processed_text'] = dataset['review/text'].map(text_process).astype(str)


dataset.head(5)

Unnamed: 0,index,beer/ABV,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,standardized_abv,processed_beer_style,processed_beer_name,processed_text
0,40163,5.0,14338,Chiostro,Herbed / Spiced Beer,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,-1.036809,herb spice beer,chiostro,pour cloud gold white head nose floral larg sp...
1,8135,11.0,395,Bearded Pat's Barleywine,American Barleywine,4.0,3.5,3.5,3.5,3.0,12oz bottle into 8oz snifter.\t\tDeep ruby red...,1.551615,american barleywin,beard pat barleywin,twelv oz bottl eight oz snifter deep rub red h...
2,10529,4.7,365,Naughty Nellie's Ale,American Pale Ale (APA),3.5,4.0,3.5,3.5,3.5,First enjoyed at the brewpub about 2 years ago...,-1.16623,american pale ale apa,naught nelli ale,enjoy brewpub year ago final manag bottl sligh...
3,44610,4.4,1,Pilsner Urquell,Czech Pilsener,3.0,3.0,2.5,3.0,3.0,First thing I noticed after pouring from green...,-1.295651,czech pilsen,pilsner urquel,thing notic pour green bottl glass skunk smell...
4,37062,4.4,1417,Black Sheep Ale (Special),English Pale Ale,4.0,3.0,3.0,3.5,2.5,A: pours an amber with a one finger head but o...,-1.295651,english pale ale,black sheep ale special,pour amber finger head onl v strong pour head ...
