In [14]:
# Imports
import pandas as pd
import numpy as np
import spacy
import string
import re

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from xgboost import XGBRFClassifier, XGBClassifier
from bs4 import BeautifulSoup

# Make sure the VADER lexicon is available locally before the analyzer is built.
nltk.download('vader_lexicon')
# Shared spaCy pipeline; lemmaz() and vec() read this module-level name.
nlp = spacy.load("en_core_web_lg")
# Shared VADER sentiment analyzer; the data transformers call sid.polarity_scores().
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/reesh/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [15]:
# initial data: raw training and test tables, read from CSV files in the working directory
df = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [16]:
# functions
def lemmaz(doc):
    """Clean a raw text and return the lemmas of its content words.

    The text is first normalised with a fixed sequence of regex substitutions,
    then lower-cased and parsed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    doc : str
        Raw text to process.

    Returns
    -------
    list of str
        ``lemma_`` of every token whose ``pos_`` is ADJ, NOUN or VERB and that
        is not flagged as a stop word, in document order.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    cleanup_steps = (
        ('[^a-zA-Z]', ' '),   # anything that is not an ASCII letter becomes a space
        (' [a-zA-Z] ', ' '),  # a single letter standing between two spaces
        ('[ ]{2,}', ' '),     # runs of spaces collapse to one
        ('^ ', ''),           # one leading space
        (' $', ''),           # one trailing space
    )
    cleaned = doc
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Lower-casing happens after the regex cleanup and before parsing.
    parsed = nlp(cleaned.lower())

    kept_pos = ('ADJ', 'NOUN', 'VERB')
    return [
        token.lemma_
        for token in parsed
        if token.pos_ in kept_pos and not token.is_stop
    ]


def stringz(doc):
    """Join each list of lemmas into one space-separated string.

    Lemmas that consist of exactly one ASCII letter are dropped. The previous
    pattern (``' [a-zA-Z] '``) needed a space on both sides of the letter, so a
    one-letter lemma at the very end of the text, or one that directly followed
    another one-letter lemma, was left in place; both cases are now removed.

    Parameters
    ----------
    doc : iterable of iterable of str
        One list of lemmas per document.

    Returns
    -------
    list of str
        One string per input list, in the same order. An empty list of lemmas
        yields an empty string.
    """
    # Every lemma in the joined text is preceded by a space (see the join
    # below), so "space + letter" followed by a space or the end of the text
    # identifies a one-letter lemma. The lookahead does not consume the
    # following space, which lets adjacent one-letter lemmas all match.
    lonely = re.compile(r' [a-zA-Z](?= |\Z)')
    empty_start = re.compile('^ ')
    empty_end = re.compile(' $')

    lemma_docs = []
    for lemma_list in doc:
        # Same text as repeatedly doing " ".join((text, lemma)): each lemma is
        # prefixed with one space, built in a single pass.
        text = "".join(" " + lemma for lemma in lemma_list)
        text = lonely.sub('', text)
        text = empty_start.sub('', text)
        text = empty_end.sub('', text)
        lemma_docs.append(text)
    return lemma_docs


def vec(doc):
    """Return the spaCy document vector of ``doc`` wrapped in a one-element list.

    ``doc`` is parsed with the module-level pipeline ``nlp``; the result is a
    list whose only item is the parsed document's ``vector``.
    """
    parsed = nlp(doc)
    return [parsed.vector]

In [17]:
# data cleaning
def train_data_tranformer(df, tfidf=None):
    """Build the modelling table from the raw training frame.

    The caller's frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data. The columns ``description`` (text) and
        ``ratingCategory`` are read here.
    tfidf : TfidfVectorizer, optional
        Vectorizer to fit on the cleaned text. It is fitted in place, so the
        caller can keep the instance and reuse the fitted vocabulary on other
        data. A new ``TfidfVectorizer()`` is created when omitted.

    Returns
    -------
    pandas.DataFrame
        The input columns without ``description`` and ``ratingCategory``, plus
        ``target``, ``lemmas``, ``text``, ``vector``, ``chars``, ``words``, one
        column per key returned by ``sid.polarity_scores`` and one ``w_<term>``
        column per TF-IDF vocabulary term.
    """
    # Work on a copy so re-running the calling cell does not see a half-transformed frame.
    df = df.copy()

    df['target'] = df.ratingCategory
    df['lemmas'] = df.description.apply(lemmaz)
    df["text"] = stringz(df.lemmas)
    df['vector'] = df.text.apply(vec)

    # numerical
    df["chars"] = df.description.apply(len)
    df["words"] = df.description.apply(lambda x: len(x.split(" ")))

    # One column per sentiment score, built in a single DataFrame construction
    # instead of expanding a dict column row by row.
    sentiments = pd.DataFrame(
        [sid.polarity_scores(text) for text in df.text], index=df.index)
    df = pd.concat([df, sentiments], axis=1)

    if tfidf is None:
        tfidf = TfidfVectorizer()
    tfidf_result = tfidf.fit_transform(df.text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on older releases that lack get_feature_names_out().
    if hasattr(tfidf, 'get_feature_names_out'):
        vocabulary = tfidf.get_feature_names_out()
    else:
        vocabulary = tfidf.get_feature_names()
    tfidf_df = pd.DataFrame(tfidf_result,
                            columns=["w_" + str(term) for term in vocabulary],
                            index=df.index)

    df = pd.concat([df.drop(columns=['description','ratingCategory']), tfidf_df], axis=1)
    return df

# Replace the raw frame with its transformed version.
# NOTE: this rebinds `df`; the transformer reads `description` and `ratingCategory`,
# which the result no longer contains, so re-running this cell requires re-reading train.csv.
df = train_data_tranformer(df)
# NOTE(review): the 'vector' column holds Python lists (see vec); CSV presumably stores
# them as their text representation rather than as numbers — confirm before reloading.
df.to_csv('train_clean.csv', index=False)

In [18]:
# Modelling table: the transformed frame without the 'lemmas' and 'id' columns, saved to disk.
train = df.drop(columns=['lemmas','id'])
train.to_csv('train_ready.csv', index=False)

In [19]:
# Reload the modelling table from disk (this replaces the in-memory `train`) and preview it.
train = pd.read_csv('train_ready.csv')
train.head()

Unnamed: 0,target,text,vector,chars,words,neg,neu,pos,compound,w_abandon,...,w_zinfandel,w_zing,w_zinge,w_zinginess,w_zingy,w_zip,w_zippy,w_zombie,w_zone,w_zuidam
0,1,whisky batch leftover barrel return warehouse ...,"[array([-6.12000078e-02, 2.31518507e-01, -7.4...",513,76,0.097,0.734,0.17,0.5859,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,uncommon exclusive bottling year old cask stre...,"[array([-3.20599135e-03, 1.96402133e-01, -1.1...",471,81,0.079,0.696,0.225,0.802,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,release port version amrut intermediate sherry...,"[array([ 1.12371407e-01, 1.00963630e-01, -8.5...",482,84,0.024,0.755,0.221,0.8658,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,year old single cask age sherry butt interact ...,"[array([-1.21437453e-01, 2.08656073e-01, -9.2...",450,69,0.0,0.886,0.114,0.6486,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,herbal nose aroma dry tarragon parsley dill ch...,"[array([-1.42396763e-01, 2.51199961e-01, -1.7...",427,65,0.0,0.875,0.125,0.6486,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Feature matrix: every column except the label and the 'vector' / 'text' columns.
X = train.drop(columns=['target','vector','text'])
# Label vector.
y = train.target

X.shape, y.shape

((4087, 8743), (4087,))

In [13]:
# NOTE(review): everything in this cell except the final `X.shape` is a disabled experiment.
# It refers to names that the visible import cell does not bring in (MinMaxScaler, PCA,
# StandardScaler, FeatureUnion) and to names not defined in the visible cells (lsa,
# text_data, num_data), so it cannot be re-enabled as-is.
# class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, nlp):
#         self.nlp = nlp
#         self.dim = 300

#     def fit(self, X, y):
#         return self

#     def transform(self, X):
#         return [self.nlp(doc).vector for doc in X]
    
# svd = TruncatedSVD()
# xgb = XGBClassifier(random_state=19)
# mms = MinMaxScaler()
# pca = PCA()
# ss = StandardScaler()


# model = Pipeline([('clf', xgb)])

# model = Pipeline([("lsa", lsa),
#                  ("clf", xgb)])

# text_set = pipe.fit_transform(text_data)

# pca = Pipeline([("minmaxscaler", mms), 
#                 ('pca', pca)])

# pipe_num = Pipeline([("standardscaler", ss)])

# transformer_list = [("pca", pca), 
#                     ("scaler", pipe_num)]   

# num_union = FeatureUnion(transformer_list, n_jobs=-1, verbose=2)
# num_set = num_union.fit_transform(num_data)
# Displayed as the cell result: shape of the feature matrix.
X.shape

(4087, 8743)

In [11]:
# Hyperparameters
# Candidate values for each XGBoost setting that the randomized search samples from.
# NOTE(review): the 'rank:*' objectives and the 'ndcg' / 'map' metrics are XGBoost
# ranking settings — confirm they are meant to be searched for a classifier.
# NOTE(review): 'gpu_hist' presumably needs a GPU-enabled XGBoost build — confirm it is
# usable on the machine running this notebook.

# Categorical choices
obj = ['rank:ndcg', 'rank:map', 'rank:pairwise']
emet = ['merror', 'mlogloss', 'ndcg', 'map']
tmeth = ['gpu_hist', 'approx']
gpol = ['depthwise', 'lossguide']

# Numeric ranges (np.arange excludes the stop value)
eta = np.arange(0.01, .1, .005)
mdepth = np.arange(5, 21, 3)
ssam = np.arange(.5, 1, .05)
colsamp = np.arange(.5, 1, .05)
est = np.arange(200, 401, 20)
lam = np.arange(0, 10, 1)
alp = np.arange(0, 10, 1)

# Keys carry the 'clf__' prefix, i.e. they address a Pipeline step named 'clf'.
params = {
    # "svd__n_components": np.arange(100, 1001, 100),
    'clf__objective': obj,
    'clf__eta': eta,
    'clf__max_depth': mdepth,
    'clf__subsample': ssam,
    'clf__grow_policy': gpol,
    'clf__colsample_bytree': colsamp,
    'clf__n_estimators': est,
    'clf__tree_method': tmeth,
    'clf__eval_metric': emet,
    'clf__reg_lambda': lam,
    'clf__reg_alpha': alp,
}

# To search a bare estimator instead of a Pipeline, use the same values with the
# 'clf__' prefix removed from every key ('objective', 'eta', 'max_depth', ...).

In [12]:
# Dimensionality reduction followed by the classifier. The step names ('svd', 'clf')
# are the prefixes that the keys of `params` refer to.
# n_components=60 follows the best settings recorded from earlier searches in this
# notebook (51-61); the TruncatedSVD default of 2 components would discard almost all
# of the information in the feature matrix. A 'svd__n_components' entry in `params`
# overrides this value during the search.
svd = TruncatedSVD(n_components=60, random_state=19)
xgb = XGBClassifier(random_state=19)

pipe = Pipeline([('svd',svd),('clf', xgb)])

# Search over the pipeline, not the bare classifier: every key in `params` is written
# as '<step>__<parameter>', which only resolves against an estimator that has those steps.
# random_state makes the sampled candidates repeatable between runs.
rscv = RandomizedSearchCV(pipe,
                          params,
                          n_iter=3,
                          n_jobs=-1,
                          cv=2,
                          verbose=2,
                          random_state=19)

rscv.fit(X,y)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


KeyboardInterrupt: 

In [None]:
# Mean absolute error of the search's predictions on the same X / y it was fitted on
# (a training-set figure, not a held-out score).
print('MAE: ' , round( mean_absolute_error( y , rscv.predict( X ) ) , 3 ) )
# Disabled: X_test / y_test are not defined in the visible cells.
# print('Validation MAE: ' , round( mean_absolute_error( 
#                                y_test , rscv.predict( X_test ) ) , 3 ) )
# Keep the best estimator found by the search under its own name.
tuned_rscv = rscv.best_estimator_
# Displayed as the cell result: best cross-validated score and the parameters that produced it.
rscv.best_score_, rscv.best_params_

In [59]:
# best_score_ 5x3 = 0.734094616639478
# best_score_ 100x5 = 0.7395223306713989
                    # {'svd__n_components': 61,
                    # 'clf__tree_method': 'approx',
                    # 'clf__subsample': 0.8500000000000003,
                    # 'clf__reg_lambda': 4,
                    # 'clf__reg_alpha': 8,
                    # 'clf__objective': 'rank:pairwise',
                    # 'clf__n_estimators': 201,
                    # 'clf__max_depth': 9,
                    # 'clf__grow_policy': 'lossguide',
                    # 'clf__eval_metric': 'map',
                    # 'clf__eta': 0.1,
                    # 'clf__colsample_bytree': 0.8000000000000003}
                    
# best_score_ 200x3 = 0.7400761283306144
                    #  {'svd__n_components': 51,
                    #   'clf__tree_method': 'approx',
                    #   'clf__subsample': 0.6500000000000001,
                    #   'clf__reg_lambda': 7,
                    #   'clf__reg_alpha': 4,
                    #   'clf__objective': 'rank:map',
                    #   'clf__n_estimators': 201,
                    #   'clf__max_depth': 17,
                    #   'clf__grow_policy': 'depthwise',
                    #   'clf__eval_metric': 'mlogloss',
                    #   'clf__eta': 0.1,
                    #   'clf__colsample_bytree': 0.8000000000000003}
                    
# best_score_ 400x3 = 0.744426318651441
                    #  {'svd__n_components': 60,
                    #   'clf__tree_method': 'approx',
                    #   'clf__subsample': 0.8500000000000003,
                    #   'clf__reg_lambda': 1,
                    #   'clf__reg_alpha': 6,
                    #   'clf__objective': 'rank:map',
                    #   'clf__n_estimators': 340,
                    #   'clf__max_depth': 13,
                    #   'clf__grow_policy': 'lossguide',
                    #   'clf__eval_metric': 'ndcg',
                    #   'clf__eta': 0.06499999999999999,
                    #   'clf__colsample_bytree': 0.6500000000000001}

In [128]:
# predicting and submission
# Re-read the raw test table (the same file that the initial data cell loads into `test`).
test = pd.read_csv('./test.csv')

def test_data_transformer(test, tfidf=None):
    """Build the feature table for the test frame.

    The caller's frame is not modified; all work happens on a copy.

    Parameters
    ----------
    test : pandas.DataFrame
        Raw test data. The columns ``description`` (text) and ``id`` are
        used here.
    tfidf : TfidfVectorizer, optional
        An already fitted vectorizer. When given, its existing vocabulary and
        weights are applied with ``transform`` so the ``w_<term>`` columns are
        exactly the ones that vectorizer was fitted on. When omitted, a new
        vectorizer is fitted on this frame's own text (the original
        behaviour); the resulting ``w_<term>`` columns then depend on the test
        vocabulary and generally differ from a table built on other data.

    Returns
    -------
    pandas.DataFrame
        The input columns without ``description`` and ``id``, plus ``vector``,
        ``chars``, ``words``, one column per key returned by
        ``sid.polarity_scores`` and one ``w_<term>`` column per TF-IDF term.
        The intermediate ``lemmas`` and ``text`` columns are not returned.
        Note that ``vector`` is kept, whereas the training feature matrix in
        this notebook is built without it.
    """
    # Work on a copy so the caller's frame keeps its original columns (including 'id').
    test = test.copy()

    test['lemmas'] = test.description.apply(lemmaz)
    test['text'] = stringz(test.lemmas)
    test['vector'] = test.text.apply(vec)

    test["chars"] = test.description.apply(len)
    test["words"] = test.description.apply(lambda x: len(x.split(" ")))

    # One column per sentiment score, built in a single DataFrame construction
    # instead of expanding a dict column row by row.
    sentiments = pd.DataFrame(
        [sid.polarity_scores(text) for text in test.text], index=test.index)
    test = pd.concat([test, sentiments], axis=1)

    if tfidf is None:
        # Legacy path: vocabulary learned from the test text itself.
        tfidf = TfidfVectorizer()
        tfidf_result = tfidf.fit_transform(test.text).toarray()
    else:
        # Reuse the supplied vocabulary; raises if the vectorizer was never fitted.
        tfidf_result = tfidf.transform(test.text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on older releases that lack get_feature_names_out().
    if hasattr(tfidf, 'get_feature_names_out'):
        vocabulary = tfidf.get_feature_names_out()
    else:
        vocabulary = tfidf.get_feature_names()
    tfidf_df = pd.DataFrame(tfidf_result,
                            columns=["w_" + str(term) for term in vocabulary],
                            index=test.index)

    test_ready = pd.concat(
        [test.drop(columns=['description', 'lemmas', 'text', 'id']), tfidf_df], axis=1)

    return test_ready

test_ready = test_data_transformer(test)
# NOTE(review): before re-enabling the lines below — test_data_transformer drops 'id',
# so `test_ready.id` does not exist (the ids are still in `test`); the result keeps the
# 'vector' column that the training matrix X excludes; and, as called here, the TF-IDF
# vocabulary is fitted on the test text, so its w_* columns differ from the training ones.
# prediction = tuned_rscv.predict(test_ready)
# submission = pd.DataFrame({'id': test_ready.id, 'ratingCategory': prediction})
# submission.to_csv('submission2.csv', index=False)

Unnamed: 0,vector,chars,words,neg,neu,pos,compound,w_abandon,w_abatement,w_abc,...,w_zealand,w_zeitgeist,w_zest,w_zested,w_zesty,w_zinfandel,w_zing,w_zingy,w_zip,w_zippy
0,"[-0.0976305, 0.15472263, -0.15228683, -0.00053...",409,64,0.057,0.828,0.115,0.4019,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[-0.090834245, 0.16909483, -0.0033552991, -0.0...",574,95,0.0,0.632,0.368,0.9735,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.180853,0.0,0.189745,0.0,0.0,0.0
2,"[-0.031802785, 0.12913445, -0.02464436, -0.062...",481,76,0.109,0.86,0.031,-0.6249,0.0,0.0,0.0,...,0.0,0.0,0.131935,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[-0.13696712, 0.10186601, -0.048041925, -0.154...",209,38,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[0.05216053, 0.14466654, -0.15954311, 0.104861...",357,60,0.071,0.765,0.163,0.3818,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
