# Hackathon 4

In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer

import seaborn as sbn

## Baseline

### Separate into training and testing

In [None]:
# Baseline

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

def drop_nulls(train_data):
    return train_data[train_data['Request'].notnull()]

train = drop_nulls(train)
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
train_X, train_y = train_data['Request'], train_data['Label']
val_X, val_y = val_data['Request'], val_data['Label']

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
    ('clf', RandomForestClassifier(random_state=42))
])

text_clf.fit(train_X, train_y)

accuracy_score(val_y, text_clf.predict(val_X))

## Features

In [2]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [3]:
class MultipleColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.loc[:, self.keys]

In [4]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

train = train.dropna()

In [5]:
# Length
train['len_message'] = train['Request'].apply(lambda x: len(x))
test['len_message'] = test['Request'].apply(lambda x: len(x))

# Number of words
train['n_words'] = train['Request'].str.split().map(len)
test['n_words'] = test['Request'].str.split().map(len)

In [9]:
nlp = spacy.load('pt_core_news_sm', disable=['parser'])

In [10]:
for df in [train, test]:
    n_adj = []
    n_verbs = []
    len_message = []

    for doc in nlp.pipe(df['Request']):
        n_adj.append(len([token for token in doc if token.pos_ == 'ADJ']))
        n_verbs.append(len([token for token in doc if token.pos_ == 'VERB']))
        
    df['n_adj'] = n_adj
    df['n_verbs'] = n_verbs

train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
train_X, train_y = train_data.drop('Label', axis=1), train_data['Label']
val_X, val_y = val_data.drop('Label', axis=1), val_data['Label']

In [14]:
stopwords = nltk.corpus.stopwords.words('portuguese')

In [15]:
one = Pipeline([
    ('select', TextSelector(key='Request')),
    ('tfidf', TfidfVectorizer(ngram_range=(1,1), stop_words=stopwords))
])

twoplus = Pipeline([
    ('select', TextSelector(key='Request')),
    ('tfidf', TfidfVectorizer(ngram_range=(2,3), max_features=10000))])

length = Pipeline([
    ('select', NumberSelector(key='len_message')),
    ('scale', StandardScaler()),
])

n_words = Pipeline([
    ('select', NumberSelector(key='n_words')),
    ('scale', StandardScaler()),
])

n_adj = Pipeline([
    ('select', NumberSelector(key='n_adj')),
    ('scale', StandardScaler())
])


n_verbs = Pipeline([
    ('select', NumberSelector(key='n_verbs')),
    ('scale', StandardScaler())
])

### Spacy

In [16]:
stemmer = SnowballStemmer("portuguese")

In [17]:
key_dict = {'add_to_playlist':  ['adicion','faixa', 'lista', 'música','colocar', 'reprodução','clássicos','compilacao','melodia','artista','disco'],
            'search_screening_event': ['curta','longa','metragem','filme', 'telefilme','teatro','cinema','horas','animação','documentario'],
            'book_restaurant':  ['restaurante','mesa','reserva','comer','lugar','bar'],
            'rate_book': ['de 6','saga','estrelas','livro','classificar','romance','encoclopédia','texto','dar','valor','título','0','1','2','3','4','5','6','um','dois','três','quatro','cinco','seis'],
            'get_weather': ['área','posição','tempestade','clima','previsão','neve','tempo','frio','chuva','calor','vento','temperatura','neblina','sol','nublado'],
            'play_music':  ['álbum','artista','spotify','melodia','balada','música','anos','youtube','setenta','oitenta','noventa','ouvir','reproduzir','tocar','sinfonia','polular','metal','rock','jazz','sonora','punk','blues','clássica','quarenta','cinquenta','sessenta','setenta','oitenta','noventa','ópera'],
            'search_creative_work':  ['procure','encontre','pesquisar','arranjar','video','televisão','título','gostaria','programa','trabalho','criativo','quero','mostrar','obter','pintura','filme','show','vídeo','game','jogo','foto','achar'],}

In [18]:
def create_stem_dict(original_dict):
    stem_dict = {}
    for k in original_dict:
        stem_dict[k] = list(map(stemmer.stem, original_dict[k]))
    return stem_dict

stem_dict = create_stem_dict(key_dict)

In [19]:
for df in [train, test]:
    for k in stem_dict:
        words = []
        for doc in df['Request']:
            words.append(len([word for word in doc.split() if stemmer.stem(word) in stem_dict[k]]))
        df[k] = words / df['len_message']

train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
train_X, train_y = train_data.drop('Label', axis=1), train_data['Label']
val_X, val_y = val_data.drop('Label', axis=1), val_data['Label']

In [20]:
stem_feats = Pipeline([
    ('select', MultipleColumnSelector(list(key_dict.keys()))),
    ('scale', StandardScaler())
])

In [31]:
feats = FeatureUnion([
    ('one', one),
    ('twop', twoplus),
    ('len', length),
    ('n_words', n_words),
    ('n_adj', n_adj),
    ('n_verbs', n_verbs),
    ('stem_feats', stem_feats)
])

In [23]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.svm import SVC

In [32]:
pipe = Pipeline([
    ('feat', feats),
    ('clf', SVC()),
])

#pipe.fit(train_X, train_y)
#accuracy_score(val_y, pipe.predict(val_X))

In [38]:
t_pipe = Pipeline(pipe.steps[:-1])
train_no_emb = pd.DataFrame(t_pipe.fit_transform(train).to_array())
test_no_emb = pd.DataFrame(t_pipe.transform(test).to_array())

## Embedding

In [27]:
train_embedding = pd.read_csv('train_embeddings.csv',index_col=0)
test_embedding = pd.read_csv('test_embeddings.csv', index_col=0)

In [42]:
train_data = pd.concat([train_embedding, train_no_emb, train['Label']], axis=1)
test_data = pd.concat([test_embedding, test_no_emb], axis=1)

In [48]:
train_data = train_data.dropna()

In [49]:
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
train_X, train_y = train_data.drop('Label', axis=1), train_data['Label']
val_X, val_y = val_data.drop('Label', axis=1), val_data['Label']

In [50]:
svm = SVC().fit(train_X, train_y)

In [51]:
accuracy_score(val_y, svm.predict(val_X))

0.8859523809523809