#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 8 23:03:22 2017
@author: pablotempone
"""
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Load the guia_oleo restaurant-review training set (Latin-1 encoded).
train = pd.read_csv("/Volumes/Disco_SD/Set de datos/guia_oleo/ratings_train.csv",
                    sep=',', encoding="ISO-8859-1")
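# Quick look at the loaded data (an illustrative sketch; only the columns
# actually used below are assumed to exist: 'restaurante', 'comentario',
# and 'rating_ambiente').
print(train.shape)
print(train[['restaurante', 'comentario', 'rating_ambiente']].head())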
# Baseline bag-of-words: raw token counts with the default tokenizer.
count_vect = CountVectorizer()
x = count_vect.fit_transform(train["comentario"].values.astype('U'))  # astype(str) would also work
x.shape

# Sanity check: confirm some sentiment-bearing Spanish words are in the vocabulary.
count_vect.vocabulary_.get('bueno')
count_vect.vocabulary_.get('excelente')
count_vect.vocabulary_.get('malo')
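# A minimal sketch (not part of the original script) for inspecting the most
# frequent terms in the count matrix; useful context before choosing
# max_features in the grid search below. Assumes the pre-1.0 sklearn API with
# get_feature_names(); on sklearn >= 1.0 use get_feature_names_out() instead.
term_counts = np.asarray(x.sum(axis=0)).ravel()
top_idx = term_counts.argsort()[::-1][:20]
terms = count_vect.get_feature_names()
print([(terms[i], int(term_counts[i])) for i in top_idx])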
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from string import punctuation

# Requires the NLTK data packages 'stopwords' and 'punkt'
# (nltk.download('stopwords'); nltk.download('punkt')).
spanish_stopwords = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')

# Characters to strip before tokenizing: punctuation (including the Spanish
# inverted marks) and digits.
non_words = list(punctuation)
non_words.extend(['¿', '¡'])
non_words.extend(map(str, range(10)))
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # Drop punctuation/digits, tokenize, then stem each token.
    text = ''.join([c for c in text if c not in non_words])
    tokens = word_tokenize(text)
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems
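# Quick check of the tokenizer on a sample review (illustrative only; the
# exact stems depend on the Snowball Spanish rules, e.g. something along the
# lines of ['excelent', 'com', 'muy', 'buen', 'atencion']).
print(tokenize('¡Excelente comida, muy buena atención!'))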
# Bag-of-words with the custom stemming tokenizer. Note that stop_words are
# matched against tokens after tokenization, so stopwords whose stemmed form
# differs from the original may not be removed.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words=spanish_stopwords)

x = vectorizer.fit_transform(train["comentario"].values.astype('U'))  # astype(str) would also work
x.shape
# df_b is never defined in this script, so this concat cannot run and is left
# disabled:
# df_rest = pd.concat([train['restaurante'].reset_index(drop=True), df_b], axis=1)
# With stemming, the vocabulary stores stems, so look up the stemmed forms;
# the unstemmed words (e.g. 'bueno') would return None.
vectorizer.vocabulary_.get(stemmer.stem('bueno'))
vectorizer.vocabulary_.get(stemmer.stem('excelente'))
vectorizer.vocabulary_.get(stemmer.stem('malo'))
# sklearn.cross_validation was removed in sklearn 0.20; cross_val_score now
# lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor

# LinearSVC() is the classifier; a quick cross-validated baseline with it is
# sketched below, before the grid-searched KNeighborsRegressor pipeline.
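# A minimal classification baseline (a sketch, not from the original script):
# treat rating_ambiente as a discrete class label and score LinearSVC with
# 5-fold cross-validation over the stemmed bag-of-words features. Assumes
# rating_ambiente takes a small set of discrete values.
svc_pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])
svc_scores = cross_val_score(svc_pipeline,
                             train["comentario"].values.astype('U'),
                             train.rating_ambiente,
                             cv=5)
print(svc_scores.mean())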
pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', KNeighborsRegressor()),
])

# Hyperparameter grid; note that max_df as a float must lie in [0.0, 1.0].
parameters = {
    'vect__max_df': (0.5, 1.0),
    'vect__min_df': (10, 20, 50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__n_neighbors': (5, 10, 20, 50),
    'cls__weights': ('uniform', 'distance'),
}
# Exhaustive grid search over the parameter grid, predicting the ambience rating.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(train["comentario"].values.astype('U'), train.rating_ambiente)
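# After fitting, the stock GridSearchCV attributes report the winning
# configuration and its cross-validated score.
print(grid_search.best_score_)
print(grid_search.best_params_)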