In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
dataset = fetch_20newsgroups(subset='train', shuffle=True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
X = dataset.data
y = dataset.target

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [22]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

pipeline = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier())
])

In [29]:
import numpy as np

parameters = {
    'vect__ngram_range': [(1,1), (1,2)],
    'tfidf__use_idf': [True, False],
    'clf__n_neighbors': [3,4, 5, 6, 7, 8]
}

In [30]:
gs_clf = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)

In [31]:
gs_clf.fit(X[:400], y[:400])

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [32]:
gs_clf.best_score_

0.3625

In [33]:
gs_clf.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
     

In [50]:
from sklearn.neural_network import MLPClassifier

In [51]:
from sklearn.gaussian_process.kernels import RBF
pipeline = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MLPClassifier())
])

In [55]:
parameters = {
    'vect__ngram_range': [(1,1), (1,2)],
    'tfidf__use_idf': [True, False],
    'clf__alpha': [1,2],
    'clf__max_iter':[1000]
}

In [56]:
gs_clf = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)

In [58]:
gs_clf.fit(X[:100], y[:100])



GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [59]:
gs_clf.best_score_

0.27999999999999997

In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('yelp-review-subset.csv', header=0, delimiter=',', names=['stars', 'text', 'funny', 'useful', 'cool'])

In [3]:
dataset.head()

Unnamed: 0,stars,text,funny,useful,cool
0,4,"Mr Hoagie is an institution. Walking in, it do...",0,0,0
1,5,Excellent food. Superb customer service. I mis...,0,0,0
2,5,Yes this place is a little out dated and not o...,1,1,0
3,3,PROS: Italian hoagie was delicious. Friendly ...,0,0,0
4,2,First the only reason this place could possibl...,0,1,0


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X = dataset[['text', 'funny', 'useful', 'cool']]
y = dataset['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [6]:
X_train.columns

Index(['text', 'funny', 'useful', 'cool'], dtype='object')

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
cv = CountVectorizer()

In [10]:
X_train_counts = cv.fit_transform(X_train.text)

In [11]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report

In [12]:
def binarize_num(num, threshold):
  return 0 if num < threshold else 1


class ItemSelector(TransformerMixin, BaseEstimator):
  def __init__(self, keys):
    self.keys = keys

  def fit(self, x, y=None):
    return self

  def transform(self, dataframe):
    return dataframe[self.keys]


class VotesToDictTransformer(TransformerMixin, BaseEstimator):
  def fit(self, x, y=None):
    return self

  def transform(self, votes):
    funny, useful, cool = votes['funny'], votes['useful'], votes['cool']

    return [{'funny': binarize_num(f, 1), 'useful': binarize_num(u, 1), 'cool': binarize_num(c, 1)}
            for f, u, c in zip(funny, useful, cool)]

In [14]:
pipeline = Pipeline([
                  #Use FeatureUnion to combine features from text and votes
                  ('union', FeatureUnion(
                      transformer_list =[
                                         #Pipeline for getting BOW features from text
                                         (
                                             'bag-of-words', Pipeline([
                                                                       ('selector', ItemSelector(keys='text')),
                                                                       ('counts', CountVectorizer()),
                                             ])
                                         ),

                                         #Pipeline for getting vote counts as features
                                         ('votes', Pipeline([
                                                             ('selector', ItemSelector(keys=['funny', 'useful', 'cool'])),
                                                             ('votes_to_dict', VotesToDictTransformer()),
                                                             ('vectorizer', DictVectorizer()),
                                         ])),


                      ],
                      #weight components in feature union
                      transformer_weights = {
                          'bag-of-words': 1.0,
                          'votes': 0.5
                      },
                  )),

                  ('clf', LogisticRegression()),   

                  
])

pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)
print(classification_report(predicted, y_test))


              precision    recall  f1-score   support

           1       0.72      0.69      0.71       101
           2       0.45      0.48      0.47        98
           3       0.43      0.41      0.42       106
           4       0.31      0.40      0.35        85
           5       0.67      0.55      0.60       110

    accuracy                           0.51       500
   macro avg       0.52      0.50      0.51       500
weighted avg       0.52      0.51      0.51       500



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
from textblob import TextBlob

In [21]:
class SentimentTransformer(TransformerMixin, BaseEstimator):
  def fit(self, x, y=None):
    return self

  def transform(self, texts):
    features = []
    for text in texts:
      blob = TextBlob(text)
      features.append({
          'polarity':binarize_num(blob.sentiment.polarity, 0.5),
          'subjectivity':binarize_num(blob.sentiment.subjectivity, 0.5)
      })

      return features

In [22]:
pipeline = Pipeline([
                     ('union', FeatureUnion(
                         transformer_list =[
                              ('bag-of-words', Pipeline([
                                                         ('selector', ItemSelector(keys='text')),
                                                         ('counts', CountVectorizer()),


                              ])),

                              ('votes', Pipeline([
                                                  ('selector', ItemSelector(keys=['funny', 'useful', 'cool'])),
                                                  ('votes_to_dict', VotesToDictTransformer()),
                                                  ('vectorizer', DictVectorizer()),
                              ])),

                              ('sentiments', Pipeline([
                                                       ('selector', ItemSelector(keys='text')),
                                                       ('sentiment_transform', SentimentTransformer()),
                                                       ('vectorizer', DictVectorizer())
                              ]))
                         ],

                         transformer_weights = {
                             'bag-of-words': 1.0,
                             'votes': 0.5,
                             'sentiments': 1.0,
                         },



                     )),
                     ('clf', LogisticRegression())
])

pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)
classification_report(predicted, y_test)

ValueError: ignored