In [62]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.stem.porter import PorterStemmer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [63]:
def get_review_df(path='../Task_6/data/hygiene.dat', stemmer=None):
  """Load the review file and return one cleaned, stemmed document per line.

  For every line of the file, all characters other than ASCII letters and
  whitespace are removed, the remainder is split on whitespace, each token is
  stemmed on its own, and the stemmed tokens are re-joined with single spaces.

  Args:
    path: Text file to read, one document per line. Defaults to the review
      file used by this notebook.
    stemmer: Any object exposing ``stem(word) -> str``. When omitted, a new
      ``PorterStemmer`` is created.

  Returns:
    A DataFrame with a single ``Stemmed_Reviews`` column containing one row
    per input line, in file order. A line that contains no letters yields an
    empty string rather than being dropped, so row positions keep matching
    line positions in the file.
  """
  if stemmer is None:
    stemmer = PorterStemmer()

  stemmed_corpus = []
  with open(path, 'r') as f:
    for line in f:
      letters_only = re.sub(r'[^A-Za-z\s]', '', line)
      # stem() works on a single word, so each token is stemmed separately;
      # passing the whole line would treat it as one long "word".
      stemmed_corpus.append(
        ' '.join(stemmer.stem(token) for token in letters_only.split())
      )

  return pd.DataFrame({'Stemmed_Reviews': stemmed_corpus})

In [64]:
def get_y(path='../Task_6/data/hygiene.dat.labels.txt'):
  """Read the binary labels from the labels file.

  Every ``'0'`` or ``'1'`` character found anywhere in the file contributes one
  label, in file order; all other characters (including whole lines that
  contain neither digit) are ignored.

  Args:
    path: Labels file to read. Defaults to the labels file used by this
      notebook.

  Returns:
    A list of ints, each either 0 or 1.
  """
  # The `with` block closes the file; no explicit close() is needed.
  with open(path, 'r') as f:
    return [int(ch) for line in f for ch in line if ch in ('0', '1')]

In [65]:
## Getting data and labels

# Review text: one stemmed document per row.
df_reviews = get_review_df()

# Side information: only the first four comma-separated fields are read and
# named here. Rating is parsed as a float and then rounded to the nearest
# integer so it can be treated as a categorical value downstream.
df_additional = pd.read_csv(
  '../Task_6/data/hygiene.dat.additional',
  sep=",",
  usecols=[0,1,2,3],
  names=['Categories', 'Zip_Code', 'Num_Reviews', 'Rating'],
  dtype={'Categories': str, 'Zip_Code': str, 'Num_Reviews': str, 'Rating': float},
).assign(Rating=lambda frame: frame['Rating'].round().astype(int))

# join() aligns the two frames on their row index.
df = df_additional.join(df_reviews)

labels = get_y()

# shuffle=False keeps file order: the leading fraction of rows becomes the
# training set and everything after it the test set.
# NOTE(review): 0.04098052 presumably marks where the labelled rows end - confirm.
X_train, X_test, y_train, y_test = train_test_split(
  df,
  labels,
  train_size=0.04098052,
  shuffle=False,
)

In [66]:
# Taken from https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
class ClfSwitcher(BaseEstimator):
  """Thin wrapper that delegates every call to a swappable inner estimator.

  Because the wrapped estimator is a constructor parameter, a parameter grid
  can replace it through ``<step>__estimator`` and tune its settings through
  ``<step>__estimator__<param>``.
  """

  # NOTE: the default SGDClassifier() is created once, when this `def` is
  # executed, so every ClfSwitcher() built without an argument wraps that same
  # instance. Pass an estimator explicitly when independent instances matter.
  def __init__(self, estimator = SGDClassifier()):
    """Store the estimator to delegate to.

    Args:
      estimator: Object providing ``fit`` and ``predict`` (and optionally
        ``predict_proba`` / ``score``).
    """
    self.estimator = estimator

  def fit(self, X, y=None, **kwargs):
    """Fit the wrapped estimator and return ``self``.

    Args:
      X: Training features, passed through unchanged.
      y: Training targets, passed through unchanged.
      **kwargs: Extra fit parameters, forwarded to the wrapped estimator's
        ``fit``.
    """
    # Forward fit parameters instead of silently discarding them.
    self.estimator.fit(X, y, **kwargs)
    return self

  def predict(self, X, y=None):
    """Return the wrapped estimator's predictions for ``X``; ``y`` is ignored."""
    return self.estimator.predict(X)

  def predict_proba(self, X):
    """Return the wrapped estimator's ``predict_proba(X)``.

    Only usable when the wrapped estimator itself provides ``predict_proba``.
    """
    return self.estimator.predict_proba(X)

  def score(self, X, y):
    """Return the wrapped estimator's own ``score(X, y)``."""
    return self.estimator.score(X, y)

In [67]:
# Feature extraction followed by a swappable classifier.
# Text columns are selected with a bare string so each vectorizer receives a
# 1-D column of documents; Rating is selected with a list so the encoder
# receives a 2-D input.
pipeline = Pipeline([
  ('preprocessor', ColumnTransformer(
    [
      ('cv1', CountVectorizer(), 'Categories'),
      ('cv2', CountVectorizer(), 'Zip_Code'),
      ('cv3', CountVectorizer(), 'Num_Reviews'),
      # handle_unknown='ignore': a Rating value that is absent from the data
      # the encoder was fitted on (e.g. one cross-validation training fold) is
      # encoded as all zeros instead of raising at transform time.
      # NOTE: on scikit-learn >= 1.2 the `sparse` argument is named
      # `sparse_output`; adjust if the installed version rejects `sparse`.
      ('onehot', OneHotEncoder(dtype='int', categories='auto', sparse=False, handle_unknown='ignore'), ['Rating']),
      ('tfidf', TfidfVectorizer(strip_accents='unicode', max_features=100000, stop_words='english'), 'Stemmed_Reviews')
    ],
    remainder= 'passthrough')
  ),
  ('clf', ClfSwitcher())
])

# Fixed seed so SGD's data shuffling, and therefore the ranking produced by
# the search, is repeatable from run to run.
SGD_RANDOM_STATE = 42

# TF-IDF document-frequency cut-offs searched for both classifier families.
tfidf_grid = {
  'preprocessor__tfidf__max_df': (0.5, 0.75, 0.9),
  'preprocessor__tfidf__min_df': (1,10,20),
}

parameters = [
  {
    **tfidf_grid,
    'clf__estimator': [SGDClassifier(random_state=SGD_RANDOM_STATE)], # SVM if hinge loss / logreg if log loss
    'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
    'clf__estimator__tol': [1e-4],
    # NOTE: on scikit-learn >= 1.3 the 'log' loss is spelled 'log_loss'.
    'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
  },
  {
    **tfidf_grid,
    'clf__estimator': [MultinomialNB()],
    'clf__estimator__alpha': (1e-1, 1e-2, 1e-3),
  },
]
# Exhaustive search over both parameter grids, 3-fold CV, ranked by macro F1.
gscv = GridSearchCV(
  pipeline,
  parameters,
  cv=3,
  verbose=1,
  scoring='f1_macro',
)
gscv.fit(X_train, y_train)
results = gscv.cv_results_

# Report column -> key in cv_results_.
column_sources = {
  'Rank': 'rank_test_score',
  'Mean F1 Score': 'mean_test_score',
  'Params': 'params',
  'Mean Fit Time': 'mean_fit_time',
}
results_df = pd.DataFrame(
  {label: results[key] for label, key in column_sources.items()}
)
# Best candidate (rank 1) first.
results_df = results_df.sort_values(by=['Rank'])
results_df.to_html('results.html')

Fitting 3 folds for each of 108 candidates, totalling 324 fits
