In [1]:
%matplotlib inline

import nltk
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import json
from textblob import TextBlob
from sklearn.cross_validation import cross_val_score
import metrics
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
import text_processors
from progressbar import ProgressBar
import data_grab
from time import time
from sklearn.feature_selection import VarianceThreshold

import pymc3 as pm 

plt.rcParams["figure.figsize"] = (10, 8)

In [2]:
def contest_metric(numpy_array_predictions, numpy_array_actual_values):
    return metrics.weighted_rmsle(numpy_array_predictions, numpy_array_actual_values,
            weights=metrics.KEEPING_IT_CLEAN_WEIGHTS)

In [3]:
def contest_scoring(X, y, pipeline):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    s1 = pipeline.fit(X_train, y_train['score_lvl_1']).predict(X_test)
    s2 = pipeline.fit(X_train, y_train['score_lvl_2']).predict(X_test)
    s3 = pipeline.fit(X_train, y_train['score_lvl_3']).predict(X_test)
    results = np.dstack((s1, s2, s3))
    score = contest_metric(np.round(results[0]), np.array(y_test))
    print("Contest score of {}".format(score))
    return score

In [4]:
from sklearn.metrics import accuracy_score

def raw_scoring(p1, p2, p3, ytrue):
    '''since cross_val_score doesn't allow you to round the results beforehand. also for pymc3 and other non-sklearn models'''
    score1 = accuracy_score(ytrue['score_lvl_1'], np.round(p1))
    print("Level 1 accuracy score of {}".format(score1))
    score2 = accuracy_score(ytrue['score_lvl_2'], np.round(p2))
    print("Level 2 accuracy score of {}".format(score2))
    score3 = accuracy_score(ytrue['score_lvl_3'], np.round(p3))
    print("Level 3 accuracy score of {}".format(score3))
    
    results = np.dstack((p1, p2, p3))[0]
    score = contest_metric(np.round(results), np.array(ytrue))
    print("Contest score of {}".format(score))
    
    rounded = np.clip(np.round(results), 0, np.inf)
    compare = pd.concat([pd.DataFrame(np.concatenate((results, rounded), axis=1)), ytrue.reset_index(drop=True)], axis=1)
    compare.columns = ['pred1','pred2','pred3','round1','round2','round3','true1','true2','true3']
    compare['offset1'] = compare.round1-compare.true1
    compare['offset2'] = compare.round2-compare.true2
    compare['offset3'] = compare.round3-compare.true3
        
    return score1, score2, score3, score, compare.head(10)

    
def raw_fit(X, y, pipeline):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
    
    p1 = pipeline.fit(xtrain, ytrain['score_lvl_1']).predict(xtest)
    p2 = pipeline.fit(xtrain, ytrain['score_lvl_2']).predict(xtest)
    p3 = pipeline.fit(xtrain, ytrain['score_lvl_3']).predict(xtest)
        
    return p1, p2, p3, ytest

In [5]:
def score_model(X, y, pipeline):
    scores = cross_val_score(pipeline, X, y, cv=3, n_jobs=1, verbose=1)
    mean_score = np.mean(scores)
    std_dev_score = np.std(scores)
    print("CV score of {} +/- {}".format(mean_score, std_dev_score))

In [6]:
def extract_features(df):
    features = df.drop(['score_lvl_1', 'score_lvl_2', 'score_lvl_3'], axis=1)
    response = df[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']].astype(np.int8)
    
    return features, response

In [7]:
from itertools import combinations

def multi_feature_test(X, y, pipeline, feature_list):
    t0 = time()
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
     
    combo_list = []
    for num in range(1, len(feature_list)+1):
        combo_list.extend([list(i) for i in combinations(feature_list, num)])
    
    temp_dict = {}
    for features in combo_list:
        p1 = pipeline.fit(xtrain, ytrain['score_lvl_1']).predict(xtest)
#         print("Level 1 accuracy score of {} for {}".format(accuracy_score(ytest['score_lvl_1'], np.round(p1)), features))
        temp_dict.update({tuple(features): accuracy_score(ytest['score_lvl_1'], np.round(p1))})
    print("{} seconds elapsed".format(time()-t0))
    return temp_dict

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to sklearn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text),
                 'num_sentences': text.count('.')}
                for text in posts]


class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Extract the subject & body from a usenet post in a single pass.

    Takes a sequence of strings and produces a dict of sequences.  Keys are
    `subject` and `body`.
    """
    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        features = np.recarray(shape=(len(posts),),
                               dtype=[('subject', object), ('body', object)])
        for i, text in enumerate(posts):
            headers, _, bod = text.partition('\n\n')
            bod = strip_newsgroup_footer(bod)
            bod = strip_newsgroup_quoting(bod)
            features['body'][i] = bod

            prefix = 'Subject:'
            sub = ''
            for line in headers.split('\n'):
                if line.startswith(prefix):
                    sub = line[len(prefix):]
                    break
            features['subject'][i] = sub

        return features

pipeline = Pipeline([
    # Extract the subject & body
    ('subjectbody', SubjectBodyExtractor()),

    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the post's subject line
            ('subject', Pipeline([
                ('selector', ItemSelector(key='subject')),
                ('tfidf', TfidfVectorizer(min_df=50)),
            ])),

            # Pipeline for standard bag-of-words model for body
            ('body_bow', Pipeline([
                ('selector', ItemSelector(key='body')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ])),

            # Pipeline for pulling ad hoc features from post's body
            ('body_stats', Pipeline([
                ('selector', ItemSelector(key='body')),
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),

        ],

        # weight components in FeatureUnion
        transformer_weights={
            'subject': 0.8,
            'body_bow': 0.5,
            'body_stats': 1.0,
        },
    )),

    # Use a SVC classifier on the combined features
    ('svc', SVC(kernel='linear')),
])

In [None]:
from sklearn.pipeline import FeatureUnion


pipe1 = ('tfidf',Pipeline([
            ('low_var_removal', VarianceThreshold()),
            ('scaler', StandardScaler(with_mean=False)),
        ])
    )

pipe2 = ('other_features', Pipeline([
            ('low_var_removal', VarianceThreshold()),
            ('normalizer', Normalizer()),
            ('scaler', StandardScaler()),
        ])
    )

pipe1.fit(tfidf, y.score_lvl_1)
pipe2.fit(others, y.score_lvl_1)

estimator = Perceptron(n_jobs=-1, random_state=42)

pipeline = Pipeline([
        ('union', FeatureUnion(transformer_list=[pipe1, pipe2])),
        ('estimator', estimator),
        ])




In [8]:
from sklearn.externals import joblib


df = pd.read_pickle('pickle_jar/review_text_sentiment_hierarchical_df')
# prep = pd.read_pickle('pickle_jar/preprocessed_review_text_hierarchical_df')
# df = pd.concat([df, prep.preprocessed_review_text], axis=1)
sim = pd.read_pickle('pickle_jar/similarity_vectors_df')
# tfidf = joblib.load('pickle_jar/tfidf_preprocessed_ngram3_sublinear_1mil_hierarchical_dropna')
# matrix = joblib.load('pickle_jar/similarity_matrix')

In [9]:
df.previous_inspection_delta = df.previous_inspection_delta.fillna(0)
df.previous_inspection_delta = df.previous_inspection_delta.dt.days.astype(float)

In [14]:
def get_out(x):
    try:
        return x[0]
    except:
        return x


topics = ['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold', ]
pbar = ProgressBar(maxval=len(topics)).start()
for index, i in enumerate(topics):
    sim[i] = sim[i].apply(get_out)
    pbar.update(index)
pbar.finish()

100% (53 of 53) |#########################| Elapsed Time: 0:03:48 Time: 0:03:48


In [16]:
df = pd.concat([df,sim[topics]],axis=1)

In [17]:
del sim

In [21]:
df.drop('review_text', axis=1, inplace=True)

In [25]:
df.drop(['review_date', 'user_id', 'restaurant_full_address', 'restaurant_name', 'inspection_date', 'inspection_id', 'inspection_date', 'sentiment', 'vader'], axis=1, inplace=True)

In [218]:
df.drop(['restaurant_neighborhood_1',
 'restaurant_neighborhood_2',
 'restaurant_neighborhood_3',
 'restaurant_category_1',
 'restaurant_category_2',
 'restaurant_category_3',
 'restaurant_category_4',
 'restaurant_category_5',
 'restaurant_category_6',
 'restaurant_category_7',], axis=1, inplace=True)

ValueError: labels ['restaurant_neighborhood_1' 'restaurant_neighborhood_2'
 'restaurant_neighborhood_3' 'restaurant_category_1'
 'restaurant_category_2' 'restaurant_category_3' 'restaurant_category_4'
 'restaurant_category_5' 'restaurant_category_6' 'restaurant_category_7'] not contained in axis

In [30]:
train = data_grab.get_selects('train')
dropped['restaurant_categories'] = train.restaurant_categories.apply(lambda x: sorted(x))
dropped['restaurant_neighborhoods'] = train.restaurant_neighborhoods.apply(lambda x: sorted(x))

In [303]:
dropped = df.dropna(subset=['manager'])

In [16]:
dropped['review_stars'] = dropped.review_stars.fillna(0).astype('category')
dropped[['user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot',
 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain',
 'user_compliments_profile', 'user_compliments_writer']] = dropped[['user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot',
 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain',
 'user_compliments_profile', 'user_compliments_writer']].fillna(0)


In [31]:
dropped = pd.read_pickle('pickle_jar/final_dropped')
# dropped.to_pickle('pickle_jar/final_dropped')

In [304]:
dropped.columns.tolist()

['restaurant_id',
 'review_stars',
 'review_votes_cool',
 'review_votes_funny',
 'review_votes_useful',
 'user_average_stars',
 'user_compliments_cool',
 'user_compliments_cute',
 'user_compliments_funny',
 'user_compliments_hot',
 'user_compliments_list',
 'user_compliments_more',
 'user_compliments_note',
 'user_compliments_photos',
 'user_compliments_plain',
 'user_compliments_profile',
 'user_compliments_writer',
 'user_fans',
 'user_name',
 'user_review_count',
 'user_votes_cool',
 'user_votes_funny',
 'user_votes_useful',
 'user_yelping_since',
 'restaurant_stars',
 'restaurant_attributes_accepts_credit_cards',
 'restaurant_attributes_ages_allowed',
 'restaurant_attributes_alcohol',
 'restaurant_attributes_attire',
 'restaurant_attributes_byob',
 'restaurant_attributes_byob_corkage',
 'restaurant_attributes_by_appointment_only',
 'restaurant_attributes_caters',
 'restaurant_attributes_coat_check',
 'restaurant_attributes_corkage',
 'restaurant_attributes_delivery',
 'restaurant_a

In [220]:
dropped.shape

(1923536, 176)

In [190]:
dropped.restaurant_attributes_ages_allowed.dropna().shape

(17971,)

In [195]:
dropped.restaurant_attributes_ages_allowed.value_counts(dropna=False, sort=True)

NaN        1905565
21plus       17606
allages        365
dtype: int64

In [197]:
dropped.restaurant_attributes_ages_allowed.dtypes

category

In [38]:
cats = []
for i in ['restaurant_category_1',
 'restaurant_category_2',
 'restaurant_category_3',
 'restaurant_category_4',
 'restaurant_category_5',
 'restaurant_category_6',
 'restaurant_category_7']:
    cats.extend(train[i].unique().tolist())
cats = set(cats)
cats.remove(np.nan)
cats = sorted(cats)

def proper_array(x, backfill_size=7):
    encoder_prep = lambda x: cats.index(x)
    temp = map(encoder_prep, x)
    zeros = np.zeros(backfill_size, dtype='int')
    zeros[:len(temp)] = temp
    return zeros

t = dropped.restaurant_categories.apply(proper_array)

In [40]:
np.vstack(t)

array([[ 11,  34,  63, ...,   0,   0,   0],
       [ 11,  34,  63, ...,   0,   0,   0],
       [ 11,  34,  63, ...,   0,   0,   0],
       ..., 
       [ 27,  79, 129, ...,   0,   0,   0],
       [ 27,  79, 129, ...,   0,   0,   0],
       [ 27,  79, 129, ...,   0,   0,   0]])

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer
enc = OneHotEncoder(sparse=False)
e = enc.fit_transform(np.vstack(t))

In [None]:
e

In [388]:
enc_label = LabelEncoder()
el = enc_label.fit_transform(np.vstack(t)[:,0])

In [14]:
# from scipy.sparse import coo_matrix, hstack

# others = df[['preprocessed_review_text', 'review_stars', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound', 'score_lvl_1', 'score_lvl_2', 'score_lvl_3']]
# # drop nan rows according to review_text column to match with dropna-tfidf. fill review_star still existing nan's with 0 and then drop the review_text column completely
# others.preprocessed_review_text = others.preprocessed_review_text.replace('', np.nan)
# others = others.dropna(subset=['preprocessed_review_text']).fillna(0).drop('preprocessed_review_text', axis=1)
# X = hstack([tfidf, others])
# y = others[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']]

In [15]:
# topics = ['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold', ]


# test = df[['review_delta', 'previous_inspection_delta', 'score_lvl_1', 'score_lvl_2', 'score_lvl_3']+topics]
# test = test.dropna()

In [16]:
# from scipy.sparse import csr_matrix, hstack

# def get_out(x):
#     try:
#         return x[0]
#     except:
#         return x

# topics = ['supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold', ]
# matrix = np.vstack(test.manager.apply(lambda x: x[0:2]))
# pbar = ProgressBar(maxval=len(topics)).start()
# for index, i in enumerate(topics):
#     t = np.vstack(test[i].apply(lambda x: x[0:2]))
#     matrix = np.concatenate((matrix, t), axis=1)
#     pbar.update(index)
# pbar.finish()
# matrix = csr_matrix(matrix)
# X = hstack([matrix, test[['review_delta', 'previous_inspection_delta']]])
# y = test[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']]

In [18]:
# from scipy.sparse import csr_matrix, hstack
# topics = ['supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold', ]
# matrix = csr_matrix(np.vstack(test.manager.apply(lambda x: x[0:5])))
# pbar = ProgressBar(maxval=len(topics)).start()
# for index,i in enumerate(topics):
#     t = np.vstack(test[i].apply(lambda x: x[0:5]))
#     matrix = hstack([matrix, t])
#     pbar.update(index)
# pbar.finish()


In [24]:
matrix = joblib.load('pickle_jar/similarity_matrix5')

In [40]:
print matrix.shape
print test.shape

(1923536, 265)
(1923536, 58)


In [42]:
# from scipy.sparse import csr_matrix, hstack
# X = hstack([matrix, test[['review_delta', 'previous_inspection_delta']]])
# y = test[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']]

In [44]:
# del matrix
# del df
# del test

In [111]:
def make_bins(df, bin_size=10):
    # time delta bins
    tdmax = df.review_delta.max()
    tdmin = df.review_delta.min()
    df['review_delta_bin'] = pd.cut(df["review_delta"], np.arange(tdmin, tdmax, bin_size))
    df['review_delta_bin_codes'] = df.review_delta_bin.astype('category').cat.codes
    tdmax = df.previous_inspection_delta.max()
    tdmin = df.previous_inspection_delta.min()
    df['previous_inspection_delta_bin'] = pd.cut(df["previous_inspection_delta"], np.arange(tdmin-1, tdmax, bin_size))
    df['previous_inspection_delta_bin_codes'] = df.previous_inspection_delta_bin.astype('category').cat.codes
    return df
df = make_bins(df)

In [34]:
scores = ['score_lvl_1', 'score_lvl_2', 'score_lvl_3']
# model_features = ['review_stars', 'review_delta', 'previous_inspection_delta','review_delta_bin', 'previous_inspection_delta_bin', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound']
# model_features = ['review_stars', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound']
model_features = ['review_delta', 'previous_inspection_delta']

X, y = extract_features(test[model_features + topics + scores].dropna())

print df.shape
print X.shape
print y.shape
# review_stars doesnt exist for every observation so reducing size even further

(4071065, 185)
(1923536, 55)
(1923536, 3)


In [None]:
# from sklearn.feature_selection import RFECV
# rfecv = RFECV(estimator=SGDClassifier(n_jobs=-1), scoring=mean_squared_error)
# rfecv.fit(X, y.score_lvl_1)

from sklearn.metrics import mean_squared_error

# from sklearn.svm import LinearSVC
# X_new = LinearSVC(C=0.01, penalty="l1", dual=False).fit_transform(X, y.score_lvl_1)

# from sklearn.feature_selection import RFE
# rfe = RFE(estimator=LinearRegression(), n_features_to_select=3, step=1)
# rfe.fit(X, y.score_lvl_1)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
test = SelectFwe(f_classif).fit_transform(X, y.score_lvl_1)

In [86]:
X_new = SelectKBest(f_classif, k=5).fit_transform(X, y.score_lvl_1)

In [93]:
X_new

array([2, 7, 6, 3, 4, 1, 1, 1, 5])

In [45]:
X

<1923536x267 sparse matrix of type '<type 'numpy.float64'>'
	with 504667441 stored elements in COOrdinate format>

In [46]:
t0 = time()

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB


# set classifiers to test
estimator = LinearRegression()
# estimator = RandomForestClassifier(n_jobs=-1, random_state=42)
# estimator = SGDClassifier(n_jobs=-1, random_state=42)
# estimator = Perceptron(n_jobs=-1, random_state=42)  # gets some nuances
# estimator = SGDRegressor() # gets some nuances
# estimator = KNeighborsClassifier()
# estimator = KNeighborsRegressor()  # gets some nuances
# estimator = DecisionTreeClassifier()
# estimator = DecisionTreeRegressor()
# estimator = GaussianNB()

pipeline = Pipeline([
        ('low_var_removal', VarianceThreshold()),
        ('normalizer', Normalizer()),
#         ('normalizer', Normalizer(norm='l2')), #  for text classification and clustering
#         ('scaler', StandardScaler()),
#         ('scaler', StandardScaler(with_mean=False)), #  for sparse matrix
        ('clf', estimator),
])

p1,p2,p3,ytest = raw_fit(X, y, pipeline)
raw_scoring(p1,p2,p3,ytest)


print("{} seconds elapsed".format(time()-t0))

# first representation of manager and mold plus deltas for SGDClassifier
# Level 1 accuracy score of 0.160254863959
# Level 2 accuracy score of 0.693050714933
# Level 3 accuracy score of 0.574146779681
# Contest score of 1.52839121903

# sparse matrix of just manager no-mean-false and the detla for sdg
# Level 1 accuracy score of 0.136340572778
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.41908388602

# first representation of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.0958193660009
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.2018434011

# first two representations of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.134818376157
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.73133292154

# first five representations of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.140464228379
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.43479163855

Level 1 accuracy score of 0.140464228379
Level 2 accuracy score of 0.693059032948
Level 3 accuracy score of 0.574184210745
Contest score of 1.43479163855
554.074513912 seconds elapsed


In [74]:
# baseline scores if guessing zero
guess = 0
for index, score in enumerate(scores):
    print("level {}: {}".format(index+1, y[score].value_counts(normalize=True)[guess]))

level 1: 0.22128560699
level 2: 0.692799495547
level 3: 0.573199692093


In [54]:
import statsmodels.formula.api as smf
model = smf.ols(formula='score_lvl_1 ~'+'+'.join(model_features), data=df[model_features + scores].dropna()).fit()
model.summary()

0,1,2,3
Dep. Variable:,score_lvl_1,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,2576.0
Date:,"Sat, 11 Jul 2015",Prob (F-statistic):,0.0
Time:,13:24:09,Log-Likelihood:,-5743500.0
No. Observations:,1925254,AIC:,11490000.0
Df Residuals:,1925245,BIC:,11490000.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,3.9054,0.447,8.737,0.000,3.029 4.782
review_delta,0.0001,5.08e-06,26.612,0.000,0.000 0.000
previous_inspection_delta,-0.0030,2.12e-05,-141.223,0.000,-0.003 -0.003
polarity,-0.0294,0.022,-1.321,0.187,-0.073 0.014
subjectivity,0.0467,0.025,1.886,0.059,-0.002 0.095
neg,0.9265,0.453,2.046,0.041,0.039 1.814
pos,0.7716,0.449,1.719,0.086,-0.108 1.651
neu,0.7430,0.447,1.661,0.097,-0.134 1.620
compound,-0.0587,0.009,-6.333,0.000,-0.077 -0.041

0,1,2,3
Omnibus:,1120039.389,Durbin-Watson:,0.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13578799.817
Skew:,2.602,Prob(JB):,0.0
Kurtosis:,14.925,Cond. No.,296000.0


In [90]:
from statsmodels.discrete.discrete_model import Poisson
model = Poisson(endog=y.score_lvl_1, exog=df[model_features].dropna()).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 3.559156
         Iterations 8


0,1,2,3
Dep. Variable:,score_lvl_1,No. Observations:,1781150.0
Model:,Poisson,Df Residuals:,1781141.0
Method:,MLE,Df Model:,8.0
Date:,"Sat, 11 Jul 2015",Pseudo R-squ.:,0.008638
Time:,16:24:54,Log-Likelihood:,-6339400.0
converged:,True,LL-Null:,-6394600.0
,,LLR p-value:,0.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
review_stars,-0.0185,0.000,-48.993,0.000,-0.019 -0.018
review_delta,2.759e-05,5.17e-07,53.338,0.000,2.66e-05 2.86e-05
previous_inspection_delta,-0.0008,2.54e-06,-309.734,0.000,-0.001 -0.001
polarity,0.0180,0.003,6.752,0.000,0.013 0.023
subjectivity,-0.0035,0.003,-1.061,0.289,-0.010 0.003
neg,1.5881,0.009,173.016,0.000,1.570 1.606
pos,1.6154,0.005,297.329,0.000,1.605 1.626
neu,1.6129,0.002,727.107,0.000,1.609 1.617
compound,-0.0023,0.001,-2.155,0.031,-0.004 -0.000


In [17]:
import statsmodels.api as sm
sm.stats.anova_lm(model, typ=2) # Type 2 ANOVA DataFrame


Unnamed: 0,sum_sq,df,F,PR(>F)
review_delta,19206.698621,1,841.97584,4.4336739999999994e-185
previous_inspection_delta,463551.215022,1,20320.979218,0.0
polarity,11.553256,1,0.506467,0.4766723
subjectivity,32.118953,1,1.408018,0.2353858
neg,90.939919,1,3.986589,0.04586396
pos,62.607017,1,2.744542,0.09758718
neu,58.17063,1,2.550062,0.1102901
compound,1005.620308,1,44.083995,3.14667e-11
Residual,43210335.339146,1894238,,


In [38]:
print model.params
A=np.identity(len(model.params)) # identity matrix with size = number of params
GroupTest=A[1:3,:] # for the categorical var., keep the corresponding rows of A
CovTest=A[3,:] # row for the continuous var.
print "Group effect test",model.f_test(GroupTest).fvalue
print "Covariate effect test",model.f_test(CovTest).fvalue

Intercept      -3.550546
review_delta    0.000927
polarity        0.294398
subjectivity   -0.430076
neg             8.069998
pos             8.031545
neu             7.440557
compound       -0.328166
dtype: float64
Group effect test [[ 242.21228155]]
Covariate effect test [[ 7.02122616]]


In [60]:
from pymc3 import Model, Normal, HalfNormal
basic_model = Model()

with basic_model:

    # Priors for unknown model parameters
    alpha = Normal('alpha', mu=0, sd=10)
    beta = Normal('beta', mu=0, sd=10, shape=2)
    sigma = HalfNormal('sigma', sd=1)

    # Expected value of outcome
    mu = alpha + beta[0]*X.review_delta + beta[1]*X.neg

    # Likelihood (sampling distribution) of observations
    Y_obs = Normal('Y_obs', mu=mu, sd=sigma, observed=y.score_lvl_1)

In [64]:
from pymc3 import find_MAP

from scipy import optimize

map_estimate = find_MAP(model=basic_model, fmin=optimize.fmin_powell)

print(map_estimate)

{'alpha': array(4.20236359846617), 'beta': array([  1.16082103e-04,   4.84806122e-01]), 'sigma_log': array(1.5699115275497426)}


In [67]:
from pymc3.glm import glm

with Model() as model_glm:
    glm('score_lvl_1 ~'+'+'.join(model_features), df[model_features + scores].dropna())