In [1]:
%matplotlib inline

import nltk
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import json
from textblob import TextBlob
from sklearn.cross_validation import cross_val_score
import metrics
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
import text_processors
from progressbar import ProgressBar
import data_grab

import pymc3 as pm 

plt.rcParams["figure.figsize"] = (10, 8)

In [2]:
def contest_metric(numpy_array_predictions, numpy_array_actual_values):
    return metrics.weighted_rmsle(numpy_array_predictions, numpy_array_actual_values,
            weights=metrics.KEEPING_IT_CLEAN_WEIGHTS)

In [3]:
def contest_scoring(X, y, pipeline):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    s1 = pipeline.fit(X_train, y_train['score_lvl_1']).predict(X_test)
    s2 = pipeline.fit(X_train, y_train['score_lvl_2']).predict(X_test)
    s3 = pipeline.fit(X_train, y_train['score_lvl_3']).predict(X_test)
    results = np.dstack((s1, s2, s3))
    score = contest_metric(np.round(results[0]), np.array(y_test))
    print("Contest score of {}".format(score))
    return score

In [4]:
def score_model(X, y, pipeline):
    scores = cross_val_score(pipeline, X, y, cv=3, n_jobs=1, verbose=1)
    mean_score = np.mean(scores)
    std_dev_score = np.std(scores)
    print("CV score of {} +/- {}".format(mean_score, std_dev_score))

In [5]:
def extract_features(df):
    features = df.drop(['score_lvl_1', 'score_lvl_2', 'score_lvl_3'], axis=1)
    response = df[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']].astype(np.float64)  #for numerical progression
    # response = df[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']].astype(np.int8)  # for categorical response
    return features, response

In [46]:
df = pd.read_pickle('pickle_jar/review_text_sentiment_hierarchical_df')

In [47]:
df.previous_inspection_delta = df.previous_inspection_delta.fillna(0)
df.previous_inspection_delta = df.previous_inspection_delta.dt.days

In [48]:
# # time delta bins
# tdmax = df.review_delta.max()
# tdmin = df.review_delta.min()
# df['review_delta_bin'] = pd.cut(df["review_delta"], np.arange(tdmin, tdmax, 30))
# tdmax = df.previous_inspection_delta.max()
# tdmin = df.previous_inspection_delta.min()
# df['previous_inspection_delta_bin'] = pd.cut(df["previous_inspection_delta"], np.arange(tdmin, tdmax, 30))

In [49]:
scores = ['score_lvl_1', 'score_lvl_2', 'score_lvl_3']
# model_features = ['review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound']
model_features = ['review_delta', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound']
X, y = extract_features(df[model_features + scores].dropna())

In [50]:
print df.shape
print X.shape
print y.shape

(4071065, 140)
(1925254, 7)
(1925254, 3)


In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

# set classifiers to test
# estimator = LinearRegression()
estimator = RandomForestClassifier(n_jobs=-1, random_state=42)
# estimator = SGDClassifier(n_jobs=-1, random_state=42)
# estimator = BaggingClassifier(random_state=42)

# can use with text if convert X to dense with .toarray() but is super heavy on ram
pipeline = Pipeline([
        ('normalizer', Normalizer()),
        ('scaler', StandardScaler()),
        ('clf', estimator),
])

for score in scores:
    print(score)
    score_model(X, y[score], pipeline)

print
contest_scoring(X, y, pipeline)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.9s finished


score_lvl_1
CV score of 1.0 +/- 0.0

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.9s finished



score_lvl_2
CV score of 1.0 +/- 0.0

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.8s finished



score_lvl_3
CV score of 1.0 +/- 0.0

Contest score of 0.0


0.0

In [51]:
import statsmodels.formula.api as smf
model = smf.ols(formula='score_lvl_1 ~'+'+'.join(model_features), data=df[model_features + scores].dropna()).fit()
model.summary()

0,1,2,3
Dep. Variable:,score_lvl_1,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,93.56
Date:,"Sat, 11 Jul 2015",Prob (F-statistic):,3.76e-137
Time:,13:23:38,Log-Likelihood:,-5753400.0
No. Observations:,1925254,AIC:,11510000.0
Df Residuals:,1925246,BIC:,11510000.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,3.4146,0.449,7.600,0.000,2.534 4.295
review_delta,0.0001,5.1e-06,23.013,0.000,0.000 0.000
polarity,-0.0344,0.022,-1.538,0.124,-0.078 0.009
subjectivity,0.0540,0.025,2.168,0.030,0.005 0.103
neg,0.9708,0.455,2.132,0.033,0.079 1.863
pos,0.8400,0.451,1.862,0.063,-0.044 1.724
neu,0.8203,0.450,1.825,0.068,-0.061 1.701
compound,-0.0675,0.009,-7.234,0.000,-0.086 -0.049

0,1,2,3
Omnibus:,1115658.461,Durbin-Watson:,0.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13477102.476
Skew:,2.589,Prob(JB):,0.0
Kurtosis:,14.882,Cond. No.,294000.0


In [17]:
import statsmodels.api as sm
sm.stats.anova_lm(model, typ=2) # Type 2 ANOVA DataFrame


Unnamed: 0,sum_sq,df,F,PR(>F)
review_delta,19206.698621,1,841.97584,4.4336739999999994e-185
previous_inspection_delta,463551.215022,1,20320.979218,0.0
polarity,11.553256,1,0.506467,0.4766723
subjectivity,32.118953,1,1.408018,0.2353858
neg,90.939919,1,3.986589,0.04586396
pos,62.607017,1,2.744542,0.09758718
neu,58.17063,1,2.550062,0.1102901
compound,1005.620308,1,44.083995,3.14667e-11
Residual,43210335.339146,1894238,,


In [38]:
print model.params
A=np.identity(len(model.params)) # identity matrix with size = number of params
GroupTest=A[1:3,:] # for the categorical var., keep the corresponding rows of A
CovTest=A[3,:] # row for the continuous var.
print "Group effect test",model.f_test(GroupTest).fvalue
print "Covariate effect test",model.f_test(CovTest).fvalue

Intercept      -3.550546
review_delta    0.000927
polarity        0.294398
subjectivity   -0.430076
neg             8.069998
pos             8.031545
neu             7.440557
compound       -0.328166
dtype: float64
Group effect test [[ 242.21228155]]
Covariate effect test [[ 7.02122616]]


In [44]:
y

Unnamed: 0,score_lvl_1,score_lvl_2,score_lvl_3
2848354,0,0,0
2848355,0,0,0
2848356,0,0,0


In [40]:
from pymc3 import Model, Normal, HalfNormal
basic_model = Model()

with basic_model:

    # Priors for unknown model parameters
    alpha = Normal('alpha', mu=0, sd=10)
    beta = Normal('beta', mu=0, sd=10, shape=2)
    sigma = HalfNormal('sigma', sd=1)

    # Expected value of outcome
    mu = alpha + beta[0]*X.review_delta + beta[1]*X.polarity

    # Likelihood (sampling distribution) of observations
    Y_obs = Normal('Y_obs', mu=mu, sd=sigma, observed=Y.score_lvl_1)

NameError: name 'X1' is not defined