In [4]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as st
# import pymc3 as pm
import seaborn as sns
import wtascripts.wta_helpers as wta_helpers

# enables inline plots, without it plots don't show up in the notebook
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# %config InlineBackend.figure_format = 'png'
# mpl.rcParams['figure.dpi']= 300

In [5]:
# Sklearn specific imports
from sklearn.preprocessing import StandardScaler, LabelBinarizer, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(suppress=True)

In [7]:
# Load then clean dataset
df = wta_helpers.load_raw_df()
df = wta_helpers.clean_raw_df(df)

## Plan of Action
* Use multi-label binarizer :  fee column
* Use label binarizer : region
* Probably remove votes or countreports due to collinearity. Which one is better? I think votes is bad too due to direct tie to trail
  * A better model wouldn't have votes or count of reports - because those are not trail features

#### Make pipeline with this stuff

In [8]:
### Preprocessing

In [9]:
onehot_feats = ['Wildflowers/Meadows', 'Ridges/passes',
'Wildlife','Waterfalls','Old growth','Summits','Good for kids','Dogs allowed on leash',
'Fall foliage','Lakes','Rivers','Coast','Mountain views','Established campsites',]
# Not used: lat, long, fee, region, subregion, lengthtype

# cont_feats = ['votes', 'countreports', 'length', 'gain', 'hpoint']
cont_feats = ['votes', 'length', 'gain', 'hpoint']


oh_cols = df[onehot_feats]

#Standardize the non one-hot encoding variables into XS
s = StandardScaler()
XS = s.fit_transform(df[cont_feats])
#print(oh_cols)
# combine with the one hot encoding variables

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [17]:
df['fee'].str.split(",")

0       [Northwest Forest Pass]
1                        [None]
2          [National Park Pass]
3       [Northwest Forest Pass]
5       [Northwest Forest Pass]
6               [Discover Pass]
8       [Northwest Forest Pass]
12                       [None]
14                       [None]
23                       [None]
24                       [None]
25      [Northwest Forest Pass]
                 ...           
3395         [Sno-Parks Permit]
3402         [Sno-Parks Permit]
3403    [Northwest Forest Pass]
3407    [Northwest Forest Pass]
3408    [Northwest Forest Pass]
3413       [National Park Pass]
3465    [Northwest Forest Pass]
3509         [Sno-Parks Permit]
3512    [Northwest Forest Pass]
3522    [Northwest Forest Pass]
3528       [National Park Pass]
3535    [Northwest Forest Pass]
Name: fee, Length: 1150, dtype: object

In [98]:
# multilabelbinarizer for 'fees' column
mlb = MultiLabelBinarizer()
fees_processed = mlb.fit_transform(df['fee'].str.replace(", ",",").str.split(","))
mlb.fit_transform(df['fee'].str.replace(", ",",").str.split(","))
mlb.classes_

array(['Discover Pass', 'National Monument Fee', 'National Park Pass',
       'None', 'Northwest Forest Pass', 'Oregon State Parks Day-Use',
       'Refuge Entrance Pass', 'Sno-Parks Permit'], dtype=object)

In [24]:
# labelbinarizer for 'region' column
lb = LabelBinarizer()
region_processed = lb.fit_transform(df['region'])
lb.classes_
region_processed

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [25]:
X = np.concatenate([XS, oh_cols, fees_processed, region_processed], axis=1)
y = df['rating']

In [26]:
# X = make_pipeline(StandardScaler(df[cont_feats]), MultiLabelBinarizer(df['fee'].str.split(",")), LabelBinarizer(df['region']))

In [27]:
X_train, X_test, y_train, ytest = train_test_split(X, y)

reg = LinearRegression()

reg.fit(X_train, y_train)

print("intercept:",reg.intercept_)
print("coefficients,",reg.coef_)

y_pred = reg.predict(X_test)

print("RMSE:", np.sqrt(metrics.mean_squared_error(ytest, y_pred)))
print("MSE:", metrics.mean_squared_error(ytest, y_pred))
print("Score:", )

intercept: 3.3021683646725504
coefficients, [ 0.14228162  0.03322886 -0.00019636  0.04158784  0.02384419  0.06929321
 -0.16536648  0.18403333  0.03160041  0.21247978  0.180958    0.02202666
  0.08709014  0.09989567 -0.01591556  0.37751028  0.15103101  0.03811052
 -0.3588524   0.3515122  -0.16940734  0.54504354  0.05433978 -0.12913634
 -0.1175724   0.         -0.09471874 -0.0885485  -0.07236443 -0.01311358
  0.25030336 -0.20442514 -0.02299985 -0.01345088  0.00866608  0.11936192
 -0.09396924 -0.1345393   0.17653106]
RMSE: 0.6514168823160014
MSE: 0.42434395456629914
Score:


In [28]:
# I may not be creating my model correctly by leaving "fee" and "region" all alone

In [29]:
newlist = cont_feats + onehot_feats + list(mlb.classes_) + list(lb.classes_)
newlist

['votes',
 'length',
 'gain',
 'hpoint',
 'Wildflowers/Meadows',
 'Ridges/passes',
 'Wildlife',
 'Waterfalls',
 'Old growth',
 'Summits',
 'Good for kids',
 'Dogs allowed on leash',
 'Fall foliage',
 'Lakes',
 'Rivers',
 'Coast',
 'Mountain views',
 'Established campsites',
 ' Northwest Forest Pass',
 ' Sno-Parks Permit',
 'Discover Pass',
 'National Monument Fee',
 'National Park Pass',
 'None',
 'Northwest Forest Pass',
 'Oregon State Parks Day-Use',
 'Refuge Entrance Pass',
 'Sno-Parks Permit',
 'Central Cascades',
 'Central Washington',
 'Eastern Washington',
 'Issaquah Alps',
 'Mount Rainier Area',
 'North Cascades',
 'Olympic Peninsula',
 'Puget Sound and Islands',
 'Snoqualmie Region',
 'South Cascades',
 'Southwest Washington']

In [30]:
hold = sorted(zip(df[newlist], reg.coef_), key=lambda x: x[1], reverse=True)
print(hold)

KeyError: "[' Northwest Forest Pass' ' Sno-Parks Permit' 'Discover Pass'\n 'National Monument Fee' 'National Park Pass' 'None'\n 'Northwest Forest Pass' 'Oregon State Parks Day-Use'\n 'Refuge Entrance Pass' 'Sno-Parks Permit' 'Central Cascades'\n 'Central Washington' 'Eastern Washington' 'Issaquah Alps'\n 'Mount Rainier Area' 'North Cascades' 'Olympic Peninsula'\n 'Puget Sound and Islands' 'Snoqualmie Region' 'South Cascades'\n 'Southwest Washington'] not in index"

In [None]:
df.rating.plot.hist(50)

In [None]:
# votes, count of tripreports, mountains, waterfall, summits, then hpoint, then ridges/passes

In [None]:
sns.pairplot(df[(df.votes > 3) & (df.length < 60) & (df.hpoint < 9000)],
             x_vars=['votes','length','gain','hpoint','countreports'],
             y_vars='rating')

In [None]:
# votes and count reports are highly correlated
plt.scatter(df['votes'], df['countreports']);

In [None]:
cont_cols= ['votes', 'countreports', 'length', 'gain', 'hpoint', 'Wildflowers/Meadows', 'Ridges/passes',
            'Wildlife','Waterfalls','Old growth','Summits','Good for kids','Dogs allowed on leash',
            'Fall foliage','Lakes','Rivers','Coast','Mountain views','Established campsites',]

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

cont_cols = ['votes', 'countreports', 'length', 'gain', 'hpoint']
oh_cols = df[onehot_feats]

#Standardize the non one-hot encoding variables into XS
s = StandardScaler()
XS = s.fit_transform(X)
#print(oh_cols)
# combine with the one hot encoding variables

X = np.concatenate([XS, oh_cols], axis=1)
y = df['rating']

In [None]:
newlist = cont_cols + onehot_feats
newlist

In [None]:
d = {i: len(df[i].unique()) for i in df.columns}
d