In [1]:
%load_ext autoreload
%autoreload 2 

In [3]:
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve       
from sklearn.model_selection import train_test_split, GridSearchCV

from src.features import add_total_hours, is_restaurant
from src.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR
from src.plots import plot_coefficients

import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

In [3]:
df = pd.read_parquet(INTERIM_DATA_DIR / 'business.parquet')
df = add_total_hours(df)

X = df.drop(columns=['is_open', 'longitude', 'latitude', 'business_id', 'name', 'address', 'review_count', 'is_open', 'attributes', 'hours',])
y = df['is_open']   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
X

Unnamed: 0,city,state,postal_code,stars,categories,total_hours_open
0,Santa Barbara,CA,93101,5.0,"Doctors, Traditional Chinese Medicine, Naturop...",0.0
1,Affton,MO,63123,3.0,"Shipping Centers, Local Services, Notaries, Ma...",48.0
2,Tucson,AZ,85711,3.5,"Department Stores, Shopping, Fashion, Home & G...",100.0
3,Philadelphia,PA,19107,4.0,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",94.0
4,Green Lane,PA,18054,4.5,"Brewpubs, Breweries, Food",40.0
...,...,...,...,...,...,...
150341,Edmonton,AB,T6J 5H2,3.0,"Nail Salons, Beauty & Spas",61.0
150342,Nashville,TN,37204,4.0,"Pets, Nurseries & Gardening, Pet Stores, Hobby...",64.0
150343,Indianapolis,IN,46250,3.5,"Shopping, Jewelry, Piercing, Toy Stores, Beaut...",0.0
150344,Edwardsville,IL,62025,4.0,"Fitness/Exercise Equipment, Eyewear & Optician...",70.0


## Building CV for Parameter Search

In [5]:
model = LogisticRegression(class_weight = "balanced", max_iter=1000, penalty='l2')

column_trans = ColumnTransformer([
    ('categorical', OneHotEncoder(dtype='int'), ['city', 'state','postal_code']),
    ('numerical', StandardScaler(), ['total_hours_open', 'stars'])
     ])

pipe = Pipeline([
    ('preprocessor', column_trans),
    ('cat_encoder', YelpCategoryEncoder(threshold=100)),
    ('scaler', StandardScaler()),
    #('logreg', model)
])


pipe
pipe.fit(X_train, y_train)
# y_pred = pipe.predict(X_test)

# print(classification_report(y_test, y_pred))

AttributeError: 'csr_matrix' object has no attribute 'categories'

In [None]:
pipe

0,1,2
,steps,"[('preprocessor', ...), ('cat_encoder', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,'int'
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [None]:
X_train

Unnamed: 0,city,state,postal_code,stars,categories,total_hours_open
78491,Apollo Beach,FL,33572,4.5,"Beauty & Spas, Skin Care, Health & Medical, Ha...",30.0
19791,Largo,FL,33770,4.0,"Knife Sharpening, Home & Garden, Propane, Nurs...",63.0
79705,Philadelphia,PA,19143,2.0,"Dry Cleaning & Laundry, Local Services, Laundr...",0.0
49763,Boise,ID,83702,4.5,"Delis, Restaurants",0.0
72601,Tucson,AZ,85705,5.0,"Pet Services, Pets, Pet Sitting, Pet Groomers,...",44.0
...,...,...,...,...,...,...
119879,New Port Richey,FL,34652,2.5,"Food, Cosmetics & Beauty Supply, Shopping, Con...",103.0
103694,Newark,DE,19702,3.5,"Nail Salons, Beauty & Spas",69.0
131932,Indianapolis,IN,46226,3.0,"Home Services, Windows Installation, Contracto...",42.5
146867,Tucson,AZ,85704,2.5,"Home Services, Real Estate, Property Management",0.0


In [6]:
from src.modeling.train import df, pipe

In [3]:
df.head()

Unnamed: 0,city,state,postal_code,stars,review_count,is_open,categories,total_hours_open
0,Santa Barbara,CA,93101,5.0,7,0,"Doctors, Traditional Chinese Medicine, Naturop...",0.0
1,Affton,MO,63123,3.0,15,1,"Shipping Centers, Local Services, Notaries, Ma...",48.0
2,Tucson,AZ,85711,3.5,22,0,"Department Stores, Shopping, Fashion, Home & G...",100.0
3,Philadelphia,PA,19107,4.0,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",94.0
4,Green Lane,PA,18054,4.5,13,1,"Brewpubs, Breweries, Food",40.0


In [8]:
pipe = Pipeline([
    ('cat_encoder', YelpCategoryEncoder(threshold=100, col='categories')),
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('categorical', OneHotEncoder(dtype=int, handle_unknown='ignore'),
             ['city', 'state', 'postal_code']),
            ('numerical', StandardScaler(),
             ['total_hours_open', 'stars', 'review_count']),
        ],
        remainder='drop'
    )),
    # ('logreg', logit),
])

pipe.fit_transform(df)   # should now get past YelpCategoryEncoder.fit

NameError: name 'YelpCategoryEncoder' is not defined

In [13]:
pipe.fit_transform(df)

[autoreload of src.modeling.train failed: Traceback (most recent call last):
  File "/Users/spencervenancio/Downloads/projects/yelp_analysis/.venv/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/spencervenancio/Downloads/projects/yelp_analysis/.venv/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
  File "/opt/anaconda3/lib/python3.10/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/Users/spencervenancio/Downloads/projects/yelp_analysis/src/modeling/train.py", line 28, in <module>
    ("select_1d", FunctionTransformer(lambda X: np.asarray(X).ravel(), validate=False)),
NameError: 

ValueError: Specifying the columns using strings is only supported for dataframes.