In [0]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [2]:
df_music_train = pd.read_csv("https://raw.githubusercontent.com/reillynski/data301-finalproject/master/df_music.csv", index_col=0)
df_music_train.head()

Unnamed: 0,name,type,promoter.name,info,pleaseNote,priceMin,priceMax,subGenre,city,state,venueName,attractionName,artists,num.artists,latitude,longitude,venueUpcoming,meanPrice,date
0,"Lit In Ac 2020 With Lil Kim, Fat Joe, Ja Rule,...",event,PROMOTED BY VENUE,,,52.0,92.0,French Rap,Atlantic City,New Jersey,Boardwalk Hall,Lil Kim,"['Lil Kim', 'Fat Joe', 'Ja Rule', 'State Prope...",10,39.354905,-74.438391,15,72.0,2020-04-04
2,"Yo Gotti, Da Baby, Kevin Gates, Kash Doll & more",event,PROMOTED BY VENUE,,"Originally scheduled to take place Sunday, Mar...",54.0,154.0,French Rap,Detroit,Michigan,Little Caesars Arena,Yo Gotti,"['Yo Gotti', 'Kash Doll', 'DaBaby', 'Kevin Gat...",7,42.341089,-83.055434,26,104.0,2020-05-20
3,Spring MegaFest,event,PROMOTED BY VENUE,,,53.0,179.0,French Rap,Indianapolis,Indiana,Bankers Life Fieldhouse,Lil Baby,"['Lil Baby', '2 Chainz', 'Rod Wave', 'Jacquees...",5,39.764064,-86.155507,8,116.0,2020-04-10
4,No Limit Reunion Tour,event,PROMOTED BY VENUE,,Artists subject to change. All sales are final...,55.0,195.0,French Rap,Atlanta,Georgia,State Farm Arena,Master P,"['Master P', 'Mia X', 'Silkk the Shocker', 'My...",5,33.757796,-84.394569,21,125.0,2020-05-01
5,Feed The Streetz Tour 2020,event,PROMOTED BY VENUE,,Lineup subject to change.,75.0,175.0,French Rap,Brooklyn,New York,Barclays Center,Rick Ross,"['Rick Ross', 'Jeezy', '2 Chainz', 'Yo Gotti',...",9,40.683504,-73.976617,21,125.0,2020-05-15


In [0]:
#adjust some columns to make it easier to use in ml model 

#fill/replace NaNs
df_music_train["info"] = df_music_train["info"].fillna("")
df_music_train["pleaseNote"] = df_music_train["pleaseNote"].fillna("")
df_music_train["promoter.name"] = df_music_train["promoter.name"].fillna("None")

#create quantitative date column
df_music_train["date_quant"] = df_music_train["date"].map(lambda x: int(x[5:7]) * 30 + int(x[8:10]))

# priceMin Model

**Variable and Hyperparameter Testing**

The following are a series of functions to assist in determining the best combination of features to include in a given model. 

In [0]:
quantitative = ["num.artists", "latitude", "longitude", "venueUpcoming", "date_quant"]
categorical = ["promoter.name", "subGenre", "city", "state", "venueName", "attractionName"]
text = ["name", "info", "pleaseNote"]

In [0]:
#create pipeline with combos of quantitative/categorical
def create_pipeline(cat_vars, text_vars, model):
  if len(text_vars) == 3:  
    ct = make_column_transformer(
          (TfidfVectorizer(max_features=50, norm=None), "name"), 
          (TfidfVectorizer(max_features=50, norm=None), "info"), 
          (TfidfVectorizer(max_features=50, norm=None), "pleaseNote"),
          (OneHotEncoder(handle_unknown="ignore"), cat_vars), 
          remainder="passthrough"
    )
  elif len(text_vars) == 2: 
    ct = make_column_transformer(
          (TfidfVectorizer(max_features=50, norm=None), text_vars[0]), 
          (TfidfVectorizer(max_features=50, norm=None), text_vars[1]), 
          (OneHotEncoder(handle_unknown="ignore"), cat_vars), 
          remainder="passthrough"
    )
  elif len(text_vars) == 1: 
    ct = make_column_transformer(
          (TfidfVectorizer(max_features=50, norm=None), text_vars[0]), 
          (OneHotEncoder(handle_unknown="ignore"), cat_vars), 
          remainder="passthrough"
    )

  else: 
    ct = make_column_transformer(
          (OneHotEncoder(handle_unknown="ignore"), cat_vars), 
          remainder="passthrough"
    )

  pipeline = make_pipeline(
      ct, 
      StandardScaler(with_mean=False), 
      model
  )

  return pipeline 

In [0]:
#calculate cross-val error 
def calc_error(quant_features, cat_features, text_features, feature, model): 
  pipeline = create_pipeline(cat_features, text_features, model)

  all_features = quant_features + cat_features + text_features
  
  errs = -cross_val_score(pipeline, X=df_music_train[all_features], 
                          y=df_music_train[feature],
                          scoring="neg_mean_squared_error", cv=10)
  return np.sqrt(errs).mean()

In [0]:
#get error for different combinations of different types of variables 
def get_combo_errors(feature, model): 
  quant_temp = quantitative[:] 
  cat_temp = categorical[:]
  text_temp = text[:]

  qct_errs = pd.Series()
  while len(quant_temp) > 0: 
    while len(cat_temp) > 0:
      while len(text_temp) > 0: 
        full = [] 
        full = quant_temp + cat_temp + text_temp
    
        qct_errs[str(full)] = calc_error(quant_temp, cat_temp, text_temp, feature, model) 
        text_temp.pop()

      text_temp = text[:]
      full = [] 
      full = quant_temp + cat_temp + text_temp
    
      qct_errs[str(full)] = calc_error(quant_temp, cat_temp, text_temp, feature, model)
      cat_temp.pop()
  
    cat_temp = categorical[:]
    full = [] 
    full = quant_temp + cat_temp + text_temp
    qct_errs[str(full)] = calc_error(quant_temp, cat_temp, text_temp, feature, model)
    quant_temp.pop()

  return qct_errs

K-Nearest Neighbors Regression

In [8]:
knn_errs = get_combo_errors("priceMin", KNeighborsRegressor(n_neighbors=10))
knn_errs

['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info', 'pleaseNote']    14.815060
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info']                  15.081498
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name']                          14.847839
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info', 'pleaseNote']                      14.616319
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info']                                    14.597399
                                        

In [9]:
knn_errs.idxmin()

"['num.artists', 'latitude', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info', 'pleaseNote']"

In [10]:
#find optimal value of k
knn_best_vars = ['num.artists', 'latitude', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info', 'pleaseNote']

knn_ct = make_column_transformer(
    (TfidfVectorizer(max_features=50, norm=None), "name"), 
    (TfidfVectorizer(max_features=50, norm=None), "info"), 
    (TfidfVectorizer(max_features=50, norm=None), "pleaseNote"), 
    (OneHotEncoder(handle_unknown="ignore"), ["promoter.name", "subGenre", "city", "state", "venueName"]), 
    remainder= "passthrough"
)

knn_pipeline = make_pipeline(
    knn_ct, 
    StandardScaler(with_mean=False), 
    KNeighborsRegressor(n_neighbors=10)
)
clf = GridSearchCV(knn_pipeline,
                   param_grid={
                       "kneighborsregressor__n_neighbors": range(1, 20)
                       },
                   scoring="neg_mean_squared_error",
                   cv=10)

clf.fit(df_music_train[knn_best_vars], df_music_train["priceMin"])
clf.best_estimator_

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('tfidfvectorizer-1',
                                                  TfidfVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.float64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=1.0,
  

In [11]:
knn_final_pipeline = make_pipeline(
    knn_ct, 
    StandardScaler(with_mean=False), 
    KNeighborsRegressor(n_neighbors=9)
)

knn_final_errs = -cross_val_score(knn_final_pipeline, X=df_music_train[knn_best_vars], 
                          y=df_music_train["priceMin"],
                          scoring="neg_mean_squared_error", cv=10)
np.sqrt(knn_final_errs).mean()

14.572863656524973

Linear Regression

In [12]:
linear_errs = get_combo_errors("priceMin", LinearRegression())
linear_errs

['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info', 'pleaseNote']    17.755153
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info']                  19.823341
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name']                          36.634006
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info', 'pleaseNote']                      24.670809
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info']                                    31.669782
                                        

In [13]:
linear_errs.idxmin()

"['num.artists', 'latitude', 'promoter.name', 'name']"

In [14]:
linear_best_vars = ['num.artists', 'latitude', 'promoter.name', 'name']

linear_ct = make_column_transformer(
    (TfidfVectorizer(max_features=50, norm=None), "name"), 
    (OneHotEncoder(handle_unknown="ignore"), ["promoter.name"]), 
    remainder= "passthrough"
)

linear_final_pipeline = make_pipeline(
    linear_ct, 
    StandardScaler(with_mean=False), 
    LinearRegression()
)

linear_final_errs = -cross_val_score(linear_final_pipeline, X=df_music_train[linear_best_vars], 
                          y=df_music_train["priceMin"],
                          scoring="neg_mean_squared_error", cv=10)
np.sqrt(linear_final_errs).mean()

14.69988366914221

RandomForest Regression

In [15]:
rf_errs = get_combo_errors("priceMin", RandomForestRegressor())
rf_errs

['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info', 'pleaseNote']    14.038818
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info']                  14.074098
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name']                          13.991792
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info', 'pleaseNote']                      14.165035
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info']                                    13.995288
                                        

In [16]:
rf_errs.idxmin()

"['num.artists', 'latitude', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info', 'pleaseNote']"

In [17]:
rf_best_vars = ['num.artists', 'latitude', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info', 'pleaseNote']

rf_ct = make_column_transformer(
    (TfidfVectorizer(max_features=50, norm=None), "name"), 
    (TfidfVectorizer(max_features=50, norm=None), "info"),
    (TfidfVectorizer(max_features=50, norm=None), "pleaseNote"),
    (OneHotEncoder(handle_unknown="ignore"), ["promoter.name", "subGenre", "city", "state", "venueName", "attractionName"]), 
    remainder= "passthrough"
)

rf_final_pipeline = make_pipeline(
    rf_ct, 
    StandardScaler(with_mean=False), 
    RandomForestRegressor()
)

rf_final_errs = -cross_val_score(rf_final_pipeline, X=df_music_train[rf_best_vars], 
                          y=df_music_train["priceMin"],
                          scoring="neg_mean_squared_error", cv=10)
np.sqrt(rf_final_errs).mean()

13.72409064399621

Each model had a different set of variables that minimized error. 

Their respective included features and RMSEs are as follows:

KNeighborsRegressor(n_neighbors=8): 
*   num.artists, latitude, promoter.name, subGenre, city, state, venueName, name, info, pleaseNote
*   14.572863656524973

LinearRegressor:
*   num.artists, latitude, promoter.name, name
*   14.69988366914221

RandomForestRegressor: 
*   num.artists, latitude, promoter.name, subGenre, city, state, venueName, attractionName, name, info, pleaseNote
*   13.72409064399621


The RandomForest model minimized error the most with an RMSE of 13.72409064399621



# priceMax Model

In [0]:
quantitative = ["num.artists", "latitude", "longitude", "venueUpcoming", "date_quant"]
categorical = ["promoter.name", "subGenre", "city", "state", "venueName", "attractionName"]
text = ["name", "info", "pleaseNote"]

K-Nearest Neighbors Regressor

In [19]:
knn_errs_max = get_combo_errors("priceMax", KNeighborsRegressor(n_neighbors=10))
knn_errs_max

['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info', 'pleaseNote']     75.568314
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info']                   98.372027
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name']                           96.882677
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info', 'pleaseNote']                       77.060018
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info']                                    106.135867
                                   

In [20]:
knn_errs_max.idxmin()

"['num.artists', 'promoter.name', 'name']"

In [22]:
#find optimal value of k 
knn_max_best_vars = ['num.artists', 'promoter.name', 'name']

knn_max_ct = make_column_transformer(
    (TfidfVectorizer(max_features=50, norm=None), "name"), 
    (OneHotEncoder(handle_unknown="ignore"), ["promoter.name"]), 
    remainder= "passthrough"
)

knn_max_pipeline = make_pipeline(
    knn_max_ct, 
    StandardScaler(with_mean=False), 
    KNeighborsRegressor(n_neighbors=10)
)
clf_max = GridSearchCV(knn_max_pipeline,
                   param_grid={
                       "kneighborsregressor__n_neighbors": range(1, 20)
                       },
                   scoring="neg_mean_squared_error",
                   cv=10)

clf_max.fit(df_music_train[knn_max_best_vars], df_music_train["priceMax"])
clf_max.best_estimator_

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('tfidfvectorizer',
                                                  TfidfVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.float64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=1.0,
    

In [24]:
knn_max_final_pipeline = make_pipeline(
    knn_max_ct, 
    StandardScaler(with_mean=False), 
    KNeighborsRegressor(n_neighbors=10)
)

knn_max_final_errs = -cross_val_score(knn_max_final_pipeline, X=df_music_train[knn_max_best_vars], 
                          y=df_music_train["priceMax"],
                          scoring="neg_mean_squared_error", cv=10)
np.sqrt(knn_max_final_errs).mean()

70.4998378655803

Linear Regression

In [25]:
linear_errs_max = get_combo_errors("priceMax", LinearRegression())
linear_errs_max

['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info', 'pleaseNote']     76.529347
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info']                   80.943493
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name']                          115.614278
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info', 'pleaseNote']                       87.722006
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info']                                     93.931507
                                   

In [26]:
linear_errs_max.idxmin()

"['num.artists', 'promoter.name', 'name']"

In [27]:
linear_max_best_vars = ['num.artists', 'promoter.name', 'name']

linear_max_ct = make_column_transformer(
    (TfidfVectorizer(max_features=50, norm=None), "name"), 
    (OneHotEncoder(handle_unknown="ignore"), ["promoter.name"]), 
    remainder= "passthrough"
)

linear_max_final_pipeline = make_pipeline(
    linear_max_ct, 
    StandardScaler(with_mean=False), 
    LinearRegression()
)

linear_max_final_errs = -cross_val_score(linear_max_final_pipeline, X=df_music_train[linear_max_best_vars], 
                          y=df_music_train["priceMax"],
                          scoring="neg_mean_squared_error", cv=10)
np.sqrt(linear_max_final_errs).mean()

74.88816313574499

RandomForest Regression

In [28]:
rf_max_errs = get_combo_errors("priceMax", RandomForestRegressor())
rf_max_errs

['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info', 'pleaseNote']    67.979130
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name', 'info']                  64.107707
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name']                          64.331644
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info', 'pleaseNote']                      68.724208
['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'date_quant', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'name', 'info']                                    65.969952
                                        

In [29]:
rf_max_errs.idxmin()

"['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name']"

In [30]:
rf_max_best_vars = ['num.artists', 'latitude', 'longitude', 'venueUpcoming', 'promoter.name', 'subGenre', 'city', 'state', 'venueName', 'attractionName', 'name']

rf_max_ct = make_column_transformer(
    (TfidfVectorizer(max_features=50, norm=None), "name"), 
    (OneHotEncoder(handle_unknown="ignore"), ["promoter.name", "subGenre", "city", "state", "venueName", "attractionName"]), 
    remainder= "passthrough"
)

rf_max_final_pipeline = make_pipeline(
    rf_max_ct, 
    StandardScaler(with_mean=False), 
    RandomForestRegressor()
)

rf_max_final_errs = -cross_val_score(rf_max_final_pipeline, X=df_music_train[rf_max_best_vars], 
                          y=df_music_train["priceMax"],
                          scoring="neg_mean_squared_error", cv=10)
np.sqrt(rf_max_final_errs).mean()

63.43886369366631

Each model had a different set of variables that minimized error. 

Their respective included features RMSEs are as follows:

KNeighborsRegressor(n_neighbors=10): 
*   num.artists, promoter.name, name
*   70.4998378655803

LinearRegressor:
*   num.artists, promoter.name, name
*   74.88816313574499

RandomForestRegressor: 
*   num.artists, latitude, longitude, venueUpcoming, promoter.name, subGenre, city, state, venueName, attractionName, name
*   63.43886369366631

The RandomForest model minimized error the most with an RMSE of 63.43886369366631

# Predictions

Using a test data set in which the priceRanges feature was NaN, we will predict the priceMin and priceMax of each event using the model that minimized error the most for the particular variable. 

In [31]:
df_music_test = pd.read_csv("https://raw.githubusercontent.com/reillynski/data301-finalproject/master/df_music_test.csv", index_col=0)
df_music_test

Unnamed: 0,name,type,promoter.name,info,pleaseNote,subGenre,city,state,venueName,attractionName,artists,num.artists,latitude,longitude,venueUpcoming,date
0,Kevin Gates,event,,,,Hip-Hop/Rap,Baltimore,Maryland,Rams Head Live,Kevin Gates,['Kevin Gates'],1,39.297401,-76.607399,16,2020-06-09
1,NF - The Search Tour,event,LIVE NATION MUSIC,,,French Rap,Tulsa,Oklahoma,Brady Theater,NF,['NF'],1,36.158186,-95.995284,6,2020-04-10
2,NF - The Search Tour,event,LIVE NATION MUSIC,,,French Rap,Milwaukee,Wisconsin,Eagles Club/The Rave/Eagles Ballroom,NF,['NF'],1,43.038074,-87.943308,19,2020-04-16
3,NF - The Search Tour,event,LIVE NATION MUSIC,,,French Rap,Buffalo,New York,Buffalo RiverWorks,NF,['NF'],1,42.869917,-78.872638,1,2020-04-18
4,NF - The Search Tour,event,LIVE NATION MUSIC,,,French Rap,Kansas City,Missouri,Starlight Theatre,NF,['NF'],1,39.006963,-94.531517,49,2020-05-12
5,POSTPONED :: Watsky - Placement Album Tour,event,,Doors: 7 p.m. || Music: 8 p.m. || All Ages$20:...,,Alternative Rap,Lincoln,Nebraska,Bourbon Theatre,Watsky,"['Watsky', 'Feed the Biirds']",2,40.813344,-96.700617,56,2020-04-28
6,[POSTPONED] Watsky - Placement Album Tour,event,,"ALL AGESSHOW POSTPONED:Unfortunately, WATSKY a...",,Urban,Boise,Idaho,Knitting Factory Concert House - Boise,Watsky,"['Watsky', 'Hollis']",2,43.613149,-116.207134,56,2020-05-05
7,Pitbull,event,,,,Hip-Hop/Rap,Edinburg,Texas,Bert Ogden Arena,Pitbull,['Pitbull'],1,26.2938,-98.1548,11,2020-05-10
8,Pitbull,event,,,,Hip-Hop/Rap,Vienna,Virginia,Filene Center,Pitbull,['Pitbull'],1,38.9062,-77.294899,54,2020-08-30


In [0]:
#adjust some columns to make it easier to use in ml model 

#fill/replace NaNs
df_music_test["info"] = df_music_test["info"].fillna("")
df_music_test["pleaseNote"] = df_music_test["pleaseNote"].fillna("")
df_music_test["promoter.name"] = df_music_test["promoter.name"].fillna("None")

#create quantitative date column
df_music_test["date_quant"] = df_music_test["date"].map(lambda x: int(x[5:7]) * 30 + int(x[8:10]))

In [0]:
#make priceMin prediction
X_test_min = df_music_test[rf_best_vars]

rf_final_pipeline.fit(X=df_music_train[rf_best_vars], y=df_music_train["priceMin"])

df_music_test["priceMin"] = rf_final_pipeline.predict(X=X_test_min)

In [34]:
#make priceMax prediction
X_test_max = df_music_test[rf_max_best_vars]

rf_max_final_pipeline.fit(X=df_music_train[rf_max_best_vars], y=df_music_train["priceMax"])

df_music_test["priceMax"] = rf_max_final_pipeline.predict(X=X_test_max)
df_music_test

Unnamed: 0,name,type,promoter.name,info,pleaseNote,subGenre,city,state,venueName,attractionName,artists,num.artists,latitude,longitude,venueUpcoming,date,date_quant,priceMin,priceMax
0,Kevin Gates,event,,,,Hip-Hop/Rap,Baltimore,Maryland,Rams Head Live,Kevin Gates,['Kevin Gates'],1,39.297401,-76.607399,16,2020-06-09,189,35.8214,70.702
1,NF - The Search Tour,event,LIVE NATION MUSIC,,,French Rap,Tulsa,Oklahoma,Brady Theater,NF,['NF'],1,36.158186,-95.995284,6,2020-04-10,130,33.4613,77.2671
2,NF - The Search Tour,event,LIVE NATION MUSIC,,,French Rap,Milwaukee,Wisconsin,Eagles Club/The Rave/Eagles Ballroom,NF,['NF'],1,43.038074,-87.943308,19,2020-04-16,136,38.039,66.369
3,NF - The Search Tour,event,LIVE NATION MUSIC,,,French Rap,Buffalo,New York,Buffalo RiverWorks,NF,['NF'],1,42.869917,-78.872638,1,2020-04-18,138,37.2545,76.3297
4,NF - The Search Tour,event,LIVE NATION MUSIC,,,French Rap,Kansas City,Missouri,Starlight Theatre,NF,['NF'],1,39.006963,-94.531517,49,2020-05-12,162,36.9831,54.4925
5,POSTPONED :: Watsky - Placement Album Tour,event,,Doors: 7 p.m. || Music: 8 p.m. || All Ages$20:...,,Alternative Rap,Lincoln,Nebraska,Bourbon Theatre,Watsky,"['Watsky', 'Feed the Biirds']",2,40.813344,-96.700617,56,2020-04-28,148,31.127,42.36
6,[POSTPONED] Watsky - Placement Album Tour,event,,"ALL AGESSHOW POSTPONED:Unfortunately, WATSKY a...",,Urban,Boise,Idaho,Knitting Factory Concert House - Boise,Watsky,"['Watsky', 'Hollis']",2,43.613149,-116.207134,56,2020-05-05,155,34.082,67.8
7,Pitbull,event,,,,Hip-Hop/Rap,Edinburg,Texas,Bert Ogden Arena,Pitbull,['Pitbull'],1,26.2938,-98.1548,11,2020-05-10,160,35.5503,70.0334
8,Pitbull,event,,,,Hip-Hop/Rap,Vienna,Virginia,Filene Center,Pitbull,['Pitbull'],1,38.9062,-77.294899,54,2020-08-30,270,33.6946,35.0445
