In [1]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

In [2]:
# Read the location revenue CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/location_rev.csv")
df.head(n=5)

Unnamed: 0,revenue,num_competitors,median_income,num_loyalty_members,population_density,location_age
0,42247.8,3.0,30527.57,1407.0,3302.0,12.0
1,38628.37,3.0,30185.49,1025.0,4422.0,11.0
2,39715.16,1.0,32182.24,1498.0,3260.0,12.0
3,35593.3,5.0,29728.65,2340.0,4325.0,10.0
4,35128.18,4.0,30691.17,847.0,3774.0,11.0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the loaded frame.
df.describe()

Unnamed: 0,revenue,num_competitors,median_income,num_loyalty_members,population_density,location_age
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,38433.46933,3.397,31360.6685,1597.2,3351.199,11.041
std,11665.825242,1.016082,3943.278358,496.874663,975.664263,3.034885
min,5000.0,0.0,20000.0,0.0,0.0,0.0
25%,30277.8975,3.0,28792.5925,1253.0,2689.25,9.0
50%,38323.095,3.0,31134.555,1605.0,3353.0,11.0
75%,45894.67,4.0,34050.9925,1925.25,4017.0,13.0
max,79342.07,7.0,43676.9,3280.0,6489.0,24.0


In [4]:
# Predictor columns (in this order) and the regression target.
X = df.loc[:, [
    "num_competitors",
    "median_income",
    "population_density",
    "num_loyalty_members",
    "location_age",
]]
y = df["revenue"]

# Standardised copy of the predictors.
# NOTE(review): the scaler is fitted on every row (before any train/test
# split), and the visible split cell uses `X`, not `X_scale` -- confirm
# whether `X_scale` is consumed anywhere further down.
scaler = StandardScaler().fit(X)
X_scale = scaler.transform(X)

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import time

In [None]:
# SVR (linear kernel)
# SVR is sensitive to feature scale and the raw predictors differ by orders of
# magnitude, so standardisation is part of the estimator: inside GridSearchCV
# each fold fits the scaler on its own training portion only (no leakage into
# the validation fold).
svr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svr", SVR()),
])

# Pipeline parameters are addressed as "<step>__<param>".
# `gamma` is intentionally not searched: the linear kernel ignores it, so
# including it only doubled the number of fits without changing any score.
grid_params = {
    "svr__C": [0.1, 1, 10, 100, 1000],
    "svr__epsilon": [0.1, 1],
    "svr__kernel": ["linear"],
}
model_grid = GridSearchCV(svr_pipeline, grid_params, cv=5)
model_grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated R^2 (default scorer for regressors).
print("R2: ", model_grid.best_score_)
print("Parameters: ", model_grid.best_params_)

In [None]:
# Decision Tree
# `max_features="auto"` was deprecated in scikit-learn 1.1 and removed in 1.3
# (it now raises). For regressors it meant "use every feature", which is
# exactly what `None` does, so `None` replaces it here.
grid_params = {
    "max_depth": [5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_features": [None, "sqrt", "log2"],
}
# With "sqrt"/"log2" the tree samples candidate features at every split, so a
# fixed seed is required for the search result to be reproducible.
model_grid = GridSearchCV(DecisionTreeRegressor(random_state=42), grid_params, cv=5)
model_grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated R^2 (default scorer for regressors).
print("R2: ", model_grid.best_score_)
print("Parameters: ", model_grid.best_params_)

In [None]:
# Random Forest
# `max_features="auto"` was deprecated in scikit-learn 1.1 and removed in 1.3
# (it now raises). For RandomForestRegressor it meant "use every feature";
# `1.0` (100% of the features) is the documented replacement.
grid_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [1.0, "sqrt", "log2"],
}
# Fixed seed: bootstrapping and feature sub-sampling are random, so without it
# the reported best score/parameters change from run to run.
# The forest itself stays single-threaded; parallelism is applied at the search
# level instead (324 candidates x 5 folds) to avoid nested over-subscription.
model = RandomForestRegressor(n_jobs=1, random_state=42)
model_grid = GridSearchCV(model, grid_params, cv=5, n_jobs=-1)
model_grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated R^2 (default scorer for regressors).
print("R2: ", model_grid.best_score_)
print("Parameters: ", model_grid.best_params_)

In [None]:
# KNN
# KNN is distance-based, so predictors on large scales would dominate the
# neighbour search. Standardisation is part of the estimator so that, inside
# GridSearchCV, each fold fits the scaler on its own training portion only.
knn_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor()),
])

# Pipeline parameters are addressed as "<step>__<param>".
# - The estimator's parameter is `weights` (plural); the previous key `weight`
#   is not a valid parameter and made the search fail.
# - "minkowski" is omitted: with the default p=2 it is identical to
#   "euclidean", so it only repeated the same fits.
grid_params = {
    "knn__n_neighbors": [3, 5, 7, 9, 11, 13, 15],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"],
}

model_grid = GridSearchCV(knn_pipeline, grid_params, cv=5)
model_grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated R^2 (default scorer for regressors).
print("R2: ", model_grid.best_score_)
print("Parameters: ", model_grid.best_params_)