In [9]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

## Summary

|Model| Comments|
|-----|---------|
| Simple LR w/o scaling| Test $R^2$ = 4.1%|
| Lasso w/o scaling| Test $R^2$ = 3.5%|
| Ridge w/o scaling| Test $R^2$ = 3.7%|
| ElasticNet w/o scaling| Test $R^2$ = 3.4%|
| LR with Minmax(0,1) scaling| Test $R^2$ = 3.7%|
| Lasso with Minmax(0,1) scaling| Test $R^2$ = 0.9%|
| Ridge with Minmax(0,1) scaling| Test $R^2$ = 3.6%|
| ElasticNet with Minmax(0,1) scaling| Test $R^2$ = 0.1%|
| LR with Standard scaling| Test $R^2$ = 4.0%|
| Lasso with Standard scaling| Test $R^2$ = 3.3%|
| Ridge with Standard scaling| Test $R^2$ = 3.7%|
| Lasso with Standard scaling and gridsearch| Test $R^2$ = 3.3%|
| Ridge with Standard scaling and gridsearch| Test $R^2$ = 4.2%|
| Random Forest with Standard scaling | Test $R^2$ = 28.5%|


> Simple LR or Ridge regression is best without feature selection or any data imputation
> PCA will have reduced the number of columns from 260 to 190 with 99% variance.
> Random forest works better than others and can be improved with gridsearch but will take lot of time to train.


In [2]:
df = pd.read_csv("../data/train_data.zip")

In [3]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


## Function for pre-processing biba data

In [4]:
from collections import defaultdict
def dict_to_columns_df(col, key, val):
    """
    This functions takes a dataframe column which is in the
    form of list of dictionaries and creates a dataframe
    from the keys of the in the inner list of dictionaries 
    e.g. "[{'key': A, 'val': 1}, {'key': B, 'val': 2}]"
    
    Parameters
    ----------------
    col : DataFrame Series, the columns whose values are the in the format
    of a list of dictionaries.
    
    key : the keys in the inner dictionary from which column names are to be extracted
    
    val : the keys in the inner dictionary from which values in the column needs to
    be extracted
    
    
    Returns
    ----------------
    DataFrame
        With the new columns created from the keys of the inner dictionary
        
    """
    key_list = set()
    i=0
    # getting all the new column names
    while i < len(col):
        if type(col[i]) != float:
            dic_list = eval(col[i]) #converting col value from string to list
            for dic in range(len(dic_list)):
                if re.match('[a-zA-Z]', dic_list[dic][str(key)][0]): #removing spanish names
                    key_list.add("monthly_"+dic_list[dic][str(key)])
        i+=1
    
    all_cols_dict = defaultdict(list)
    
    i = 0
    while i < len(col):
        if type(col[i]) != float:
            dic_list = eval(col[i]) #converting col value from string to list

            for col_names in list(key_list):
                flag = 0 #to check if a column name exists in the dictionary
                for dic in range(len(dic_list)):
                    if dic_list[dic][str(key)] == col_names[8:]: #getting values from the inner dictionary matching the key
                        all_cols_dict[col_names].append(dic_list[dic][str(val)]) #putting inner dict values to new default dict
                        flag = 1
                        break
                
                if flag==0:
                    all_cols_dict[col_names].append(None)

        else:
            for col_names in list(key_list):
                all_cols_dict[col_names].append(None)

        i+=1
    new_cols_df = pd.DataFrame(all_cols_dict)
    
    # checking new df has same number of columns as given column
    if new_cols_df.shape[0] == col.shape[0]:
        return new_cols_df
    else:
        print("Column dimensions don't match")


def biba_pp(full_data):  
    
    """
    Performs the pre-processing of the columns for the biba data
    
    Paramters
    ---------------
    
    full_data : DataFrame, with no operations done on the biba columns
    
    Returns
    ---------------
    DataFrame
        with processed biba columns
    
    """
    biba_games_df = pd.DataFrame()
    biba_games_df = pd.concat([full_data.loc[:, 'monthly_number_of_sessions':'distance_to_nearest_bus_stop'],
                               full_data.loc[:, 'historic_number_of_sessions':'historic_snow']], axis = 1)
    
    #extracting categorical features
    categorical_features = biba_games_df.loc[:, biba_games_df.dtypes == "object"]
     
    # creating cols from list of dictionaries
    monthly_survey_df = dict_to_columns_df(categorical_features['monthly_survey'], 'question', 'avg_answer')
    monthly_weekday_counts_df = dict_to_columns_df(categorical_features['monthly_weekday_counts'], 'weekday', 'count')
    
    biba_games_df = pd.concat([biba_games_df, monthly_survey_df, monthly_weekday_counts_df], axis = 1)
    
    #dropping categorical features
    biba_games_df = biba_games_df.drop(columns = list(categorical_features.columns))
    
    #dropping historic hours with low fill rate
    numerical_cols_to_remove = ['historic_hour_0', 'historic_hour_23', 'historic_hour_22', 'historic_hour_21',
                                'historic_hour_7','historic_hour_6','historic_hour_5','historic_hour_4', 
                                'historic_hour_3','historic_hour_2','historic_hour_1', 'MonthYear']
    
    biba_games_df = biba_games_df.drop(columns = numerical_cols_to_remove)
    
    impute_biba_games_df =  biba_games_df.fillna(0)
    
    #removing the previous columns in the input data
    cols_to_drop = list(df.loc[:, 'monthly_number_of_sessions': 'distance_to_nearest_bus_stop'].columns) +\
                    list(df.loc[:, 'historic_number_of_sessions' : 'historic_snow'].columns)
    
    
    full_data = full_data.drop(columns = cols_to_drop)
    
    #adding processed columns
    full_data = pd.concat([full_data, impute_biba_games_df], axis = 1)
    
    return full_data

In [18]:
def preprocess_weather(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to weather information (`Democrats_08_Votes` to
    the end + `climate`). Impute NaN of `Number_of_holidays`
    by using the values the we have for the same month,
    impute NaN of `Green_2016` by using values found online, or 0,
    and replace remaining NaN values with 0.
    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
    Returns
    -------
    output_data : pandas.core.frame.DataFrame
    """
    df_weather = input_data.loc[:, 'Democrats_08_Votes':]
    df_weather['state'] = input_data['state']
    df_weather['climate'] = input_data['climate']
    df_weather['external_id'] = input_data['external_id']
    df_weather['month'] = input_data['month']
    df_weather['year'] = input_data['year']
    #fill up NaNs for `Number_of_holidays` column
    #I sorted the values so that the values are ordered by time, and the NaNs are at the end of each time period
    df_weather = df_weather.sort_values(['month', 'year', 'Number_of_holidays'])
    df_weather['Number_of_holidays'] = df_weather['Number_of_holidays'].fillna(method='ffill')
    #fill up NaNs for the `Green_2016` column
    #I only found values for Alaska and North Carolina, so I just put 0 for the other states
    df_weather['Green_2016'] = np.where(
     df_weather['state'] == 'Alaska', 5735,
         np.where(
            df_weather['state'] == 'North Carolina', 12105,
             np.where(
                df_weather['Green_2016'].isnull(), 0, df_weather['Green_2016']
             )
         )
    )
    df_weather['climate'] = df_weather['climate'].fillna(df_weather['climate'].mode()[0])
    #Substitute every remaining NaNs by 0
    df_weather = df_weather.fillna(value=0)
    output_data = input_data.copy()
    output_data.loc[:, 'Democrats_08_Votes':] = df_weather.loc[:, 'Democrats_08_Votes':]
    output_data['climate'] = df_weather['climate']
    return output_data

In [19]:
def preprocess_neighbour(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to locale information (`city` to
    `houses_per_sq_km`). Drop columns with >30%
    NaN values and replace remaining NaN values with 0.
    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
    Returns
    -------
    output_data : pandas.core.frame.DataFrame
    """
    df_neighbour = input_data.loc[:, 'city':'houses_per_sq_km']
    missing = df_neighbour.isna()
    # Count number of missing values for each column
    num_missing = missing.sum().sort_values(ascending=False)
    # Calculate proportion of missing values for each column
    prop_missing = num_missing / df.shape[0]
    # Create a list of columns with >30% of values missing
    to_drop = prop_missing[prop_missing > 0.3].index.to_list()
    # Add `country` to the list since all playgrounds are in the U.S.
    # Add `city` and `county` since lat. and long. should take care of them
    to_drop.append('country')
    to_drop.append('city')
    to_drop.append('county')
    # Drop columns with names in list
    output_data = input_data.drop(to_drop, axis=1)
    # Fill in remaining NaN values in locale-related columns with 0
    to_impute = prop_missing[(0 < prop_missing) & (prop_missing <= 0.3)].index.to_list()
    to_impute.remove('city')
    to_impute.remove('county')
    to_impute.remove('climate')
    output_data[to_impute] = output_data[to_impute].fillna(0)
    #output_data.loc[:, to_impute] = output_data.loc[:, to_impute].fillna(0)
    return output_data

In [20]:
clean_df1 = biba_pp(df)

In [21]:
clean_df2 = preprocess_weather(clean_df1)

In [22]:
clean_df3 = preprocess_neighbour(clean_df2)

ValueError: list.remove(x): x not in list

In [14]:
clean_df3

Unnamed: 0,external_id,month,year,B20004e10,B11016e1,B12001e12,B20004e11,B19125e1,B12001e13,B23008e22,...,monthly_Wednesday,monthly_Sunday,monthly_Monday,monthly_Thursday,monthly_Friday,monthly_Saturday,monthly_Tuesday,A,C,D
0,1900203,3,2019,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
1,1900203,6,2018,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
2,1900203,8,2018,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
3,MR00101775,1,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,MR00101775,8,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50115,MR00101116,4,2019,84079,2295,361,135721,145595,1965,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
50116,FM00171280,2,2018,37473,2460,1097,20307,46094,1052,65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
50117,FM00167991,7,2018,48462,2005,772,62037,93603,1443,42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
50118,MR00098241,6,2019,40571,2104,700,0,36340,834,308,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0


In [56]:
# Treating categorical features
categorical_features = biba_games_df.loc[:, biba_games_df.dtypes == "object"]
categorical_features.loc[categorical_features['monthly_weekday_counts'].notnull(), 'monthly_weekday_counts']
categorical_features.loc[categorical_features['monthly_survey'].notnull(), 'monthly_survey']

19       [{'question': 'allages', 'avg_answer': 5.0}, {...
34       [{'question': 'variety', 'avg_answer': 3.0}, {...
35       [{'question': 'condition', 'avg_answer': 3.0},...
69       [{'question': 'accessible', 'avg_answer': 3.5}...
70       [{'question': 'cleanliness', 'avg_answer': 4.0...
                               ...                        
50057    [{'question': 'variety', 'avg_answer': 5.0}, {...
50058    [{'question': 'regular', 'avg_answer': 0.5}, {...
50083    [{'question': 'accessible', 'avg_answer': 5.0}...
50100    [{'question': 'revisit', 'avg_answer': 1.0}, {...
50114    [{'question': 'safety', 'avg_answer': 0.0}, {'...
Name: monthly_survey, Length: 4798, dtype: object

In [51]:
eval(categorical_features.loc[69, 'monthly_weekday_counts'])

[{'weekday': 'Sunday', 'count': 1},
 {'weekday': 'Monday', 'count': 8},
 {'weekday': 'Thursday', 'count': 1},
 {'weekday': 'Saturday', 'count': 1}]

In [52]:
monthly_survey_df = dict_to_columns_df(categorical_features['monthly_survey'], 'question', 'avg_answer')
monthly_weekday_counts_df = dict_to_columns_df(categorical_features['monthly_weekday_counts'], 'weekday', 'count')

In [55]:
monthly_weekday_counts_df[monthly_weekday_counts_df['monthly_Monday'].notnull()]

Unnamed: 0,monthly_Thursday,monthly_Saturday,monthly_Monday,monthly_Tuesday,monthly_Friday,monthly_Wednesday,monthly_Sunday
19,2.0,6.0,3.0,2.0,,1.0,1.0
35,,2.0,1.0,,,,
69,1.0,1.0,8.0,,,,1.0
70,,2.0,1.0,4.0,,5.0,3.0
128,,,1.0,,,,
...,...,...,...,...,...,...,...
49937,,4.0,2.0,,1.0,,
49969,2.0,2.0,1.0,,4.0,3.0,2.0
50027,,2.0,1.0,,,,
50045,1.0,,1.0,,2.0,2.0,1.0


In [57]:
# Adding the new columns to whole dataframe

biba_games_df = pd.concat([biba_games_df, monthly_survey_df, monthly_weekday_counts_df], axis = 1)

In [58]:
# Dropping the original catergorical columns with dictionaries
biba_games_df = biba_games_df.drop(columns = list(categorical_features.columns))
biba_games_df.head()

Unnamed: 0,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_slide_single,...,monthly_cleanliness,monthly_regular,monthly_condition,monthly_Thursday,monthly_Saturday,monthly_Monday,monthly_Tuesday,monthly_Friday,monthly_Wednesday,monthly_Sunday
0,3,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,,,,,,,,,,
1,6,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,,,,,,,,,,
2,8,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,,,,,,,,,,
3,1,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,,,,,,,,,,
4,8,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,,,,,,,,,,


In [60]:
def show_scores(model, X, y, show = True):
    """
    Shows classification and regression scores
    
    Parameters
    ----------
    model: The sklearn model
    X: numpy.ndarray        
        The X part of the data
    y: numpy.ndarray
        The y part of the data
    Returns
    -------
        rmse: (float)
        r2: (float)
            
    """        
        
    y_preds = model.predict(X)                 
    rmse = mean_squared_error(y, y_preds, squared=False)
    r2 = r2_score(y, y_preds)
    if show: 
        print("Root mean squared error: %0.3f and r^2 score: %0.3f" % (rmse,r2))
    return rmse, r2

In [61]:
impute_biba_games_df.shape

(50120, 264)

## Modelling with all biba variables without any changes

In [63]:
X = impute_biba_games_df.drop(columns = ['target'])
y = impute_biba_games_df['target']

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2020)

### 1. Linear Regression

In [65]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print('Simple linear regression scores: ')
print('Train error: ')
show_scores(lr, X_train, y_train)

print('Test error: ')    
show_scores(lr, X_test, y_test)

Simple linear regression scores: 
Train error: 
Root mean squared error: 455.458 and r^2 score: 0.090
Test error: 
Root mean squared error: 699.593 and r^2 score: 0.041


(699.5930797124148, 0.04053007326469338)

### Observations 

- Very poor model with $R^2 = 4.1 \%$

### 2. SVR

In [27]:
svr = SVR()
svr.fit(X_train, y_train)
print('Simple linear regression scores: ')
print('Train error: ')
show_scores(svr, X_train, y_train)

print('Test error: ')    
show_scores(svr, X_test, y_test)

Simple linear regression scores: 
Train error: 
Root mean squared error: 483.367 and r^2 score: -0.024
Test error: 
Root mean squared error: 719.283 and r^2 score: -0.014


(719.2832836305175, -0.01423896628865462)

### Observations
- Very long train runtime. Not feasible on whole dataset
- Very Poor model with negative $R^2$


### 3. Lasso L1

In [30]:
lasso_lr = Lasso(max_iter = 2000, random_state = 2020)
lasso_lr.fit(X_train, y_train)
print('Lasso regression scores: ')
print('Train error: ')
show_scores(lasso_lr, X_train, y_train)

print('Test error: ')
show_scores(lasso_lr, X_test, y_test)

Lasso regression scores: 
Train error: 
Root mean squared error: 458.633 and r^2 score: 0.078
Test error: 
Root mean squared error: 701.577 and r^2 score: 0.035


  positive)


(701.5773181102657, 0.03507971362012974)

### Observations
- Same performance as simple LR 

### 4. Ridge L2

In [66]:
ridge_lr = Ridge(max_iter=2000, random_state = 2020)
ridge_lr.fit(X_train, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(ridge_lr, X_train, y_train)

print('Test error: ')    
show_scores(ridge_lr, X_test, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 455.481 and r^2 score: 0.090
Test error: 
Root mean squared error: 699.447 and r^2 score: 0.041


  overwrite_a=True).T


(699.4472076641025, 0.04093014941199169)

### Observations

- Same performance as Simple LR

### 4. ElasticNet L1 and L2 Regression

In [33]:
elastic_lr = ElasticNet(max_iter= 2000, random_state = 2020)
elastic_lr.fit(X_train, y_train)
print('Elastic regression scores: ')
print('Train error: ')
show_scores(elastic_lr, X_train, y_train)

print('Test error: ')    
show_scores(elastic_lr, X_test, y_test)

Elastic regression scores: 
Train error: 
Root mean squared error: 460.542 and r^2 score: 0.070
Test error: 
Root mean squared error: 702.101 and r^2 score: 0.034


  positive)


(702.100988553132, 0.03363870689591786)

### Observations

- Same performance as Simple LR

## Step 2: Scaling all columns with MinMaxScaler

In [40]:
scaler = MinMaxScaler((0,1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Observations:
- Simple LR - slight improvement from last case
- Lasso performs worse than previous case
- Ridge performs same as w/o scaling
- Elastic net performs worse than all models


### Step 3: Scaling all columns with Normalizer

In [49]:
scaler = Normalizer()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Observation
- All model performs worse than the minmax scaler

### Step 4: Scaling all columns with StandardScaler

In [67]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Observation
- Standard Scaler works best for all models among other scalers
- Moving on to grid search

In [85]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
print('Simple linear regression scores: ')
print('Train error: ')
show_scores(lr, X_train_scaled, y_train)

print('Test error: ')    
show_scores(lr, X_test_scaled, y_test)

Simple linear regression scores: 
Train error: 
Root mean squared error: 455.465 and r^2 score: 0.090
Test error: 
Root mean squared error: 699.617 and r^2 score: 0.040


(699.6173545864126, 0.04046348765561891)

In [82]:
params = {'alpha' : [0.001, 0.1, 1, 10, 100]}


In [83]:
ridge_lr = Ridge(max_iter=2000, random_state = 2020)

clf_ridge = GridSearchCV(ridge_lr, params, cv =5)

clf_ridge.fit(X_train_scaled, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(clf_ridge, X_train_scaled, y_train)

print('Test error: ')
show_scores(clf_ridge, X_test_scaled, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 455.757 and r^2 score: 0.089
Test error: 
Root mean squared error: 699.204 and r^2 score: 0.042


(699.2038649219163, 0.04159736800138891)

In [84]:
clf_ridge.best_params_

{'alpha': 100}

## PCA on this data

In [87]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA().fit(X_scaled)
| Ridge with Standard scaling and gridsearch| Test $R^2$ = 4.2%|
 = pca.transform(X_scaled)

In [98]:
for i in range(len(pca.explained_variance_ratio_)):
    if np.sum(pca.explained_variance_ratio_[0:i]) >=0.99:
        print("Number of principal components to get 99% variance = ", i)
        break

Number of principal components to get 99% variance =  190


In [99]:
for i in range(len(pca.explained_variance_ratio_)):
    if np.sum(pca.explained_variance_ratio_[0:i]) >=0.95:
        print("Number of principal components to get 95% variance = ", i)
        break

Number of principal components to get 95% variance =  135


### Observation
- PCA will not help in this case

## Trying Random Forest regression

In [103]:
params = {'max_depth' : [10, 100, 1000], 'n_estimators' : [100, 1000]}
rf = RandomForestRegressor(max_depth = 30, random_state = 2020)

# clf_rf = GridSearchCV(rf, params, cv =4)

rf.fit(X_train_scaled, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(rf, X_train_scaled, y_train)

print('Test error: ')
show_scores(rf, X_test_scaled, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 139.954 and r^2 score: 0.914
Test error: 
Root mean squared error: 615.272 and r^2 score: 0.258


(615.2720957918488, 0.257878872860947)

## RF with PCA columns

In [107]:
Z_99_per_var = Z[:, 0:190]

X_train, X_test, y_train, y_test = train_test_split(Z_99_per_var, y, test_size = 0.2, random_state = 2020)

In [108]:
rf = RandomForestRegressor(max_depth = 50, random_state = 2020)

# clf_rf = GridSearchCV(rf, params, cv =4)

rf.fit(X_train, y_train)
print('Ridge regression scores: ')
print('Train error: ')
show_scores(rf, X_train, y_train)

print('Test error: ')
show_scores(rf, X_test, y_test)

Ridge regression scores: 
Train error: 
Root mean squared error: 155.343 and r^2 score: 0.894
Test error: 
Root mean squared error: 681.761 and r^2 score: 0.089


(681.7608337063552, 0.08881942403011678)