Let's start with a regression problem. We'll use the California housing dataset, which can be fetched through scikit-learn's `datasets` module.

In [30]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the dataset with as_frame=True so the returned Bunch carries pandas objects.
california_housing = fetch_california_housing(as_frame=True)

# The 'frame' attribute holds the features and the target in a single DataFrame.
df = california_housing.frame

# Quick look at the data: first rows, summary statistics, schema, and size.
print(df.head())
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print(df.shape)

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.3333

In [33]:
# Assemble a single frame: the feature columns first, then the label appended
# under the column name "target".
cali_df = pd.DataFrame(
    california_housing["data"],
    columns=california_housing["feature_names"],
).assign(target=pd.Series(california_housing["target"]))
cali_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [34]:
# Number of rows (samples) in the assembled frame.
cali_df.shape[0]

20640

1) Ridge Regression Algorithm

In [40]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import numpy as np

In [41]:
# Seed NumPy's global random number generator so the randomised steps that
# follow start from a known state.
np.random.seed(seed=42)

In [42]:
# Features: every column except the label.
x = cali_df.drop(columns="target")
# Label: the "target" column on its own.
y = cali_df["target"]

In [43]:
# Hold out 20% of the rows for testing. Passing random_state pins the shuffle
# to this call, so the split no longer depends on the global NumPy RNG state
# (and therefore on which cells happened to run before this one).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [44]:
# Create a Ridge regressor with its default settings and fit it on the
# training split only; the test split stays unseen until scoring.
model = Ridge()
model.fit(X=x_train, y=y_train)

In [45]:
# Evaluate on the held-out split (for scikit-learn regressors, `score` reports R^2).
model.score(X=x_test, y=y_test)

0.5758549611440131

What if Ridge regression didn't work well enough? Or what if we wanted to improve our results? Let's try a different model: a random forest.

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np

In [47]:
# Re-seed the global NumPy generator so this experiment starts from the same
# random state as the previous one.
np.random.seed(seed=42)

In [48]:
# Rebuild the feature matrix (all columns but the label) and the label vector.
x, y = cali_df.drop(columns=["target"]), cali_df["target"]

In [49]:
# 80/20 train/test split. random_state is passed explicitly so the shuffle is
# fixed by this call itself rather than by whatever state the global NumPy RNG
# is in when the cell runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [51]:
# A random forest draws bootstrap samples at random while training, so an
# unseeded estimator gives a slightly different model (and score) on every run.
# Fixing random_state makes the fit repeatable regardless of cell execution order.
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

In [53]:
# Score the forest on the same held-out split used for the Ridge model
# (for scikit-learn regressors, `score` reports R^2).
model.score(X=x_test, y=y_test)

0.806652667101436

Whoa, simply changing the model boosts the score on the test set by about 0.23 (from roughly 0.58 to roughly 0.81).