# Machine Learning for Absolute Beginners

## 0. Import Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## 1. Import and Verify Data

In [2]:
# read the housing CSV from the local data folder into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="data/calhousing.csv")

In [3]:
# preview the first five rows to sanity-check that the file loaded as expected
df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41,6.984127,1.02381,322,2.555556,37.88,-122.23,4.526
1,8.3014,21,6.238137,0.97188,2401,2.109842,37.86,-122.22,3.585
2,7.2574,52,8.288136,1.073446,496,2.80226,37.85,-122.24,3.521
3,5.6431,52,5.817352,1.073059,558,2.547945,37.85,-122.25,3.413
4,3.8462,52,6.281853,1.081081,565,2.181467,37.85,-122.25,3.422


In [4]:
# summarise the dataframe: column names, non-null counts, dtypes, and memory usage
# (useful for spotting missing values or unexpected dtypes before modelling)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  int64  
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  int64  
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   Price       20640 non-null  float64
dtypes: float64(7), int64(2)
memory usage: 1.4 MB


## 2. Setup Model

- Our target variable is ``Price``
- Our feature variables are ``MedInc``, ``HouseAge``, ``AveRooms``, ``AveBedrms``, ``Population``, ``AveOccup``, ``Latitude``, and ``Longitude``
- Our model type is multiple linear regression. 

In [6]:
# target vector: the Price column
y = df['Price']
# feature matrix: every column except the target
X = df.drop(columns=['Price'])

In [7]:
# hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

## 3. Fit Multiple Linear Regression Model to Data 

In [8]:
# instantiate the linear regression estimator, then fit it on the training split only
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

## 4. Evaluate Model Results

### Print Model Parameters

In [9]:
# show the fitted coefficients (one per feature column of X), then the intercept on its own line
print(model.coef_, model.intercept_, sep="\n")

[ 4.38284149e-01  9.02141800e-03 -1.13436409e-01  6.48834993e-01
 -7.98668870e-06 -3.97828751e-03 -4.21168217e-01 -4.34680482e-01]
-36.92588429915667


### R2 Score

In [10]:
# score the held-out test features with the fitted model to get predicted prices
y_predict = model.predict(X=X_test)

In [11]:
# coefficient of determination (R2) of the predictions against the true test targets
r2_score(y_true=y_test, y_pred=y_predict)

0.6010291338773821

## 5. Generate Predictions for New Data