In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('datasets/50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [5]:
# looking into categorical columns
df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [6]:
df = pd.get_dummies(df, columns=['State'], prefix='state_', drop_first=True, dtype='int')

### Any idea why drop_first is set to True?
 - When you have *K* mutually exclusive categories, you actually only need *K-1* new dummy variables to encode the same information
 - Also keeping the other two columns its just gonna keep redundant information
    - Eg: If we have California=0 and Florida=0, then New York is 1: The extra dummy variable literally contains redundant information.

In [7]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,state__Florida,state__New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [14]:
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'state__Florida', 'state__New York'],
      dtype='object')

In [8]:
x = df.drop(columns=['Profit'], axis=1)
y = df['Profit']

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [10]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((40, 5), (40,), (10, 5), (10,))

In [11]:
model = LinearRegression()
model.fit(x_train, y_train)

In [12]:
y_pred = model.predict(x_test)

In [13]:
r2 = r2_score(y_pred, y_test)
mse = mean_squared_error(y_pred, y_test)
rmse = np.sqrt(mse)

print('r2 Score: ', r2)
print('mse score: ', mse)
print('rmse score: ', rmse)

r2 Score:  0.8851366465092358
mse score:  82010363.04430106
rmse score:  9055.957323458468


## R2 Score or R-Squared 
- An R-Squared value shows how well the model predicts the outcome of the dependent variable. R-Squared values range from 0 to 1.

- An R-Squared value of 0 means that the model explains or predicts 0% of the relationship between the dependent and independent variables.

## Next Week
- Essentials for machine learning
- Project - Predict Taxi Trip Duration
- Introduction to Logistic Regression
- Logistic regression from Scratch
