<a href="https://colab.research.google.com/github/ryonce/Daily-Projects/blob/main/First_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports

import pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import set_config
set_config(display='diagram')

In [2]:
# Read the dataset from its published Google Sheets CSV export

DATA_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSLwImhoEah5uQE9W77NSC1KSGfrhznh6Yhs0IWcSTcN-JdeRAn_1XoA11e2n1emoxAn0tfyiJcLwth/pub?output=csv'
df = pd.read_csv(DATA_URL)

# Preview the first five rows
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Count the rows that exactly repeat an earlier row

duplicate_mask = df.duplicated()
duplicate_mask.sum()

1

In [7]:
# Remove the repeated rows, then re-count to confirm none are left

df = df.drop_duplicates()

df.duplicated().sum()

0

In [8]:
# Print the column names, non-null counts and dtypes of the de-duplicated frame
# (used here to see which columns are numeric vs. object and whether any values are missing)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


In [9]:
# Summary statistics of the numeric columns; compare min/max against the
# quartiles to spot potential outliers

numeric_summary = df.describe()
numeric_summary

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


In [10]:
# Separate the target column ('charges') from the feature columns

y = df['charges']
X = df.drop(columns='charges')

# Split into train and test sets; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
# Instantiate the transformers

scaler = StandardScaler()
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so `sparse=False` raises a TypeError on current
# releases. Pick whichever keyword the installed version accepts so the cell
# runs on both old and new environments and still yields a dense array.
dense_kwarg = 'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'
ohe = OneHotEncoder(handle_unknown='ignore', **{dense_kwarg: False})

# Prepare separate processing pipelines for numeric and categorical data:
# numeric columns are mean-imputed then standardized; categorical columns are
# imputed with the most frequent value then one-hot encoded

num_pipe = make_pipeline(mean_imputer, scaler)
cat_pipe = make_pipeline(freq_imputer, ohe)

# Create ColumnSelectors for the numeric and categorical data

cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

# Combine the Pipelines and ColumnSelectors into tuples for the ColumnTransformer

cat_tuple = (cat_pipe, cat_selector)
num_tuple = (num_pipe, num_selector)

# Create the preprocessing ColumnTransformer; columns matched by neither
# selector are dropped

preprocessor = make_column_transformer(cat_tuple, num_tuple, remainder='drop')
preprocessor


In [12]:
# Build the full model: the preprocessing ColumnTransformer followed by a
# linear regression estimator, chained in a single Pipeline

linreg = LinearRegression()
linreg_pipe = make_pipeline(preprocessor, linreg)

# Display the assembled pipeline
linreg_pipe


In [13]:
# Fit the model pipeline on the training data only (X_test / y_test are not
# passed here, so they stay unseen until evaluation)

linreg_pipe.fit(X_train, y_train)



In [14]:
# Generate predictions for the train and test splits with the fitted pipeline

train_pred, test_pred = (
    linreg_pipe.predict(features) for features in (X_train, X_test)
)

In [19]:
# Score each split with the coefficient of determination (R^2)

train_r2 = r2_score(y_true=y_train, y_pred=train_pred)
test_r2 = r2_score(y_true=y_test, y_pred=test_pred)

# Report both scores rounded to three decimals
print(f"Train R-Squared: {round((train_r2), 3)}")
print(f"Test R-Squared: {round((test_r2), 3)}")

Train R-Squared: 0.73
Test R-Squared: 0.795
