In [78]:
from google.colab import files
# Upload a file through Colab's upload helper; the recorded output shows a
# file named kaggle.json being saved.
uploaded = files.upload()
# Recreate ~/.kaggle from scratch, then move the uploaded token into it.
# Assumes the upload landed in /content (the path used by the mv below).
!rm -rf ~/.kaggle
!mkdir -p ~/.kaggle
!mv /content/kaggle.json ~/.kaggle


Saving kaggle.json to kaggle.json


In [79]:
# Delete any .json files still sitting directly under /content
# (e.g. a leftover copy of the uploaded token).
!rm -rf /content/*.json

In [80]:
# Restrict the token file to owner read/write only.
!chmod 600 /root/.kaggle/kaggle.json
# Fetch the mirichoi0218/insurance dataset; the recorded output shows
# insurance.zip being written to /content.
# NOTE(review): --force presumably re-downloads even when the archive already
# exists — confirm against the Kaggle CLI documentation.
!kaggle datasets download -d mirichoi0218/insurance --force

Downloading insurance.zip to /content
  0% 0.00/16.0k [00:00<?, ?B/s]
100% 16.0k/16.0k [00:00<00:00, 26.9MB/s]


In [81]:
from zipfile import ZipFile
# Unpack every member of the downloaded archive into the current working
# directory; the context manager closes the archive afterwards.
with ZipFile("/content/insurance.zip", mode="r") as zipobj:
    zipobj.extractall(path=None)

In [82]:
import pandas as pd
import numpy as np

In [83]:
# Load the extracted CSV from the current working directory.
dataset = pd.read_csv("insurance.csv")

# Features are every column except the last; the target is the last column.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [84]:
# Show the feature matrix followed by the target vector, one per line.
print(X, y, sep="\n")

[[19 'female' 27.9 0 'yes' 'southwest']
 [18 'male' 33.77 1 'no' 'southeast']
 [28 'male' 33.0 3 'no' 'southeast']
 ...
 [18 'female' 36.85 0 'no' 'southeast']
 [21 'female' 25.8 0 'no' 'southwest']
 [61 'female' 29.07 0 'yes' 'northwest']]
[16884.92  1725.55  4449.46 ...  1629.83  2007.94 29141.36]


In [85]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical columns (positions 1, 4 and 5 of X) and pass
# every other column through unchanged.
#
# sparse_threshold=0 makes the transformer always return a dense array.
# OneHotEncoder emits a sparse matrix by default, and without this setting the
# ColumnTransformer decides between sparse and dense output from the overall
# density of the result. np.array() around a SciPy sparse matrix does not
# densify it (it yields a 0-d object array), so the dense result is requested
# explicitly instead of relying on that heuristic.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [1, 4, 5])],
    remainder="passthrough",
    sparse_threshold=0,
)

# NOTE: X is rebound to its encoded form here, so this cell is not idempotent —
# re-run the cell that loads X before running this one a second time.
X = np.array(ct.fit_transform(X))

# Names of the output columns, in the same order as the columns of the new X.
feature_names = ct.get_feature_names_out()

In [86]:
# Show the output column names reported by the fitted ColumnTransformer.
print(feature_names)

['encoder__x1_female' 'encoder__x1_male' 'encoder__x4_no'
 'encoder__x4_yes' 'encoder__x5_northeast' 'encoder__x5_northwest'
 'encoder__x5_southeast' 'encoder__x5_southwest' 'remainder__x0'
 'remainder__x2' 'remainder__x3']


In [87]:
# Inspect the first row of X to check it against the column names.
print(X[0])

[1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 19 27.9 0]


In [88]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [89]:
# Show the training features followed by the test features, one per line.
print(X_train, X_test, sep="\n")

[[1.0 0.0 1.0 ... 53 26.6 0]
 [0.0 1.0 1.0 ... 53 21.4 1]
 [0.0 1.0 1.0 ... 18 37.29 0]
 ...
 [1.0 0.0 0.0 ... 51 34.96 2]
 [1.0 0.0 0.0 ... 40 22.22 2]
 [0.0 1.0 1.0 ... 57 27.94 1]]
[[0.0 1.0 1.0 ... 19 35.53 0]
 [0.0 1.0 1.0 ... 57 31.54 0]
 [0.0 1.0 1.0 ... 51 37.0 0]
 ...
 [0.0 1.0 0.0 ... 47 38.94 2]
 [1.0 0.0 1.0 ... 19 36.575 0]
 [1.0 0.0 1.0 ... 35 26.125 0]]


In [90]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares linear model on the training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [91]:
# Predict on the held-out rows.
y_pred = regressor.predict(X_test)

# Print floats with two decimals. This changes NumPy's global print settings,
# so it also affects arrays printed by later cells.
np.set_printoptions(precision=2)

# Turn both 1-D vectors into single-column matrices and print them side by
# side: actual value on the left, prediction on the right.
y_pred_reshaped = y_pred.reshape(-1, 1)
y_test_reshaped = y_test.reshape(-1, 1)
print(np.hstack((y_test_reshaped, y_pred_reshaped)))

[[ 1646.43  4383.68]
 [11353.23 12885.04]
 [ 8798.59 12589.22]
 [10381.48 13286.23]
 [ 2103.08   544.73]
 [38746.36 32117.58]
 [ 9304.7  12919.04]
 [11658.12 12318.62]
 [ 3070.81  3784.29]
 [19539.24 29468.46]
 [12629.9  11002.81]
 [11538.42 17539.69]
 [ 6338.08  8681.35]
 [ 7050.64  8349.04]
 [ 1137.47  3130.13]
 [ 8968.33 10445.84]
 [21984.47  3863.74]
 [ 6414.18  6944.63]
 [28287.9  15009.63]
 [13462.52 14441.6 ]
 [ 9722.77 12543.66]
 [40932.43 32958.73]
 [ 8026.67  9072.64]
 [ 8444.47  8986.86]
 [ 2203.47  3022.86]
 [ 6664.69  8164.97]
 [ 8606.22  9556.08]
 [ 8283.68 10743.2 ]
 [ 5375.04  7694.02]
 [ 3645.09  4373.44]
 [11674.13 14140.94]
 [11737.85  5811.79]
 [24873.38 34631.91]
 [33750.29 27009.11]
 [24180.93 33348.14]
 [ 9863.47  9532.97]
 [36837.47 30421.65]
 [17942.11 26648.91]
 [11856.41 15157.78]
 [39725.52 33895.76]
 [ 4349.46  6303.39]
 [11743.93 14059.15]
 [19749.38 10713.45]
 [12347.17 15089.36]
 [ 4931.65  4187.95]
 [30260.   13106.43]
 [27724.29  4336.2 ]
 [34672.15 28

In [92]:
# Show the fitted coefficients (one per column of X) and then the intercept.
print(regressor.coef_, regressor.intercept_, sep="\n")

[   121.08   -121.08 -11893.24  11893.24    584.38    188.28   -454.
   -318.66    257.49    321.62    408.06]
-109.81988139935493


In [94]:
# Predict the charges for one person: a 32-year-old female smoker from the
# southeast with a BMI of 23.23 and 5 children.
#
# The record is written in the raw column order that `ct` was fitted on
# (age, sex, bmi, children, smoker, region) and encoded by the fitted
# transformer rather than hand-writing the 11 one-hot/passthrough values.
# That keeps the input correct if the encoder's categories or output column
# order ever change. dtype=object keeps the numbers and strings in one array
# without NumPy coercing everything to strings.
new_person = np.array([[32, "female", 23.23, 5, "yes", "southeast"]], dtype=object)
print(regressor.predict(ct.transform(new_person)))

[29201.77]
