<a href="https://colab.research.google.com/github/rajasreekalli/Data-Visualization/blob/main/FirstModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('/content/drive/MyDrive/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [4]:
df.shape

(1338, 7)

In [5]:
print(df.isna().sum().sum(), 'missing values')

0 missing values


In [6]:
y = df['charges'] 
X = df.drop(columns=['charges'])

In [7]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
X_train.shape

(1003, 6)

In [9]:
y_train.shape

(1003,)

In [10]:
#instantiate the selectors to for numeric and categorical data types
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')
#select the numeric columns of each type
num_columns = num_selector(X_train)
cat_columns = cat_selector(X_train)
#check our lists
print('numeric columns are', num_columns)
print('categorical columns are', cat_columns)

numeric columns are ['age', 'bmi', 'children']
categorical columns are ['sex', 'smoker', 'region']


Sex - nominal,
Smoker - nominal,
Region - nominal,
Age - numerical,
BMI - numerical,
Children - ordinal.

In [11]:
#select categorical columns
cat_data = X_train[cat_selector(X_train)]
cat_data

Unnamed: 0,sex,smoker,region
693,male,no,northwest
1297,female,no,southeast
634,male,no,southwest
1022,male,yes,southeast
178,female,no,southwest
...,...,...,...
1095,female,no,northeast
1130,female,no,southeast
1294,male,no,northeast
860,female,yes,southwest


In [12]:
#instantiate one hot encoder
ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_encoder.fit(cat_data)
cat_ohe = ohe_encoder.transform(cat_data)
cat_ohe

array([[0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 0., 1.]])

In [13]:
#convert to dataframe

df_ohe = pd.DataFrame(cat_ohe, columns=ohe_encoder.get_feature_names(cat_data.columns))
df_ohe



Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
998,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [14]:
X_train.reset_index(drop=True, inplace=True)
df_ohe.reset_index(drop=True, inplace=True)
X_train = pd.merge(X_train, df_ohe, left_index=True, right_index=True)
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,24,male,23.655,0,no,northwest,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,28,female,26.510,2,no,southeast,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,51,male,39.700,1,no,southwest,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,47,male,36.080,1,yes,southeast,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,46,female,28.900,2,no,southwest,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,18,female,31.350,4,no,northeast,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,39,female,23.870,5,no,southeast,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,58,male,25.175,0,no,northeast,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,37,female,47.600,2,yes,southwest,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [15]:
X_train = X_train.drop(columns=['sex', 'smoker', 'region']) 
X_train

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,24,23.655,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,28,26.510,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,51,39.700,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,47,36.080,1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,46,28.900,2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
998,18,31.350,4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,39,23.870,5,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,58,25.175,0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,37,47.600,2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [16]:
y_train

693      2352.96845
1297     4340.44090
634      9391.34600
1022    42211.13820
178      8823.27900
           ...     
1095     4561.18850
1130     8582.30230
1294    11931.12525
860     46113.51100
1126    10214.63600
Name: charges, Length: 1003, dtype: float64

In [17]:
reg = LinearRegression()

In [18]:
#train the model to learn the relationship between the features and the target
reg.fit(X_train, y_train)

LinearRegression()

In [19]:
train_score = reg.score(X_train, y_train)
print(train_score)

0.7449555328228536


In [20]:
cat_data_test = X_test[cat_selector(X_test)]
cat_data_test

Unnamed: 0,sex,smoker,region
764,female,no,northeast
887,female,no,northwest
890,female,yes,northwest
1293,male,no,northwest
259,male,yes,northwest
...,...,...,...
342,female,no,northeast
308,male,no,northeast
1128,male,no,southwest
503,male,yes,southeast


In [21]:
df_ohe_test = pd.DataFrame(cat_ohe, columns=ohe_encoder.get_feature_names(cat_data_test.columns))
df_ohe_test




Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
998,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [22]:
X_test.reset_index(drop=True, inplace=True)
df_ohe_test.reset_index(drop=True, inplace=True)
X_test = pd.merge(X_test, df_ohe_test, left_index=True, right_index=True)
X_test

Unnamed: 0,age,sex,bmi,children,smoker,region,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,45,female,25.175,2,no,northeast,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,36,female,30.020,0,no,northwest,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,64,female,26.885,0,yes,northwest,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,46,male,25.745,3,no,northwest,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,19,male,31.920,0,yes,northwest,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,60,female,27.550,0,no,northeast,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
331,58,male,34.865,0,no,northeast,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
332,34,male,32.800,1,no,southwest,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
333,19,male,30.250,0,yes,southeast,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [23]:
X_test = X_test.drop(columns=['sex', 'smoker', 'region']) 
X_test

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,45,25.175,2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,36,30.020,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,64,26.885,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,46,25.745,3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,19,31.920,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
330,60,27.550,0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
331,58,34.865,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
332,34,32.800,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
333,19,30.250,0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [24]:
test_score = reg.score(X_train, y_train)
print(test_score)

0.7449555328228536
