In [2]:
# Installing xgboost for our notebook instance
!pip3 install xgboost==0.82

Collecting xgboost==0.82
  Downloading xgboost-0.82-py2.py3-none-manylinux1_x86_64.whl (114.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.0/114.0 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.2.0
    Uninstalling xgboost-1.2.0:
      Successfully uninstalled xgboost-1.2.0
Successfully installed xgboost-0.82


In [184]:
#importing necessary libraries
import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.cloud import bigquery

In [185]:
# BigQuery has made many datasets publicly available for your exploration. For this lab, we use the public natality sample table (publicdata.samples.natality).

In [186]:
# Select the label (weight_pounds) and the four feature columns for births
# recorded after the year 2000, capped at 10,000 rows.
# NOTE(review): there is no ORDER BY, so the 10,000 rows returned are presumably
# not guaranteed to be the same on every run — confirm if reproducibility matters.
query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""

In [187]:
# Run the query through a BigQuery client and load the result into a DataFrame.
client = bigquery.Client()
df = client.query(query).to_dataframe()
# Preview the first rows of the result.
df.head(n=5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,7.561856,False,18,1,39
1,4.750962,True,22,1,40
2,7.374463,False,28,1,37
3,7.813183,False,31,1,41
4,3.688334,False,29,2,35


In [188]:
# Count how many rows fall into each value of the is_male flag.
is_male_counts = df['is_male'].value_counts()
is_male_counts

True     5152
False    4848
Name: is_male, dtype: Int64

In [189]:
# Number of missing values in each column.
df.isna().sum()

weight_pounds       7
is_male             0
mother_age          0
plurality           0
gestation_weeks    72
dtype: int64

In [190]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis='index')

In [191]:
# Re-count missing values per column to confirm the drop left none behind.
df.isna().sum()

weight_pounds      0
is_male            0
mother_age         0
plurality          0
gestation_weeks    0
dtype: int64

In [192]:
# Show the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,7.561856,False,18,1,39
1,4.750962,True,22,1,40
2,7.374463,False,28,1,37
3,7.813183,False,31,1,41
4,3.688334,False,29,2,35


In [193]:
# Randomly reorder the rows; the fixed seed keeps the ordering repeatable.
df = shuffle(df, random_state=42)

In [194]:
# Show the first five rows after shuffling (the index now reflects the new order).
df.head(n=5)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
5309,7.813183,True,34,1,39
514,7.828615,False,35,1,39
4359,7.716179,False,24,1,38
8445,6.311835,True,23,1,39
9104,4.373971,False,37,3,34


In [141]:
#--------------------------

In [90]:
# query1="""
# SELECT
#   count(plurality),
# FROM
#   publicdata.samples.natality
#   WHERE year >2000
# """

In [91]:
# df = bigquery.Client().query(query1).to_dataframe()
# df.head()

Unnamed: 0,f0_
0,33271914


In [98]:
# query2="""
# SELECT
#   count(plurality),
# FROM
#   publicdata.samples.natality
#   WHERE year = 2000
# """

In [99]:
# df = bigquery.Client().query(query2).to_dataframe()

Unnamed: 0,f0_
0,4063823


In [100]:
# df

Unnamed: 0,f0_
0,4063823


In [None]:
#----------------------

In [229]:
# Regression target: the weight_pounds column.
labels = df['weight_pounds']
# Feature matrix: every remaining column (a new frame; df itself is not modified).
data = df.drop(columns=['weight_pounds'])

In [230]:
# Show the first five rows of the feature frame.
data.head(n=5)

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
5309,True,34,1,39
514,False,35,1,39
4359,False,24,1,38
8445,True,23,1,39
9104,False,37,3,34


In [231]:
# Cast each feature column to float before the data is split and passed to the model.
for column in ('is_male', 'mother_age', 'plurality', 'gestation_weeks'):
    data[column] = data[column].astype(float)

In [232]:
# Show the first five rows again to verify the float conversion.
data.head(n=5)

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
5309,1.0,34.0,1.0,39.0
514,0.0,35.0,1.0,39.0
4359,0.0,24.0,1.0,38.0
8445,1.0,23.0,1.0,39.0
9104,0.0,37.0,3.0,34.0


In [246]:
# Confirm the type conversion introduced no missing values.
data.isna().sum()

is_male            0
mother_age         0
plurality          0
gestation_weeks    0
dtype: int64

In [234]:
# Split features and labels into train and test subsets (default split proportions).
# random_state is pinned so the split, and therefore the score and the sample
# predictions reported below, stay the same across re-runs; 42 matches the seed
# already used when shuffling the rows.
x, y = data, labels
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [235]:
# Build the XGBoost regressor.
# NOTE(review): newer XGBoost releases reportedly deprecate 'reg:linear' in favour of
# 'reg:squarederror'; it is kept as-is because this notebook pins xgboost==0.82 — confirm
# before upgrading.
xgb_params = {'objective': 'reg:linear'}
model = xgb.XGBRegressor(**xgb_params)

In [236]:
# Train the regressor on the training features and labels.
model.fit(X=x_train, y=y_train)

XGBRegressor()

In [250]:
# Predict the target for every row of the held-out test features.
y_pred = model.predict(x_test)

In [251]:
# Evaluate the predictions on the test set with R^2 (coefficient of determination).
# This is a goodness-of-fit score, not a loss: higher is better.
from sklearn.metrics import r2_score
# The predictions are stored in y_pred; the name `pred` is never defined in this notebook.
r2_score(y_test, y_pred)

0.39298018764605847

In [252]:
# Print the first 20 test-set predictions next to the corresponding true values.
preview_rows = 20
for row in range(preview_rows):
    print('Predicted weight: ', y_pred[row])
    # y_test is indexed by position because its index labels were shuffled.
    print('Actual weight: ', y_test.iloc[row])
    print()

Predicted weight:  7.4452763
Actual weight:  7.936641432

Predicted weight:  7.4733458
Actual weight:  6.4992274837599995

Predicted weight:  7.5529876
Actual weight:  7.25761766504

Predicted weight:  7.4503307
Actual weight:  6.1244416383599996

Predicted weight:  7.453843
Actual weight:  8.2342654857

Predicted weight:  7.18921
Actual weight:  6.9996768185

Predicted weight:  7.5006804
Actual weight:  7.89695822484

Predicted weight:  7.2753935
Actual weight:  6.8122838958

Predicted weight:  7.7400694
Actual weight:  8.000575487979999

Predicted weight:  7.4503307
Actual weight:  7.24879917456

Predicted weight:  7.6954613
Actual weight:  9.25059651352

Predicted weight:  7.5742545
Actual weight:  8.16151293924

Predicted weight:  7.661717
Actual weight:  8.81849048

Predicted weight:  7.5742545
Actual weight:  10.56234697242

Predicted weight:  7.453843
Actual weight:  7.6500404913999995

Predicted weight:  7.462534
Actual weight:  8.6972362359

Predicted weight:  7.293844
Actual 

In [253]:
# Write the trained model to a local file named 'sachin.bst' in the working directory.
model.save_model('sachin.bst')

In [254]:
# Deployment
# Step 1: Create a Cloud Storage bucket for our model.

In [270]:
# Deployment settings — update these to your own GCP project, bucket, model, and version names.
# NOTE(review): 'My first Project' looks like a project display name; Google Cloud tools
# presumably expect the project ID (lowercase, no spaces) — confirm before using this value.
GCP_PROJECT = 'My first Project'
# Cloud Storage bucket URL; used by the gsutil commands that create the bucket and upload the model.
MODEL_BUCKET = 'gs://sachin-buckets'
VERSION_NAME = 'v1'
MODEL_NAME = 'sachin-model'

In [271]:
!gsutil mb $MODEL_BUCKET

6577.37s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Creating gs://jose-buckets/...
ServiceException: 409 A Cloud Storage bucket named 'jose-buckets' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [275]:
!gsutil cp ./sachin_model.bst $MODEL_BUCKET

6938.65s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Copying file://./jose_model.bst [Content-Type=application/octet-stream]...
/ [1 files][ 64.4 KiB/ 64.4 KiB]                                                
Operation completed over 1 objects/64.4 KiB.                                     
