In [2]:
import boto3
import sagemaker

In [3]:
# Resolve the IAM execution role attached to this notebook environment; it is
# passed to the processing job and the estimator created further down.
from sagemaker import get_execution_role
role = get_execution_role()
# Display the resolved role ARN for confirmation.
role

'arn:aws:iam::257548842387:role/service-role/AmazonSageMaker-ExecutionRole-20230618T190858'

In [4]:
# AWS region of the default boto3 session (the notebook's own region).
region = boto3.session.Session().region_name
# Display the region for confirmation.
region

'ap-south-1'

In [5]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Processor used to run preprocessing.py as a single-instance SageMaker
# Processing job under the notebook's execution role.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_count=1,
    instance_type='ml.t3.medium',
)

In [6]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [7]:
import pandas as pd
# Load the raw car-claims dataset straight from S3 for a quick local inspection.
d = pd.read_csv("s3://preprocessoroutputs/carclaims.csv")

In [8]:
# Preview the first rows of the raw claims data.
d.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,No
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision,No
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability,No
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision,No


In [9]:
# Write a local copy of the raw data (pandas also writes the index as a column).
# NOTE(review): no other cell in this notebook reads "carclaim.csv" — confirm it is still needed.
d.to_csv("carclaim.csv")

In [10]:
input_data = "s3://preprocessoroutputs/carclaims.csv"

# The raw CSV is mounted into the processing container at this local path.
processing_inputs = [
    ProcessingInput(
        source=input_data,
        destination='/opt/ml/processing/input',
    ),
]

# Each container directory listed here is uploaded to its S3 destination.
processing_outputs = [
    ProcessingOutput(
        output_name='train_data',
        source='/opt/ml/processing/train',
        destination="s3://preprocessoroutputs/train",
    ),
    ProcessingOutput(
        output_name='test_data',
        source='/opt/ml/processing/test',
        destination="s3://preprocessoroutputs/test",
    ),
]

# Launch the preprocessing job; the split ratio is forwarded to the script
# as a command-line argument.
sklearn_processor.run(
    code='preprocessing.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=['--train-test-split-ratio', '0.3'],
)

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2023-06-23-07-04-44-289


  import imp[0m
  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'[0m
[34mrecieved args: Namespace(train_test_split_ratio=0.3)[0m
[34mMonth : {'Apr': 0, 'Aug': 1, 'Dec': 2, 'Feb': 3, 'Jan': 4, 'Jul': 5, 'Jun': 6, 'Mar': 7, 'May': 8, 'Nov': 9, 'Oct': 10, 'Sep': 11}[0m
[34mDayOfWeek : {'Friday': 0, 'Monday': 1, 'Saturday': 2, 'Sunday': 3, 'Thursday': 4, 'Tuesday': 5, 'Wednesday': 6}[0m
[34mMake : {'Accura': 0, 'BMW': 1, 'Chevrolet': 2, 'Dodge': 3, 'Ferrari': 4, 'Ford': 5, 'Honda': 6, 'Jaguar': 7, 'Lexus': 8, 'Mazda': 9, 'Mecedes': 10, 'Mercury': 11, 'Nisson': 12, 'Pontiac': 13, 'Porche': 14, 'Saab': 15, 'Saturn': 16, 'Toyota': 17, 'VW': 18}[0m
[34mAccidentArea : {'Rural': 0, 'Urban': 1}[0m
[34mDayOfWeekClaimed : {'Friday': 0, 'Monday': 1, 'Saturday': 2, 'Sunday': 3, 'Thursday': 4, 'Tuesday': 5, 'Wednesday': 6}[0m
[34mMonthClaimed : {'Apr': 0, 'Aug': 1, 'Dec': 2, 'Feb': 3, 'Jan': 4, 'Jul': 5, 'Jun': 6, 'Mar': 7, 'May': 8, 'Nov': 9, 'Oct': 10, 'Sep': 11}[0m


In [11]:
# Fetch the description of the most recent job launched by this processor.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

In [12]:
# Inspect the full job description (inputs, outputs, configuration).
preprocessing_job_description

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://preprocessoroutputs/carclaims.csv',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-ap-south-1-257548842387/sagemaker-scikit-learn-2023-06-23-07-04-44-289/input/code/preprocessing.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'train_data',
    'S3Output': {'S3Uri': 's3://preprocessoroutputs/train',
     'LocalPath': '/opt/ml/processing/train',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False},
   {'OutputName': 'test_data',
    'S3Output': {'S3Uri': 's3:

In [13]:
# Keep only the output section of the job description; it lists each named
# output together with its S3 destination.
output_config = preprocessing_job_description["ProcessingOutputConfig"]

In [14]:
# Inspect the declared outputs.
output_config

{'Outputs': [{'OutputName': 'train_data',
   'S3Output': {'S3Uri': 's3://preprocessoroutputs/train',
    'LocalPath': '/opt/ml/processing/train',
    'S3UploadMode': 'EndOfJob'},
   'AppManaged': False},
  {'OutputName': 'test_data',
   'S3Output': {'S3Uri': 's3://preprocessoroutputs/test',
    'LocalPath': '/opt/ml/processing/test',
    'S3UploadMode': 'EndOfJob'},
   'AppManaged': False}]}

In [15]:
# Index the job's declared outputs by name so each S3 URI can be looked up
# directly. A missing output name now raises KeyError here instead of leaving
# the variables undefined (or stale from a previous run) for later cells.
s3_uri_by_output_name = {
    job_output['OutputName']: job_output['S3Output']['S3Uri']
    for job_output in output_config['Outputs']
}
preprocessed_training_data = s3_uri_by_output_name['train_data']
preprocessed_test_data = s3_uri_by_output_name['test_data']

In [16]:
# S3 prefix of the job's 'train_data' output.
preprocessed_training_data

's3://preprocessoroutputs/train'

In [17]:
# S3 prefix of the job's 'test_data' output.
preprocessed_test_data

's3://preprocessoroutputs/test'

In [18]:
# Load the preprocessed training features (headerless CSV, so columns are 0..N-1).
x = pd.read_csv('s3://preprocessoroutputs/train/Xtrain.csv', header=None)

# Load the matching training labels.
# NOTE(review): ytrain.csv is read from the 'test/' prefix; presumably
# preprocessing.py writes the X files under train/ and the y files under test/ — confirm.
y = pd.read_csv('s3://preprocessoroutputs/test/ytrain.csv',header=None)


In [19]:
# Preview the encoded training features.
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,1,4,5,0,1,6,1,4,0,1,...,5,3,0,0,0,0,2,1,1994,2
1,5,1,1,9,1,0,5,5,1,1,...,5,5,0,0,0,3,3,0,1994,1
2,5,3,5,13,1,5,5,3,1,1,...,6,5,0,0,0,2,3,0,1995,0
3,10,1,5,13,1,4,10,1,1,1,...,3,3,0,0,0,3,3,0,1994,1
4,8,1,6,6,1,6,8,1,1,2,...,5,4,0,0,0,2,3,0,1995,1


In [20]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 20 features with the highest ANOVA F-score against the label.
selector = SelectKBest(f_classif, k=20)
# `y` is a single-column DataFrame; flatten it to a 1-D array so scikit-learn
# does not emit a DataConversionWarning about a column-vector target.
selector.fit(x, y.values.ravel())
# Show which column labels were selected.
x.columns[selector.get_support()]

  y = column_or_1d(y, warn=True)


Int64Index([0, 2, 3, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 20, 21, 22, 25, 27,
            29, 30],
           dtype='int64')

In [21]:
# Reduce the training features to the columns chosen by the fitted selector.
x_new = x.loc[:, selector.get_support()]

In [22]:
# Feature matrix dimensions before selection.
x.shape

(10570, 31)

In [23]:
# Feature matrix dimensions after selection.
x_new.shape

(10570, 20)

In [24]:
# Assemble the training table: label column first, followed by the selected features.
# NOTE(review): the label-first layout is presumably what the SageMaker XGBoost
# CSV input expects — confirm against the algorithm documentation.
overall = pd.concat([y,x_new], axis=1)

In [25]:
# Preview the assembled training table.
overall.head()

Unnamed: 0,0,0.1,2,3,4,6,8,10,11,12,...,14,15,16,20,21,22,25,27,29,30
0,0,1,5,0,1,1,0,-1.194267,0,2,...,1,4,400,3,5,3,0,2,1994,2
1,0,5,1,9,1,5,1,0.836753,0,1,...,0,5,400,3,5,5,0,3,1994,1
2,0,5,5,13,1,5,1,1.161717,1,0,...,0,2,400,1,6,5,0,3,1995,0
3,0,10,5,13,1,10,1,-0.625581,1,1,...,0,8,400,3,3,3,0,3,1994,1
4,0,8,6,6,1,8,1,0.105586,0,1,...,0,3,400,2,5,4,0,3,1995,1


In [26]:
# Upload the training table to S3 without a header row or index column.
overall.to_csv("s3://preprocessoroutputs/overall.csv",header=False, index=False)

In [27]:
# Read the uploaded file back to confirm what was written to S3.
ov = pd.read_csv("s3://preprocessoroutputs/overall.csv", header=None)
ov.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,1,5,0,1,1,0,-1.194267,0,2,...,1,4,400,3,5,3,0,2,1994,2
1,0,5,1,9,1,5,1,0.836753,0,1,...,0,5,400,3,5,5,0,3,1994,1
2,0,5,5,13,1,5,1,1.161717,1,0,...,0,2,400,1,6,5,0,3,1995,0
3,0,10,5,13,1,10,1,-0.625581,1,1,...,0,8,400,3,3,3,0,3,1994,1
4,0,8,6,6,1,8,1,0.105586,0,1,...,0,3,400,2,5,4,0,3,1995,1


In [28]:
from sagemaker.amazon.amazon_estimator import get_image_uri

In [29]:
# SageMaker session shared by the estimator and the predictor below.
sess = sagemaker.Session()
sess

<sagemaker.session.Session at 0x7fb744268b50>

In [30]:
# Look up the built-in XGBoost 1.0-1 container image for the current region.
# `sagemaker.image_uris.retrieve` is the SDK v2 replacement for the deprecated
# `get_image_uri` helper.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.0-1",
)

# Generic estimator wrapping the XGBoost container. SDK v2 renamed
# `train_instance_count` / `train_instance_type` to `instance_count` /
# `instance_type`; the old names only work through a deprecation shim.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://preprocessoroutputs/vehicle/model/",
    sagemaker_session=sess,
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [31]:
# Training hyperparameters for the XGBoost job, collected in one mapping so
# they are easy to inspect and tune before being applied to the estimator.
xgb_hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "binary:logistic",
    "num_round": "50",
    "scale_pos_weight": "15.7",
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [32]:
from sagemaker.inputs import TrainingInput

# Training channel definition: the headerless CSV uploaded to S3 earlier.
train_input = TrainingInput(
    s3_data="s3://preprocessoroutputs/overall.csv",
    content_type='csv',
)

In [33]:
# Launch the training job, feeding the CSV through the 'train' channel.
xgb.fit({'train': train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-06-23-07-22-27-067


2023-06-23 07:22:27 Starting - Starting the training job...
2023-06-23 07:22:47 Starting - Preparing the instances for training.........
2023-06-23 07:24:23 Downloading - Downloading input data
2023-06-23 07:24:23 Training - Downloading the training image......
2023-06-23 07:25:14 Training - Training image download completed. Training in progress...[34m[2023-06-23 07:25:30.480 ip-10-0-142-217.ap-south-1.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV i

In [35]:
endpoint_name = 'insurancefraud-endpoint-1a'

# Host the trained model behind a real-time endpoint with a fixed name so it
# can be re-attached to by name later in the notebook.
vehicle_predictor = xgb.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-06-23-07-26-56-826
INFO:sagemaker:Creating endpoint-config with name insurancefraud-endpoint-1a
INFO:sagemaker:Creating endpoint with name insurancefraud-endpoint-1a


-----!

In [36]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Attach a predictor to the deployed endpoint by name.
# `Predictor` is the SDK v2 name for the deprecated `RealTimePredictor`, and
# the request content type is now determined by the serializer, so the
# deprecated `content_type` argument is no longer passed.
vehicle_predictor = Predictor(
    endpoint_name,
    sagemaker_session=sess,
    serializer=CSVSerializer(),        # rows are sent as text/csv
    deserializer=JSONDeserializer(),   # response body is parsed with json
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [37]:
# Load the preprocessed test features (headerless CSV).
# NOTE(review): Xtest.csv is read from the 'train/' prefix; presumably
# preprocessing.py writes both X files there — confirm.
test_vectors = pd.read_csv("s3://preprocessoroutputs/train/Xtest.csv", header=None)

In [38]:
# Keep only the columns the selector chose on the training set, so the test
# rows have the same feature layout the model was trained on.
test_vectors = test_vectors.loc[:, selector.get_support()]

In [39]:
# Preview the reduced test features.
test_vectors.head()

Unnamed: 0,0,2,3,4,6,8,10,11,12,13,14,15,16,20,21,22,25,27,29,30
0,2,3,13,0,2,1,0.105586,1,1,0,0,4,400,0,3,4,0,3,1996,1
1,4,0,5,1,7,1,0.186827,0,6,2,5,15,400,0,5,4,0,3,1996,0
2,10,3,13,1,10,1,0.593031,0,2,1,1,7,400,1,6,5,0,3,1996,2
3,9,6,6,1,9,1,1.080476,0,2,1,0,7,400,1,6,5,0,2,1994,2
4,0,5,6,1,0,1,-1.031785,0,2,1,0,1,400,2,5,3,0,3,1994,2


In [40]:
# Switch from a DataFrame to the underlying NumPy array so rows can be indexed
# positionally. Note this rebinds `test_vectors`, so the earlier DataFrame
# cells cannot be re-run after this one without reloading.
test_vectors = test_vectors.values

In [41]:
# Inspect a single test row.
test_vectors[2]

array([1.00000000e+01, 3.00000000e+00, 1.30000000e+01, 1.00000000e+00,
       1.00000000e+01, 1.00000000e+00, 5.93030986e-01, 0.00000000e+00,
       2.00000000e+00, 1.00000000e+00, 1.00000000e+00, 7.00000000e+00,
       4.00000000e+02, 1.00000000e+00, 6.00000000e+00, 5.00000000e+00,
       0.00000000e+00, 3.00000000e+00, 1.99600000e+03, 2.00000000e+00])

In [42]:
# Smoke-test the endpoint with one row; the response is a single score.
print(vehicle_predictor.predict(test_vectors[2]))

0.051483672112226486


In [43]:
# Number of test rows and selected features.
test_vectors.shape

(4530, 20)

In [44]:
import numpy as np

# Score every test row against the endpoint and threshold the returned score
# at 0.5 to obtain a hard 0/1 class label.
# Iterating over the rows themselves (instead of a hard-coded range(4530))
# keeps this cell correct when the test set size changes, e.g. after a
# different train/test split ratio.
predictions = np.array([
    1 if vehicle_predictor.predict(row) >= 0.5 else 0
    for row in test_vectors
])


In [45]:
# Inspect the predicted class labels.
predictions

array([0, 0, 0, ..., 0, 1, 0])

In [46]:
# Wrap the label array in a single-column DataFrame named "pred".
# Note this rebinds `predictions` from a NumPy array to a DataFrame.
predictions = pd.DataFrame(predictions, columns=["pred"])

In [47]:
# Count how many rows were predicted as each class.
predictions.pred.value_counts()

0    3504
1    1026
Name: pred, dtype: int64

In [48]:
# Load the ground-truth labels for the test rows (headerless CSV).
original = pd.read_csv("s3://preprocessoroutputs/test/ytest.csv", header=None)

In [49]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Share of test rows whose predicted label matches the true label.
score = accuracy_score(original, predictions)
# Display the accuracy as a percentage.
score*100

78.78587196467991

In [None]:
# Confusion matrix of true labels (first argument) against predicted labels.
matrix = confusion_matrix(original, predictions)
matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Render the confusion matrix as an annotated heatmap.
# The cells are integer counts, so they are formatted with "d" rather than
# ".1f" (which displayed them as floats such as "3504.0").
fig, ax = plt.subplots()
sns.heatmap(matrix, annot=True, fmt="d", ax=ax)
# scikit-learn's confusion_matrix puts true labels on rows and predictions on columns.
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix (test set)")
plt.show()

In [None]:
# Tear down the hosted endpoint to stop incurring charges.
# Predictor.delete_endpoint() replaces the deprecated `.endpoint` attribute
# lookup and, by default, also removes the endpoint configuration, so the
# deploy cell can be re-run with the same endpoint name.
vehicle_predictor.delete_endpoint()