In [None]:
import os, pandas as pd
# Read the Titanic training CSV from the project's datasets folder
# (project root is taken from the DSX_PROJECT_DIR environment variable).
project_dir = os.environ['DSX_PROJECT_DIR']
df_data_1 = pd.read_csv(project_dir + '/datasets/Titanictrain.csv')

# Preview the first rows of the raw frame.
df_data_1.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 

In [None]:
# Build the modelling frame from the raw training data.
#
# 1. One-hot encode the categorical columns Pclass, Embarked and Sex.
# 2. Drop Sex_female (redundant next to Sex_male) and the columns that are
#    not used as features.
# 3. Impute missing numeric values with each column's OWN median.
#    The previous version filled every NaN in the whole frame with the Age
#    median, so a missing value in any other column would silently have been
#    replaced by an age.
df_train = pd.get_dummies(df_data_1, columns=["Pclass", "Embarked", "Sex"])
df_train = df_train.drop(
    ["Sex_female", "PassengerId", "Name", "Ticket", "Cabin"], axis=1
)

# A Series passed to fillna is matched on column name, so each column is
# filled with its own median and all other columns are left untouched.
numeric_medians = df_train[["Age", "SibSp", "Parch", "Fare"]].median(skipna=True)
df_train = df_train.fillna(numeric_medians)

df_train.head()

In [None]:
# Columns fed to the model: numeric fields plus the one-hot encoded categories.
feature_columns = [
    "Age", "SibSp", "Parch", "Fare",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Embarked_C", "Embarked_Q", "Embarked_S",
    "Sex_male",
]
X = df_train[feature_columns]

# Target kept as a one-column DataFrame (not a Series or list).
y = df_train['Survived'].to_frame()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2
)

In [None]:
# Fit a logistic-regression classifier on the training split and score the
# hold-out split.
logreg = LogisticRegression()

# y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for column vectors, so flatten it first.
logreg.fit(X_train, y_train.values.ravel())

# Hard class predictions and the predicted probability of the positive class
# (column 1 of predict_proba).
y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

# ROC curve points (false-positive rate, true-positive rate, thresholds).
fpr, tpr, thr = roc_curve(y_test, y_pred_proba)

In [None]:
# Report hold-out metrics, each line prefixed with the estimator's class name.
model_name = type(logreg).__name__
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_log_loss = log_loss(y_test, y_pred_proba)
holdout_auc = auc(fpr, tpr)

print('Train/Test split results:')
print(model_name + " accuracy is %2.3f" % holdout_accuracy)
print(model_name + " log_loss is %2.3f" % holdout_log_loss)
print(model_name + " auc is %2.3f" % holdout_auc)

In [None]:
from dsx_ml.ml import save
# Save the fitted model through dsx_ml together with the evaluation split,
# then print the response returned by save().
# NOTE(review): algorithm_type is 'Regression' although logreg is a
# LogisticRegression classifier - confirm which value dsx_ml expects here.
model_metadata = {
    'name': 'GurvinderScikitRegression',
    'algorithm_type': 'Regression',
    'source': 'GurvinderTitanicNotebook-Copy1.ipynb',
    'description': 'This is my first model for Titanic',
}
save_model_resp = save(
    model=logreg,
    x_test=X_test,
    y_test=y_test,
    **model_metadata
)
print(save_model_resp)

## Test Local Saved Model

In [None]:
from sklearn.externals import joblib

# Score the blind dataset with the model file saved under the project's
# models folder, then write the predictions back to the datasets folder.
project_dir = os.environ['DSX_PROJECT_DIR']

feature_columns = [
    "Age", "SibSp", "Parch", "Fare",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Embarked_C", "Embarked_Q", "Embarked_S",
    "Sex_male",
]
numeric_columns = ["Age", "SibSp", "Parch", "Fare"]

# Apply the same preparation as the training frame: one-hot encode, drop the
# unused columns, then impute.
df_test = pd.read_csv(project_dir + '/datasets/Titanictrain -- Blinddataset.csv')
df_test = pd.get_dummies(df_test, columns=["Pclass", "Embarked", "Sex"])
df_test = df_test.drop(
    ["Sex_female", "PassengerId", "Name", "Ticket", "Cabin"], axis=1
)

# Impute each numeric column with the matching TRAINING median. The previous
# version filled every NaN in the frame with the training Age median, which
# would have put an age into e.g. a missing Fare.
df_test = df_test.fillna(df_train[numeric_columns].median(skipna=True))

# NOTE: this rebinds X_test (previously the hold-out split) to the blind-set
# features; the scoring-endpoint cell below reads this X_test.
X_test = df_test[feature_columns]

# The model version ("4") is hard-coded in the path.
model_path = project_dir + "/models/GurvinderScikitRegression/4/model"

# Load the model from disk; the context manager closes the file handle, which
# the previous bare open() never did. joblib files are pickle-based, so only
# load model files from a trusted location.
with open(model_path, 'rb') as model_file:
    loaded_model = joblib.load(model_file)

# Predictions for every blind-set row.
scoring_result = loaded_model.predict(X_test)

# Attach the predictions (joined on the default row index) and persist them.
df_results = df_test.join(pd.DataFrame({'prediction': scoring_result}))
print(df_results.head())
df_results.to_csv(project_dir + '/datasets/gurvsin3_test_results.csv', index=False)

## Test Using the Scoring Endpoint

In [None]:
import requests, json, os
import numpy as np

# POST the feature rows to the scoring endpoint returned by save() and print
# the raw and decoded responses.
# (Removed the unused X_json serialization and the commented-out payload
# variants.)

# Payload format: {'values': [[row 1 features], [row 2 features], ...]}
to_predict_arr = np.asarray(X_test)
json_payload = {'values': to_predict_arr.tolist()}

# The authorization value is read from the DSX_TOKEN environment variable.
header_online = {'Content-Type': 'application/json', 'Authorization': os.environ['DSX_TOKEN']}

# requests has no default timeout; without one an unresponsive endpoint would
# block this cell indefinitely.
response_scoring = requests.post(
    save_model_resp["scoring_endpoint"],
    json=json_payload,
    headers=header_online,
    timeout=60,
)
print(response_scoring)
print("\n\n******************\n\n")
print(response_scoring.content)
print("\n\n******************\n\n")
resp = response_scoring.json()
print(resp)
     

In [None]:

import dsx_core_utils, requests, jaydebeapi, os, io, sys
from pyspark.sql import SparkSession
import pandas as pd
# Read the whole 'submissions' remote data set into a pandas DataFrame over
# JDBC, using the connection details registered for it in the project.
df1 = None
dataSet = dsx_core_utils.get_remote_data_set_info('submissions')
dataSource = dsx_core_utils.get_data_source_info(dataSet['datasource'])

# Qualify the table with its schema only when a non-blank schema is configured.
table_name = (dataSet['schema'] + '.' if (len(dataSet['schema'].strip()) != 0) else '') + dataSet['table']

# The connect() argument layout differs between the Python 2 and Python 3
# branches, so both are kept as in the original.
if (sys.version_info >= (3, 0)):
    conn = jaydebeapi.connect(dataSource['driver_class'], dataSource['URL'], [dataSource['user'], dataSource['password']])
else:
    conn = jaydebeapi.connect(dataSource['driver_class'], [dataSource['URL'], dataSource['user'], dataSource['password']])

query = 'select * from ' + table_name
try:
    df1 = pd.read_sql(query, con=conn)
finally:
    # Always release the JDBC connection, even if the query fails; the
    # previous version left it open.
    conn.close()

df1.head()



In [None]:
# Shell escape: run scripts/submission.py from the project directory with
# --username gurvsin3. What the script does is not visible from this notebook.
!python $DSX_PROJECT_DIR/scripts/submission.py --username gurvsin3