# 02. Perform Data Discovery in Vantage using Python

It shows:

1. Load libraries and set up connection
2. Read raw data from Vantage
3. Explore the columns, number of rows
4. Determine the number of NUM, MIN values and MAX Values using SQL
5. Determine the number of NUM, MIN values and MAX Values using python

## 02.01. Load libraries and set up connection

In [11]:
# Load standard libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import getpass


In [2]:
# import statements
from teradataml.dataframe.dataframe import DataFrame
from teradataml.dataframe.copy_to import copy_to_sql
from teradataml.context.context import create_context, remove_context
from teradataml.options.display import display

In [3]:
host = '40.121.65.25'
#user = input("Username:")
#password = getpass.getpass("Password:")
user='pocuser'
password ='pocuser'
eng = create_context(host = host, username = user, password = password) # connects using goSQL driver
conn = eng.connect()
print(eng)
print(conn)

Engine(teradatasql://pocuser:***@40.121.65.25)
<sqlalchemy.engine.base.Connection object at 0x7fc474099f28>


## 02.02 Read the raw dataset

In [4]:
from teradataml.dataframe.dataframe import in_schema

In [5]:
df_all_data = DataFrame('titanic_all_data_0') 

In [6]:
df_all_data.head()

passengerid,survived,pclass,pname,gender,age,sibsp,parch,ticket,fare,cabin,embarked,set_type
3,1,3,Heikkinen Miss. Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
5,0,3,Allen Mr. William Henry,male,35.0,0,0,373450,8.05,,S,train
6,0,3,Moran Mr. James,male,,0,0,330877,8.4583,,Q,train
7,0,1,McCarthy Mr. Timothy J,male,54.0,0,0,17463,51.8625,E46,S,train
9,1,3,Johnson Mrs. Oscar W (Elisabeth Vilhelmina Berg),female,27.0,0,2,347742,11.1333,,S,train
10,1,2,Nasser Mrs. Nicholas (Adele Achem),female,14.0,1,0,237736,30.0708,,C,train
8,0,3,Palsson Master. Gosta Leonard,male,2.0,3,1,349909,21.075,,S,train
4,1,1,Futrelle Mrs. Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S,train
2,1,1,Cumings Mrs. John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C,train
1,0,3,Braund Mr. Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,train


## 02.03. Explore the columns, number of rows

#### View the column names

In [7]:
df_all_data.info()

<class 'teradataml.dataframe.dataframe.DataFrame'>
Data columns (total 13 columns):
passengerid      int
survived         int
pclass           int
pname            str
gender           str
age            float
sibsp            int
parch            int
ticket           str
fare           float
cabin            str
embarked         str
set_type         str
dtypes: str(6), float(2), int(5)


#### Number of rows

In [8]:
df_all_data.count()


count_passengerid,count_survived,count_pclass,count_pname,count_gender,count_age,count_sibsp,count_parch,count_ticket,count_fare,count_cabin,count_embarked,count_set_type
891,891,891,891,891,714,891,891,891,891,204,889,891


## 02.04. Determine the number of NUM, MIN values and MAX Values using SQL

### 02.04.01 NULL Script

In [9]:
# Create a dataframe from SQL Query
qry = "SELECT \
          set_type,\
          SUM(CASE WHEN passengerId IS NULL OR TRIM(passengerId) = '' THEN 1 ELSE 0 END) AS PassengerId_null,\
          SUM(CASE WHEN pclass IS NULL OR TRIM(pclass) = '' THEN 1 ELSE 0 END) AS Pclass_null,\
          SUM(CASE WHEN pname IS NULL OR TRIM(pname) = ''  THEN 1 ELSE 0 END) AS Name_null,\
          SUM(CASE WHEN gender IS NULL OR TRIM(gender) = ''  THEN 1 ELSE 0 END) AS Sex_null,\
          SUM(CASE WHEN age IS NULL OR TRIM(age) = ''  THEN 1 ELSE 0 END) AS Age_null,\
          SUM(CASE WHEN sibsp IS NULL OR TRIM(sibsp) = ''  THEN 1 ELSE 0 END) AS SibSp_null,\
          SUM(CASE WHEN parch IS NULL OR TRIM(parch) = ''  THEN 1 ELSE 0 END) AS Parch_null,\
          SUM(CASE WHEN ticket IS NULL OR TRIM(ticket) = ''  THEN 1 ELSE 0 END) AS Ticket_null,\
          SUM(CASE WHEN fare IS NULL OR TRIM(fare) = ''  THEN 1 ELSE 0 END) AS Fare_null,\
          SUM(CASE WHEN cabin IS NULL OR TRIM(cabin) = ''  THEN 1 ELSE 0 END) AS Cabin_null,\
          SUM(CASE WHEN embarked IS NULL OR TRIM(embarked) = ''  THEN 1 ELSE 0 END) AS Embarked_null,\
          SUM(CASE WHEN survived IS NULL THEN 1 ELSE 0 END) AS Survived_null\
      FROM titanic_all_data_0\
      GROUP BY 1;"

# Create a TeradataML DataFrame "
df_null = DataFrame.from_query(qry)
df_null

set_type,PassengerId_null,Pclass_null,Name_null,Sex_null,Age_null,SibSp_null,Parch_null,Ticket_null,Fare_null,Cabin_null,Embarked_null,Survived_null
train,0,0,0,0,177,0,0,0,0,687,2,0


In [10]:
# Create a TeradataML DataFrame "
df_null = df_null.to_pandas()

In [11]:
df_null

Unnamed: 0,set_type,PassengerId_null,Pclass_null,Name_null,Sex_null,Age_null,SibSp_null,Parch_null,Ticket_null,Fare_null,Cabin_null,Embarked_null,Survived_null
0,train,0,0,0,0,177,0,0,0,0,687,2,0


### 02.04.02 MAX Script

In [12]:
# Create a dataframe from SQL Query
qry = "SELECT \
            set_type,\
            MAX(passengerid) AS PassengerId_max,\
            MAX(pclass) AS Pclass_max,\
            MAX(pname) AS pname_max,\
            MAX(gender) AS Sex_max,\
            MAX(age) AS Age_max,\
            MAX(sibsp) AS SibSp_max,\
            MAX(parch) AS Parch_max,\
            MAX(ticket) AS Ticket_max,\
            MAX(fare) AS Fare_max,\
            MAX(cabin) AS Cabin_max,\
            MAX(embarked) AS Embarked_max,\
            MAX(survived) AS Survived_max\
        FROM titanic_all_data_0\
        GROUP BY 1; "

# Create a TeradataML DataFrame "
df_MAX = DataFrame.from_query(qry)

In [13]:
# Create a TeradataML DataFrame "
df_MAX = df_MAX.to_pandas()
df_MAX

Unnamed: 0,set_type,PassengerId_max,Pclass_max,pname_max,Sex_max,Age_max,SibSp_max,Parch_max,Ticket_max,Fare_max,Cabin_max,Embarked_max,Survived_max
0,train,891,3,Zimmerman Mr. Leo,male,80.0,8,6,WE/P 5735,512.3292,T,S,1


### 02.04.03 MIN Script

In [14]:
# Create a dataframe from SQL Query
qry = "SELECT \
          set_type,\
          MIN(passengerid) AS PassengerId_min,\
          MIN(pclass) AS Pclass_min,\
          MIN(pname) AS Name_min,\
          MIN(gender) AS Sex_min,\
          MIN(age) AS Age_min,\
          MIN(sibsp) AS SibSp_min,\
          MIN(parch) AS Parch_min,\
          MIN(ticket) AS Ticket_min,\
          MIN(fare) AS Fare_min,\
          MIN(cabin) AS Cabin_min,\
          MIN(embarked) AS Embarked_min,\
          MIN(survived) AS Survived_min\
      FROM titanic_all_data_0\
      GROUP BY 1;"

# Create a TeradataML DataFrame "
df_minimum = DataFrame.from_query(qry)

In [15]:
# Create a TeradataML DataFrame "
df_minimum = df_minimum.to_pandas()
df_minimum

Unnamed: 0,set_type,PassengerId_min,Pclass_min,Name_min,Sex_min,Age_min,SibSp_min,Parch_min,Ticket_min,Fare_min,Cabin_min,Embarked_min,Survived_min
0,train,1,1,Abbing Mr. Anthony,female,0.42,0,0,110152,0.0,A10,C,0


## 02.05. Determine the number of NUM, MIN values and MAX Values using Pandas

In [16]:
df_all_data.to_pandas().isnull().sum()

passengerid      0
survived         0
pclass           0
pname            0
gender           0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
set_type         0
dtype: int64

In [17]:
df_all_data.to_pandas().min()

passengerid                     1
survived                        0
pclass                          1
pname          Abbing Mr. Anthony
gender                     female
age                          0.42
sibsp                           0
parch                           0
ticket                     110152
fare                            0
set_type                    train
dtype: object

In [18]:
df_all_data.to_pandas().max()

passengerid                           891
survived                                1
pclass                                  3
pname          van Melkebeke Mr. Philemon
gender                               male
age                                    80
sibsp                                   8
parch                                   6
ticket                          WE/P 5735
fare                              512.329
set_type                            train
dtype: object