# Introducing Pandas Objects

At the very basic level, Pandas objects can be thought of as enhanced versions of
NumPy structured arrays in which the rows and columns are identified with labels
rather than simple integer indices. 

In [1]:
import numpy as np
import pandas as pd

## The Pandas Series Object


In [2]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
# values
data.values


array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
# indexes
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Build a Series from a plain Python list, labelling each value with an explicit string index.
data = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
data


a    1
b    2
c    3
d    4
dtype: int64

In [6]:
data['a']

1

### Series as specialized dictionary


In [7]:
# State populations keyed by state name; the dict keys become the Series index.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)
population


California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [8]:
population['California']


38332521

In [9]:
# Slicing by label: unlike positional slices, the end label is included,
# so the result runs from 'California' through 'Illinois'.
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

### Constructing Series objects

In [10]:
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [11]:
# Data can be a scalar, which is repeated to fill the specified index
pd.Series(5, index=[100, 200, 300])


100    5
200    5
300    5
dtype: int64

In [12]:
# Data can be a dictionary, in which case the index is taken from the dictionary keys.
# Note that the keys keep the order in which they were given (2, 1, 3 in the output); they are not sorted.
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

## The Pandas DataFrame Object


### DataFrame as a generalized NumPy array



In [13]:
# Raw per-state figures; both dicts use the same state names as keys.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

# Wrap each dict in a Series, then assemble a DataFrame with one column per Series.
populationSeries = pd.Series(data=population_dict)
areaSeries = pd.Series(data=area_dict)

statesDF = pd.DataFrame(dict(population=populationSeries, area=areaSeries))
statesDF

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [14]:
# Indexes 

statesDF.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [15]:
# Columns (column Indexes)

statesDF.columns

Index(['population', 'area'], dtype='object')

In [16]:
# all Axes 

statesDF.axes

[Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object'),
 Index(['population', 'area'], dtype='object')]

### DataFrame as specialized dictionary


In [17]:
# Dictionary-style access: indexing the DataFrame by a column name returns that column as a Series.
areaSeries: pd.Series = statesDF['area']
areaSeries


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### Constructing DataFrame objects

A Pandas DataFrame can be constructed in a variety of ways. Here we’ll give several
examples.

In [18]:
pd.DataFrame(populationSeries, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [19]:
pd.DataFrame({'population': populationSeries,'area': areaSeries})


Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [20]:
# Build a DataFrame from a 3x2 array of random values, with explicit row and column labels.
pd.DataFrame(
    data=np.random.rand(3, 2),
    columns=['col 1', 'col 2'],
    index=['a', 'b', 'c'],
)


Unnamed: 0,col 1,col 2
a,0.940456,0.149817
b,0.659589,0.215892
c,0.387401,0.082403


## The Pandas Index Object
We have seen here that both the Series and DataFrame objects contain an explicit
index that lets you reference and modify data. This Index object is an interesting
structure in itself, and it can be thought of either as an immutable array or as an
ordered set (technically a multiset, as Index objects may contain repeated values).
Those views have some interesting consequences in the operations available on Index
objects. As a simple example, let’s construct an Index from a list of integers:

In [21]:
ind = pd.Index([2, 3, 5, 7, 11])
ind # One difference between Index objects and NumPy arrays is that indices are immutable


Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [22]:
## Index as ordered set


In [23]:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])


In [24]:
# Set intersection: the labels present in both indexes.
# The explicit method is used instead of `&`, which newer pandas versions treat as an
# element-wise logical/bitwise operation rather than a set operation.
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [25]:
# Set union: every label that appears in either index.
# The explicit method is used instead of `|`, which newer pandas versions treat as an
# element-wise logical/bitwise operation rather than a set operation.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [26]:
# Symmetric difference: the labels that appear in exactly one of the two indexes.
# The explicit method is used instead of `^`, which newer pandas versions treat as an
# element-wise logical/bitwise operation rather than a set operation.
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

## Data Indexing and Selection


In [27]:
# Land area per state.
area = pd.Series(
    {
        'California': 423967,
        'Texas': 695662,
        'New York': 141297,
        'Florida': 170312,
        'Illinois': 149995,
    }
)

# Population per state, keyed by the same state names.
pop = pd.Series(
    {
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
        'Florida': 19552860,
        'Illinois': 12882135,
    }
)

# Combine the two Series into one table, one column per Series.
data = pd.DataFrame(dict(area=area, pop=pop))
data


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [28]:
# The individual Series that make up the columns of the DataFrame can be accessed
# via dictionary-style indexing of the column name

data['area']


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [29]:
# Attribute-style access to a column. This does not work for column names containing
# spaces, or for names that conflict with existing DataFrame attributes or methods.
data.area


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [30]:
data.area is data['area']

True

In [31]:
# DataFrame has a pop() method, so `data.pop` refers to that method rather than
# to the 'pop' column; the identity check therefore evaluates to False.
data.pop is data['pop']


False

In [32]:
# adding new column 

data['density'] = data['pop'] / data['area']
data



Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [33]:
# Another option: insert() adds the new column at a given position
# (here position 2, i.e. as the third column) and modifies `data` in place.
data.insert(2,'density_insert' , value = data['pop'] / data['area'])
data

Unnamed: 0,area,pop,density_insert,density
California,423967,38332521,90.413926,90.413926
Texas,695662,26448193,38.01874,38.01874
New York,141297,19651127,139.076746,139.076746
Florida,170312,19552860,114.806121,114.806121
Illinois,149995,12882135,85.883763,85.883763


### DataFrame as two-dimensional array


In [34]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01, 8.58837628e+01]])

In [35]:
# Transpose 
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density_insert,90.41393,38.01874,139.0767,114.8061,85.88376
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [36]:
data.values[0] #0 row

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01, 9.04139261e+01])

In [37]:
data.values[0][1] #0 row, #1 col

38332521.0

In [38]:
data['area'] # series!


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### loc and iloc indexers (the older ix indexer has been removed from pandas)

In [39]:
# row, col
data.iloc[0:3, 0:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [40]:
data.loc[:'Illinois', :'pop']


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [41]:
data.loc[data.density > 100, ['pop', 'density']]




Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [42]:
# Any of these indexing conventions may also be used to set or modify values

data.iloc[0, 2] = 90
data


Unnamed: 0,area,pop,density_insert,density
California,423967,38332521,90.0,90.413926
Texas,695662,26448193,38.01874,38.01874
New York,141297,19651127,139.076746,139.076746
Florida,170312,19552860,114.806121,114.806121
Illinois,149995,12882135,85.883763,85.883763


### Additional indexing conventions


In [43]:
data['Florida':'Illinois']


Unnamed: 0,area,pop,density_insert,density
Florida,170312,19552860,114.806121,114.806121
Illinois,149995,12882135,85.883763,85.883763


In [44]:
data[1:3]

Unnamed: 0,area,pop,density_insert,density
Texas,695662,26448193,38.01874,38.01874
New York,141297,19651127,139.076746,139.076746


In [45]:
data[data.density > 100]


Unnamed: 0,area,pop,density_insert,density
New York,141297,19651127,139.076746,139.076746
Florida,170312,19552860,114.806121,114.806121


## Operating on Data in Pandas

### UFuncs: Index Alignment

For binary operations on two Series or DataFrame objects, Pandas will align indices
in the process of performing the operation. This is very convenient when you are
working with incomplete data.


In [46]:
# Two Series whose state labels only partly overlap.
area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    {'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)

# Division aligns the operands on their labels. The result's index is the union of the
# two input indexes (which could be computed with area.index.union(population.index));
# labels present in only one of the Series come out as NaN.
population / area



Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [47]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [48]:
# To avoid NaN in the result, use add() with fill_value: a label missing from one
# operand is treated as 0 there, so every label gets a numeric result.
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame


In [49]:
A = pd.DataFrame(np.random.randint(0, 20, (2, 2)),columns=list('AB'))
A

Unnamed: 0,A,B
0,6,18
1,12,14


In [50]:
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)),columns=list('BAC'))
B


Unnamed: 0,B,A,C
0,7,8,4
1,0,2,1
2,0,7,6


In [51]:
A + B

Unnamed: 0,A,B,C
0,14.0,25.0,
1,14.0,14.0,
2,,,


### Ufuncs: Operations Between DataFrame and Series


In [52]:
A = np.random.randint(10, size=(3, 4))
A

array([[3, 6, 1, 4],
       [0, 7, 2, 1],
       [8, 2, 2, 9]])

In [53]:
A - A[0]

array([[ 0,  0,  0,  0],
       [-3,  1,  1, -3],
       [ 5, -4,  1,  5]])

In [54]:
# In Pandas, the convention similarly operates row-wise by default:
df = pd.DataFrame(A, columns=list('QRST'))
df

Unnamed: 0,Q,R,S,T
0,3,6,1,4
1,0,7,2,1
2,8,2,2,9


In [55]:
print(f'  df.iloc[0] -> \n{df.iloc[0]}'  )
df - df.iloc[0]


  df.iloc[0] -> 
Q    3
R    6
S    1
T    4
Name: 0, dtype: int64


Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-3,1,1,-3
2,5,-4,1,5


## Handling Missing Data


### **None**: Pythonic missing data
The first sentinel value used by Pandas is None, a Python singleton object that is often
used for missing data in Python code. Because None is a Python object, it cannot be
used in any arbitrary NumPy/Pandas array, but only in arrays with data type
'object' (i.e., arrays of Python objects):

In [56]:
vals1 = np.array([1, None, 3, 4])
vals1


array([1, None, 3, 4], dtype=object)

In [57]:
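# Aggregating an object array that contains None raises an error
# (adding an int and None is not defined), so the call is left commented out:
# vals1.sum()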


### **NaN**: Missing numerical data
The other missing data representation, NaN (acronym for Not a Number), is different;
it is a special floating-point value recognized by all systems that use the standard
IEEE floating-point representation

In [58]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [59]:
# Any arithmetic operation involving NaN results in NaN, so these aggregates are all nan
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [60]:
# Therefore special NaN safe functions
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

### NaN and None in Pandas
NaN and None both have their place, and Pandas is built to handle the two of them
nearly interchangeably, converting between them where appropriate:


In [61]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [62]:
x = pd.Series(range(2), dtype=int)
x

0    0
1    1
dtype: int64

In [63]:
x[0] = None
x

0    NaN
1    1.0
dtype: float64

## Operating on Null Values


In [64]:
# isnull()
# Generate a Boolean mask indicating missing values

# notnull()
# Opposite of isnull()

# dropna()
# Return a filtered version of the data

# fillna()
# Return a copy of the data with missing values filled or imputed

In [65]:
data = pd.Series([1, np.nan, 'hello', None])


In [66]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [67]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [68]:
# Returns a new Series with the null entries (both NaN and None) removed
data.dropna() 


0        1
2    hello
dtype: object

In [69]:
df = pd.DataFrame([[1, np.nan, 2],[2, 3, 5],[np.nan, 4, 6]])
df


Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [70]:
# By default, dropna() will drop all rows in which any null value is present:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [71]:
# The default is how='any', such that any row or column (depending on the axis keyword)
# containing a null value will be dropped. You can also specify how='all', which
# will only drop rows/columns that are all null values:

df[3] = np.nan # Create a new column labelled 3 that is filled entirely with NaN values
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [72]:
# Drop along axis 1 or 'column', only if ALL elements are NaN
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [73]:
df.iloc[1,1] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,,5,
2,,4.0,6,


In [74]:
# thresh=2 keeps only the rows that contain at least 2 non-null values; all other rows are dropped.
df.dropna(axis = 'rows', thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,,5,
2,,4.0,6,


In [75]:
# thresh=2 keeps only the columns that contain at least 2 non-null values; all other columns are dropped.
df.dropna(axis = 'columns', thresh=2)

Unnamed: 0,0,2
0,1.0,2
1,2.0,5
2,,6


In [76]:
# Filling null values
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [77]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [78]:
# Forward-fill: propagate the previous valid value forward into each gap.
# ffill() is used instead of fillna(method='ffill'), which newer pandas versions deprecate.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [79]:
# Back-fill: propagate the next valid value backward into each gap.
# bfill() is used instead of fillna(method='bfill'), which newer pandas versions deprecate.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [80]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,,5,
2,,4.0,6,


In [81]:
# Forward-fill along axis 1, i.e. across the columns of each row; a gap with no earlier
# valid value in its row stays NaN.
# ffill() is used instead of fillna(method='ffill'), which newer pandas versions deprecate.
df.ffill(axis=1)


Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,2.0,5.0,5.0
2,,4.0,6.0,6.0


## Hierarchical Indexing


### A Multiply Indexed Series



#### The bad way


In [82]:
# Naive approach: use plain (state, year) tuples as the index labels.
index = [
    (state, year)
    for state in ('California', 'New York', 'Texas')
    for year in (2000, 2010)
]
populations = [
    33871648, 37253956,  # California: 2000, 2010
    18976457, 19378102,  # New York:   2000, 2010
    20851820, 25145561,  # Texas:      2000, 2010
]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [83]:
# Selecting a row

pop[('California',2000)]

33871648

In [84]:
#Slice a row
pop[('California',2000) : ('Texas',2000)]

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [85]:
# Suppose we want all records from 2010: with tuple labels we have to scan the index by hand.

pop[[i for i in pop.index if i[1] == 2010]]

# This kind of operation is needlessly complex without a proper MultiIndex.

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

#### The better way: Pandas MultiIndex


In [86]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [87]:
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [88]:
# With a MultiIndex, the same "all records from 2010" selection becomes a simple
# partial index on the second level -- no manual loop over the labels is needed.
pop[:, 2010]


California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [89]:
pop['Texas']

2000    20851820
2010    25145561
dtype: int64

In [90]:
# WE can think something to do like this :
#### pop['California' : 'Texas',:]
# But this will generate error 

In [91]:
# To solve this ,we have IndexSlice

idx = pd.IndexSlice
pop.loc[idx['California' : 'Texas',2010]]

California  2010    37253956
New York    2010    19378102
Texas       2010    25145561
dtype: int64

### MultiIndex as extra dimension


In [92]:
pop_df = pop.unstack()
pop_df

# unstack() moves the innermost index level (the default) to the column index.
# Here level 1 is shifted to the columns, turning the Series into a DataFrame.


Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [93]:
# Therefore, Stack will again bring columns to index one by one 
pop_df.stack()


California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

### MultiIndex level names


In [94]:
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

### MultiIndex for columns


In [95]:
# Hierarchical labels for both axes: rows are (year, visit), columns are (subject, type).
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock some readings: every other column (the 'HR' ones) is scaled by 10,
# then all values are shifted up by 37.
data = np.random.randn(4, 6).round(1)
data[:, ::2] *= 10
data += 37

# Assemble the DataFrame from the mock values and the two MultiIndex objects.
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,34.0,36.7,34.0,36.3,19.0,36.1
2013,2,28.0,36.4,33.0,35.4,49.0,37.2
2014,1,31.0,38.0,33.0,37.5,35.0,36.0
2014,2,30.0,36.9,32.0,36.1,32.0,37.0


In [96]:
health_data.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject,Bob,Guido,Sue
year,visit,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013,1,HR,34.0,34.0,19.0
2013,1,Temp,36.7,36.3,36.1
2013,2,HR,28.0,33.0,49.0
2013,2,Temp,36.4,35.4,37.2
2014,1,HR,31.0,33.0,35.0
2014,1,Temp,38.0,37.5,36.0
2014,2,HR,30.0,32.0,32.0
2014,2,Temp,36.9,36.1,37.0


In [97]:
health_data.stack().stack()

year  visit  type  subject
2013  1      HR    Bob        34.0
                   Guido      34.0
                   Sue        19.0
             Temp  Bob        36.7
                   Guido      36.3
                   Sue        36.1
      2      HR    Bob        28.0
                   Guido      33.0
                   Sue        49.0
             Temp  Bob        36.4
                   Guido      35.4
                   Sue        37.2
2014  1      HR    Bob        31.0
                   Guido      33.0
                   Sue        35.0
             Temp  Bob        38.0
                   Guido      37.5
                   Sue        36.0
      2      HR    Bob        30.0
                   Guido      32.0
                   Sue        32.0
             Temp  Bob        36.9
                   Guido      36.1
                   Sue        37.0
dtype: float64

In [98]:
health_data.stack().stack().unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject,Bob,Guido,Sue
year,visit,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013,1,HR,34.0,34.0,19.0
2013,1,Temp,36.7,36.3,36.1
2013,2,HR,28.0,33.0,49.0
2013,2,Temp,36.4,35.4,37.2
2014,1,HR,31.0,33.0,35.0
2014,1,Temp,38.0,37.5,36.0
2014,2,HR,30.0,32.0,32.0
2014,2,Temp,36.9,36.1,37.0


In [99]:
# Small demo frame; note that id 1 appears twice, with different amounts.
x = pd.DataFrame(dict(id=[1, 1, 3, 4, 5], amt=[20, 30, 10, 30, 30]))
x

Unnamed: 0,id,amt
0,1,20
1,1,30
2,3,10
3,4,30
4,5,30


In [100]:
x.id.duplicated()

0    False
1     True
2    False
3    False
4    False
Name: id, dtype: bool

In [101]:
x.groupby('id').agg({
    'id' : 'min'
})

Unnamed: 0_level_0,id
id,Unnamed: 1_level_1
1,1
3,3
4,4
5,5


In [102]:
x.head(1000)

Unnamed: 0,id,amt
0,1,20
1,1,30
2,3,10
3,4,30
4,5,30


In [103]:
x.drop_duplicates()

Unnamed: 0,id,amt
0,1,20
1,1,30
2,3,10
3,4,30
4,5,30


In [104]:
import numpy as np




In [111]:
xs = np.linspace(-3,3)
xs

array([-3.        , -2.87755102, -2.75510204, -2.63265306, -2.51020408,
       -2.3877551 , -2.26530612, -2.14285714, -2.02040816, -1.89795918,
       -1.7755102 , -1.65306122, -1.53061224, -1.40816327, -1.28571429,
       -1.16326531, -1.04081633, -0.91836735, -0.79591837, -0.67346939,
       -0.55102041, -0.42857143, -0.30612245, -0.18367347, -0.06122449,
        0.06122449,  0.18367347,  0.30612245,  0.42857143,  0.55102041,
        0.67346939,  0.79591837,  0.91836735,  1.04081633,  1.16326531,
        1.28571429,  1.40816327,  1.53061224,  1.65306122,  1.7755102 ,
        1.89795918,  2.02040816,  2.14285714,  2.26530612,  2.3877551 ,
        2.51020408,  2.63265306,  2.75510204,  2.87755102,  3.        ])

In [106]:
from scipy.stats import norm

In [109]:
# Evaluate the CDF of a normal distribution with loc 0 and scale 1 at every point in xs.
ys = norm(loc=0, scale=1).cdf(xs)

In [110]:
ys


array([0.0013499 , 0.00200388, 0.00293369, 0.00423604, 0.00603307,
       0.00847582, 0.01174695, 0.01606229, 0.02167053, 0.02885073,
       0.03790684, 0.0491592 , 0.06293263, 0.07954137, 0.0992714 ,
       0.12236094, 0.1489804 , 0.17921329, 0.21303974, 0.25032436,
       0.29080984, 0.33411757, 0.37975571, 0.42713481, 0.47559021,
       0.52440979, 0.57286519, 0.62024429, 0.66588243, 0.70919016,
       0.74967564, 0.78696026, 0.82078671, 0.8510196 , 0.87763906,
       0.9007286 , 0.92045863, 0.93706737, 0.9508408 , 0.96209316,
       0.97114927, 0.97832947, 0.98393771, 0.98825305, 0.99152418,
       0.99396693, 0.99576396, 0.99706631, 0.99799612, 0.9986501 ])