In [1]:
import numpy as np
import pandas as pd

# The Series Data Structure

## Series Creation

In [2]:
students = ['Tien', "Bang", "Di"]
pd.Series(students)

0    Tien
1    Bang
2      Di
dtype: object

In [7]:
numbers = [1, 2.0, 3, None]
pd.Series(numbers)

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [11]:
np.nan == None
np.nan == np.nan

False

In [12]:
# .isnan() function is used to check if there is NaN value in the Series
np.isnan(np.nan)

True

In [15]:
# We can create the index and value column for pandas series
students_classes = {"Tien": "English",
                   "Bang": "English",
                   "Di": "Mathematics"}
sc = pd.Series(students_classes)
sc

In [18]:
print(sc.index)
print(sc.values)
print(sc.dtypes)

Index(['Tien', 'Bang', 'Di'], dtype='object')
['English' 'English' 'Mathematics']
object


In [21]:
students_scores = [{"Di": 9.5}, {"Bang": 9.40}, {"Tien": 9.42}]
ss = pd.Series(students_scores, index=[1,2,3])
ss

1       {'Di': 9.5}
2     {'Bang': 9.4}
3    {'Tien': 9.42}
dtype: object

In [22]:
# Here is an special example where there is an incomtibility between index of Series and index of the dictionary
pd.Series(sc, index=["Tien", "Di", "Thuc"])

Tien        English
Di      Mathematics
Thuc            NaN
dtype: object

## Querying a Series

In [69]:
# Way 1: Query by numeric location: starting at 0 => iloc attribute
print(sc.iloc[0])
print(sc[0])
# Way 2: Query by index label => loc attribute
print(sc.loc["Di"])
print(sc["Di"])
# Here is the problem come in when you don't use the loc and iloc attribute
print(ss.loc[1])
print(ss.iloc[0])
# print(ss[0]) !!!
# With loc attribute, you are not only able to output the record, but can also add new records into the series if the label is not recognized
sc.loc["Thuc"] = "Math"
print(sc)

English
English
Mathematics
Mathematics
{'Di': 9.5}
{'Di': 9.5}
Tien        English
Bang        English
Di      Mathematics
Thuc           Math
dtype: object


In [79]:
nhien_classes = pd.Series(["English", "Chemistry", "Physics"], index=["Nhien","Nhien", "Nhien"])
alls = sc.append(nhien_classes)
print(alls)
print(alls.loc["Nhien"])

Tien         English
Bang         English
Di       Mathematics
Thuc            Math
Nhien        English
Nhien      Chemistry
Nhien        Physics
dtype: object
Nhien      English
Nhien    Chemistry
Nhien      Physics
dtype: object


  alls = sc.append(nhien_classes)


In [78]:
sc

Tien        English
Bang        English
Di      Mathematics
Thuc           Math
dtype: object

## Series Calculations

In [41]:
a = pd.Series(np.random.randint(0,1000,10000))
a.head(10)

0     57
1    319
2    982
3    458
4    448
5    828
6    770
7    949
8    854
9    673
dtype: int32

In [40]:
avg = np.sum(a)/len(a)
avg

508.3133

In [44]:
# Cellular magic functions start with %%
# Eg: timeit magic function to determine on average how long it takes to run through the code

In [49]:
%%timeit -n 100
total = 0
for each in a:
    total += each
total/len(a)

3.12 ms ± 301 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [51]:
%%timeit -n 100
np.sum(a)/len(a)

156 µs ± 51.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


The above method is vectorization. It is the ability for a computer to execute multiple instructions at once => hight performance and dramatic speedups.
Using such a function can help in minimizing the running time of code efficiently.

In [52]:
print(a.head(10))
a+=2
print(a.head(10))

0     57
1    319
2    982
3    458
4    448
5    828
6    770
7    949
8    854
9    673
dtype: int32
0     59
1    321
2    984
3    460
4    450
5    830
6    772
7    951
8    856
9    675
dtype: int32


The next mathod of numpy and pandas is broadcasting. We can apply an operation to every value in the series and change it.

In [66]:
# iteritems() helps us to unpack each item in the whole series easily
for label, value in a.iteritems():
    a.at[label]+=1
a.head(10)

0     61
1    323
2    986
3    462
4    452
5    832
6    774
7    953
8    858
9    677
dtype: int32

# The DataFrame Data Structure

In [15]:
# The DataFram data structure is a 2-dimensional series object, where there is an index and multiple columns of content
record1 = pd.Series({"Name": "Tien",
                        "Class": "Data Analytics",
                        "Score": 95})
record2 = pd.Series({"Name": "Di",
                        "Class": "Architecture",
                        "Score": 98})
record3 = pd.Series({"Name": "Bang",
                        "Class": "Communication",
                        "Score": 94})
df = pd.DataFrame([record1, record2, record3], index=["school1", "school2", "school1"])

In [16]:
df

Unnamed: 0,Name,Class,Score
school1,Tien,Data Analytics,95
school2,Di,Architecture,98
school1,Bang,Communication,94


In [17]:
# Data Extraction in DataFrame rely on loc[] and iloc[] with 2 argument corresponds to the 2 dimensions.
df.iloc[1,1:]

Class    Architecture
Score              98
Name: school2, dtype: object

In [34]:
print(df.loc["school1"])
print(type(df.loc["school1"]))
print(type(df.loc["school2"]))
# We can also use the bracket notation for selecting data by column. With loc[], we can only use it for row or row and column extraction
print(df["Name"])

         Name           Class  Score
school1  Tien  Data Analytics     95
school1  Bang   Communication     94
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
school1    Tien
school2      Di
school1    Bang
Name: Name, dtype: object


In [24]:
# With loc[], you can only use index name and column name to extract data. It won't work if you try to use the default index value
df.loc["school1", "Name"]

school1    Tien
school1    Bang
Name: Name, dtype: object

In [25]:
# Transpose will pivot all the rows into columns and vice versa.
df.T

Unnamed: 0,school1,school2,school1.1
Name,Tien,Di,Bang
Class,Data Analytics,Architecture,Communication
Score,95,98,94


In [29]:
df.T.loc["Name"]

school1    Tien
school2      Di
school1    Bang
Name: Name, dtype: object

In [33]:
# Using the chain operation to take output of the previous extraction to be the input of the next selection. 
df.loc['school1']["Name"]

school1    Tien
school1    Bang
Name: Name, dtype: object

In [35]:
# loc[] can also be used for sling with :
df.loc[:,"Class"]

school1    Data Analytics
school2      Architecture
school1     Communication
Name: Class, dtype: object

In [39]:
df.loc["school1", :"Class"]

Unnamed: 0,Name,Class
school1,Tien,Data Analytics
school1,Bang,Communication


In [46]:
# To drop data, use the drop() function which will return you a copy of the dataframe with data removed. The original DataFrame remains intact
df.drop("school2")

Unnamed: 0,Name,Class,Score
school1,Tien,Data Analytics,95
school1,Bang,Communication,94


In [41]:
df

Unnamed: 0,Name,Class,Score
school1,Tien,Data Analytics,95
school2,Di,Architecture,98
school1,Bang,Communication,94


In [61]:
copy_df = df.copy()
copy_df.drop("Name", inplace=True, axis=1)

In [62]:
copy_df

Unnamed: 0,Class,Score
school1,Data Analytics,95
school2,Architecture,98
school1,Communication,94


In [63]:
copy_df.drop("school1", inplace=False, axis=0)

Unnamed: 0,Class,Score
school2,Architecture,98


In [64]:
copy_df

Unnamed: 0,Class,Score
school1,Data Analytics,95
school2,Architecture,98
school1,Communication,94


In [65]:
# Another fiercer way to drop a column is to use the del keyword. This will have immediate effect on the df with the specified column removed
del copy_df["Score"]
copy_df
# del copy_df["school2"] del can't be used to drop rows

Unnamed: 0,Class
school1,Data Analytics
school2,Architecture
school1,Communication


In [66]:
# We can easily create a new column with [] notation when pd df
df["Class Ranks"] = [1,2,3]
df

Unnamed: 0,Name,Class,Score,Class Ranks
school1,Tien,Data Analytics,95,1
school2,Di,Architecture,98,2
school1,Bang,Communication,94,3


## DataFrame Extraction

In [2]:
# Lets create a bigger dataframe from many series to play with.
gre = pd.Series(np.random.randint(70,170,100), name="GRE Score") # you can specify the name of column right here or use the df.rename(columns={...})
toefl = pd.Series(np.random.randint(70,120,100), name="TOEFL Score") # to change column's names
uni_rate = pd.Series(np.random.randint(1,4,100), name="University Rate")
research = pd.Series(np.random.randint(0,5,100), name="Research")
admission = pd.Series(np.random.randint(0,100,100), name="Admission Rate")
df_file = pd.concat([gre, toefl, uni_rate, research, admission], axis=1)
df = pd.DataFrame(df_file, index=np.arange(1,101,1))
df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rate,Research,Admission Rate
1,155.0,82.0,3.0,2.0,90.0
2,103.0,91.0,3.0,3.0,84.0
3,156.0,91.0,1.0,4.0,57.0
4,154.0,92.0,3.0,4.0,88.0
5,104.0,73.0,2.0,2.0,56.0


In [3]:
# Naming of columns
cols = list(df.columns)
cols

['GRE Score', 'TOEFL Score', 'University Rate', 'Research', 'Admission Rate']

In [4]:
# the strip() function checks for uneccessary space in th left and right of a string and remove them.
cols = [i.lower().strip() for i in cols]
cols
df.columns = cols

In [5]:
df

Unnamed: 0,gre score,toefl score,university rate,research,admission rate
1,155.0,82.0,3.0,2.0,90.0
2,103.0,91.0,3.0,3.0,84.0
3,156.0,91.0,1.0,4.0,57.0
4,154.0,92.0,3.0,4.0,88.0
5,104.0,73.0,2.0,2.0,56.0
...,...,...,...,...,...
96,122.0,101.0,2.0,1.0,70.0
97,146.0,98.0,2.0,0.0,73.0
98,113.0,81.0,3.0,0.0,51.0
99,108.0,107.0,3.0,2.0,86.0


In [6]:
# Use the apply() function to apply a method to the dataframe and make changes
def avg(row):
    row["average of gre and toefl"] = (row['gre score'] + row['toefl score']) / 2
    return row
df=df.apply(avg,axis='columns')
df.head()

Unnamed: 0,gre score,toefl score,university rate,research,admission rate,average of gre and toefl
1,155.0,82.0,3.0,2.0,90.0,118.5
2,103.0,91.0,3.0,3.0,84.0,97.0
3,156.0,91.0,1.0,4.0,57.0,123.5
4,154.0,92.0,3.0,4.0,88.0,123.0
5,104.0,73.0,2.0,2.0,56.0,88.5


In [15]:
# Selecting data with conditions
df[df["admission rate"] > 85.0]

Unnamed: 0,gre score,toefl score,university rate,research,admission rate
6,150.0,81.0,2.0,4.0,99.0
13,103.0,108.0,3.0,3.0,92.0
17,99.0,73.0,3.0,0.0,94.0
23,81.0,113.0,2.0,0.0,92.0
33,112.0,90.0,2.0,3.0,89.0
41,155.0,91.0,3.0,1.0,88.0
47,75.0,89.0,3.0,3.0,88.0
52,113.0,84.0,3.0,0.0,93.0
53,77.0,89.0,1.0,3.0,86.0
56,96.0,83.0,1.0,2.0,93.0


In [16]:
len(df[df["admission rate"] > 85.0])

14

In [19]:
# Building Boolean Masking
admission_masked = df['admission rate'] > 85.0

In [24]:
# After having a boolean masking, we can use the where() function to output True data from the DataFrame. False data will be displayed as NaN
df.where(admission_masked)

Unnamed: 0,gre score,toefl score,university rate,research,admission rate
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
...,...,...,...,...,...
96,84.0,74.0,2.0,1.0,95.0
97,104.0,109.0,1.0,0.0,92.0
98,,,,,
99,,,,,


In [25]:
df.where(admission_masked).dropna().head()

Unnamed: 0,gre score,toefl score,university rate,research,admission rate
6,150.0,81.0,2.0,4.0,99.0
13,103.0,108.0,3.0,3.0,92.0
17,99.0,73.0,3.0,0.0,94.0
23,81.0,113.0,2.0,0.0,92.0
33,112.0,90.0,2.0,3.0,89.0


In [26]:
df[["gre score", "research"]].head()

Unnamed: 0,gre score,research
1,152.0,4.0
2,159.0,4.0
3,160.0,1.0
4,145.0,1.0
5,80.0,1.0


In [33]:
gre_toefl = (df['gre score'] > 80) & (df['toefl score'] > 80)
# or
gre_toefl2 = (df['gre score'].gt(80).lt(150))

In [34]:
df.where(gre_toefl).dropna().head()

Unnamed: 0,gre score,toefl score,university rate,research,admission rate
1,152.0,103.0,2.0,4.0,85.0
3,160.0,83.0,2.0,1.0,6.0
4,145.0,82.0,3.0,1.0,25.0
6,150.0,81.0,2.0,4.0,99.0
7,118.0,118.0,1.0,4.0,66.0


In [54]:
df.where(gre_toefl2).dropna().head()

Unnamed: 0_level_0,gre score,toefl score,university rate,research
admission rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


## DataFrame Indexing

In [78]:
df.set_index(["admission rate", 'university rate'], inplace=True)
df.sort_index

<bound method DataFrame.sort_index of                                 gre score  toefl score  research
admission rate university rate                                  
77.0           3.0                  132.0        108.0       0.0
76.0           1.0                   89.0         88.0       2.0
               3.0                   99.0         94.0       2.0
1.0            1.0                  142.0         91.0       0.0
27.0           2.0                  126.0         96.0       4.0
...                                   ...          ...       ...
99.0           3.0                   81.0        113.0       1.0
45.0           1.0                  138.0        107.0       3.0
64.0           3.0                  164.0        100.0       2.0
94.0           1.0                   78.0         75.0       3.0
NaN            NaN                    NaN          NaN       NaN

[100 rows x 3 columns]>

In [79]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gre score,toefl score,research
admission rate,university rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77.0,3.0,132.0,108.0,0.0
76.0,1.0,89.0,88.0,2.0
76.0,3.0,99.0,94.0,2.0
1.0,1.0,142.0,91.0,0.0
27.0,2.0,126.0,96.0,4.0


In [81]:
df.loc[[(76.0,1.0)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,gre score,toefl score,research
admission rate,university rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
76.0,1.0,89.0,88.0,2.0


## Missing Value

In [82]:
# isnull() function is used for checking Null and NaN values in the dataframe
df.isnull()

Unnamed: 0_level_0,Unnamed: 1_level_0,gre score,toefl score,research
admission rate,university rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77.0,3.0,False,False,False
76.0,1.0,False,False,False
76.0,3.0,False,False,False
1.0,1.0,False,False,False
27.0,2.0,False,False,False
...,...,...,...,...
99.0,3.0,False,False,False
45.0,1.0,False,False,False
64.0,3.0,False,False,False
94.0,1.0,False,False,False


In [83]:
# There are many ways to deal with missing value. The 1st method is to remove all of them with dropna()
df.dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,gre score,toefl score,research
admission rate,university rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77.0,3.0,132.0,108.0,0.0
76.0,1.0,89.0,88.0,2.0
76.0,3.0,99.0,94.0,2.0
1.0,1.0,142.0,91.0,0.0
27.0,2.0,126.0,96.0,4.0
...,...,...,...,...
1.0,1.0,118.0,108.0,4.0
99.0,3.0,81.0,113.0,1.0
45.0,1.0,138.0,107.0,3.0
64.0,3.0,164.0,100.0,2.0


In [84]:
# The 2nd way is to fill that missing value with an appropriate value with fillna()
df.fillna(0)
# you can also use 
# ffill(): forward fill use previous value cell
df.ffill()
# bfill(): backward fill use folling value cell

Unnamed: 0_level_0,Unnamed: 1_level_0,gre score,toefl score,research
admission rate,university rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77.0,3.0,132.0,108.0,0.0
76.0,1.0,89.0,88.0,2.0
76.0,3.0,99.0,94.0,2.0
1.0,1.0,142.0,91.0,0.0
27.0,2.0,126.0,96.0,4.0
...,...,...,...,...
99.0,3.0,81.0,113.0,1.0
45.0,1.0,138.0,107.0,3.0
64.0,3.0,164.0,100.0,2.0
94.0,1.0,78.0,75.0,3.0


In [89]:
df = pd.DataFrame({"A": [1,1,2,3,4],
                   "B": [3,6,3,8,9],
                   "C": ["a","b","c","d","a"]})
df

Unnamed: 0,A,B,C
0,1,3,a
1,1,6,b
2,2,3,c
3,3,8,d
4,4,9,a


In [91]:
df.replace([1,3],[10,300])

Unnamed: 0,A,B,C
0,10,300,a
1,10,6,b
2,2,300,c
3,300,8,d
4,4,9,a
