# **Pandas Package (W3SCHOOL)**

[Github: Pandas Package](https://github.com/pandas-dev/pandas)

# **Import Pandas**

In [1]:
import pandas  # bring the package in under its full name

# Column-oriented data: every key becomes a column of the table.
mydataset = dict(
    cars=["BMW", "Volvo", "Ford"],
    passings=[3, 7, 2],
)

# Turn the dictionary into a DataFrame and show it.
myvar = pandas.DataFrame(data=mydataset)
print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [2]:
import pandas as pd  # "as" binds the package to the shorter alias pd

# Same data set as before, written on one line per column.
mydataset = {'cars': ["BMW", "Volvo", "Ford"], 'passings': [3, 7, 2]}

# The alias is used exactly like the full package name.
myvar = pd.DataFrame(data=mydataset)
print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [3]:
# Report which pandas release is installed.
# The version string is stored in the package's __version__ attribute.
installed_version = pd.__version__
print(installed_version)

2.0.1


# **Pandas `Series`**

In [4]:
# A pandas Series behaves like a single column of a table:
# a one-dimensional array that can hold data of any type.

import pandas as pd

a = [1, 7, 2]
myvar = pd.Series(data=a)
print(myvar)

0    1
1    7
2    2
dtype: int64


In [5]:
# If nothing else is specified, the values are labeled with their index number:
# the first value has index 0, the second value has index 1, and so on.
# That label can be used to access a specific value.

first_value = myvar[0]
print(first_value)

1


In [6]:
# Create Labels
# The index argument supplies one label per value.
a = [1, 7, 2]  # the values
myvar = pd.Series(data=a, index=list("xyz"))  # labels "x", "y", "z"
print(myvar)

x    1
y    7
z    2
dtype: int64


In [7]:
# Once labels exist, an item can be accessed by referring to its label.
value_at_y = myvar["y"]
print(value_at_y)

7


In [8]:
# A dictionary can be turned into a Series: its keys become the labels.
calories = dict(day1=420, day2=380, day3=390)
myvar = pd.Series(data=calories)
myvar

day1    420
day2    380
day3    390
dtype: int64

In [9]:
# To keep only some of the dictionary's items, pass the index argument
# and list just the keys that should end up in the Series.
wanted_days = ["day1", "day2"]
myvar = pd.Series(data=calories, index=wanted_days)
print(myvar)

day1    420
day2    380
dtype: int64


# **DataFrames**

Data sets in Pandas are usually multi-dimensional tables, called DataFrames.

A `Series` is like a single column; a `DataFrame` is the whole table.

In [10]:
import pandas as pd

# Two columns of equal length; each key is a column name.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

myvar = pd.DataFrame(data=data)
print(myvar)

   calories  duration
0       420        50
1       380        40
2       390        45


# **Pandas DataFrames**
A Pandas DataFrame is a two-dimensional data structure, like a two-dimensional array or a table with rows and columns.

In [11]:
# Each key is a column name; each list holds that column's values.
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}

# Load the dictionary into a DataFrame object and display it.
df = pd.DataFrame(data=data)
print(df)

   calories  duration
0       420        50
1       380        40
2       390        45


In [12]:
# Locate Row
# The loc attribute returns a row when given that row's index label.

first_row = df.loc[0]
print(first_row)

calories    420
duration     50
Name: 0, dtype: int64


In [13]:
# Passing a list of index labels selects several rows at once.
# Note: when a list ([]) is used inside loc, the result is a Pandas DataFrame.
selected_rows = df.loc[[0, 1]]
print(selected_rows)

   calories  duration
0       420        50
1       380        40


In [14]:
# The index argument lets you name the rows yourself.
#   dictionary keys -> column names
#   index entries   -> row labels

data = dict(calories=[420, 380, 390], duration=[50, 40, 45])
row_labels = ["day1", "day2", "day3"]

df = pd.DataFrame(data=data, index=row_labels)
print(df)

      calories  duration
day1       420        50
day2       380        40
day3       390        45


In [15]:
# A named index is looked up through loc by its label.
day2_row = df.loc["day2"]
print(day2_row)

calories    380
duration     40
Name: day2, dtype: int64


# **Load Files Into a DataFrame**

In [18]:
# Load the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/data.csv')

In [19]:
# to_string() renders the entire DataFrame as a single string.
# Left as the cell's last expression, the notebook displays that string's
# repr (surrounding quotes and literal \n) rather than a formatted table.
df.to_string()

'     Duration  Pulse  Maxpulse  Calories\n0          60    110       130     409.1\n1          60    117       145     479.0\n2          60    103       135     340.0\n3          45    109       175     282.4\n4          45    117       148     406.0\n5          60    102       127     300.5\n6          60    110       136     374.0\n7          45    104       134     253.3\n8          30    109       133     195.1\n9          60     98       124     269.0\n10         60    103       147     329.3\n11         60    100       120     250.7\n12         60    106       128     345.3\n13         60    104       132     379.3\n14         60     98       123     275.0\n15         60     98       120     215.2\n16         60    100       120     300.0\n17         45     90       112       NaN\n18         60    103       123     323.0\n19         45     97       125     243.0\n20         60    108       131     364.2\n21         45    100       119     282.0\n22         60    130       101   

In [21]:
# A simple way to store big data sets is to use CSV files (comma-separated files).
# CSV files contain plain text and are a well-known format that can be read by
# everyone, including Pandas.

df = pd.read_csv(filepath_or_buffer='data/data.csv')

# to_string() renders the entire DataFrame; print() then shows it as a table.
full_table = df.to_string()
print(full_table)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

In [30]:
# Load a JSON file into a DataFrame and print every row.

df = pd.read_json(path_or_buf='data/data.json')
full_table = df.to_string()
print(full_table)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

In [16]:
# Dictionary
# A JSON object has the same format as a Python dictionary, so data that is
# already held in a dictionary can be loaded into a DataFrame directly.

# Row labels shared by every column (kept as strings).
row_labels = ["0", "1", "2", "3", "4", "5"]

# One list of values per column, in row-label order.
column_values = {
    "Duration": [60, 60, 60, 45, 45, 60],
    "Pulse": [110, 117, 103, 109, 117, 102],
    "Maxpulse": [130, 145, 135, 175, 148, 127],
    "Calories": [409, 479, 340, 282, 406, 300],
}

# Nested mapping: column name -> {row label -> value}.
data = {
    column: dict(zip(row_labels, values))
    for column, values in column_values.items()
}

df = pd.DataFrame(data)
print(df)

   Duration  Pulse  Maxpulse  Calories
0        60    110       130       409
1        60    117       145       479
2        60    103       135       340
3        45    109       175       282
4        45    117       148       406
5        60    102       127       300


In [23]:
# head() returns the headers and a specified number of rows, counted from the top.

df = pd.read_csv(filepath_or_buffer='data/data.csv')

# Quick overview: show the first 10 rows of the DataFrame.
print(df.head(n=10))

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0
5        60    102       127     300.5
6        60    110       136     374.0
7        45    104       134     253.3
8        30    109       133     195.1
9        60     98       124     269.0


In [24]:
# Print the first 5 rows of the DataFrame (5 is the number head() uses by default).
df = pd.read_csv(filepath_or_buffer='data/data.csv')
print(df.head(n=5))

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0


In [25]:
# tail() is the counterpart of head(): it returns the headers and a specified
# number of rows, counted from the bottom of the DataFrame.

# Print the last 5 rows of the DataFrame (5 is the number tail() uses by default).
print(df.tail(n=5))

     Duration  Pulse  Maxpulse  Calories
164        60    105       140     290.8
165        60    110       145     300.4
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4


In [26]:
# Print the last 9 rows of the DataFrame.
print(df.tail(n=9))

     Duration  Pulse  Maxpulse  Calories
160        30     85       120     250.4
161        45     90       130     260.4
162        45     95       130     270.0
163        45    100       140     280.9
164        60    105       140     290.8
165        60    110       145     300.4
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4


In [27]:
# Info About the Data
# The DataFrame object has a method called info() that gives you more
# information about the data set: the index range, each column's non-null
# count and dtype, and the memory usage.
#
# info() prints this summary itself and returns None, so it is called directly.
# Wrapping it in print() would add a stray "None" line after the summary.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB
None


In [None]:
# Null Values
# The info() method also tells us how many non-null values are present in each column; in our data set, only 164 of the 169 values in the "Calories" column are non-null.
# That means 5 rows have no value at all in the "Calories" column, for whatever reason.
# Empty values, or null values, can be bad when analyzing data, so you should consider removing rows with empty values.

In [28]:
# By default, dropna() returns a new DataFrame and leaves the original unchanged.

new_df = df.dropna()

# Show every remaining row of the cleaned copy.
full_table = new_df.to_string()
print(full_table)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45    105       132     246.0
24         60   

In [29]:
# To change the original DataFrame instead, pass the inplace = True argument.
# dropna(inplace = True) does NOT return a new DataFrame; it removes all rows
# containing NULL values from the original DataFrame itself.

df.dropna(inplace=True)

# Show every remaining row of the (now modified) original.
full_table = df.to_string()
print(full_table)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45    105       132     246.0
24         60   