# What is Pandas?
Pandas is a Python library used for working with data sets.

It has functions for analyzing, cleaning, exploring, and manipulating data.

The name "Pandas" refers to both "Panel Data" and "Python Data Analysis". The library was created by Wes McKinney in 2008.

# Why Use Pandas?
Pandas allows us to analyze big data and make conclusions based on statistical theories.

Pandas can clean messy data sets, and make them readable and relevant.

Relevant data is very important in data science.

# What Can Pandas Do?
Pandas gives you answers about the data. Like:

Is there a correlation between two or more columns?
What is the average value?
Max value?
Min value?

In [None]:
#Installation
#The leading "!" makes the notebook run this line as a shell command; it installs pandas with pip.
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import pandas

  from pandas.core import (


In [None]:
import pandas

#Each key is a column name and each list holds that column's values.
mydataset = dict(
    cars=["BMW", "Volvo", "Ford"],
    passings=[3, 7, 2],
)

#Turn the dictionary into a table and show it.
myvar = pandas.DataFrame(data=mydataset)
print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


#Same example as before, this time using the conventional "pd" alias.
#The import must actually run: the code below refers to pd, which is
#undefined if the import is left commented out.
import pandas as pd

mydataset = {
  'cars': ["BMW", "Volvo", "Ford"],
  'passings': [3, 7, 2]
}

myvar = pd.DataFrame(mydataset)

print(myvar)

In [None]:
#Import pandas under the "pd" alias so the name exists when this cell runs,
#then print the installed pandas version string.
import pandas as pd

print(pd.__version__)

2.2.1


# What is a Series?
A Pandas Series is like a column in a table.

It is a one-dimensional array holding data of any type.

In [None]:
#Build a one-dimensional Pandas Series out of a plain Python list:

import pandas as pd

a = [1, 7, 2]
myvar = pd.Series(data=a)  #no index passed here

print(myvar)

0    1
1    7
2    2
dtype: int64


# Labels
If nothing else is specified, the values are labeled with their index number. First value has index 0, second value has index 1 etc.

This label can be used to access a specified value.

In [None]:
#Look up the value stored under label 0 and print it:

first_value = myvar[0]
print(first_value)

1


In [None]:
#Attach custom labels to the values through the index argument:

import pandas as pd

a = [1, 7, 2]
labels = ["x", "y", "z"]

myvar = pd.Series(data=a, index=labels)
print(myvar)

x    1
y    7
z    2
dtype: int64


In [None]:
#Fetch the value labelled "y" and print it:

y_value = myvar["y"]
print(y_value)

7


# What is a DataFrame?
A Pandas DataFrame is a two-dimensional data structure, like a two-dimensional array or a table with rows and columns.

In [None]:
#Build a small Pandas DataFrame from a dictionary of columns:

import pandas as pd

data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

#each key becomes a column name, each list that column's values:
df = pd.DataFrame(data=data)
print(df)

   calories  duration
0       420        50
1       380        40
2       390        45


# Locate Row
As you can see from the result above, the DataFrame is like a table with rows and columns.

Pandas uses the loc attribute to return one or more specified rows.

In [None]:
#select the row whose index label is 0, then print it:
first_row = df.loc[0]
print(first_row)

calories    420
duration     50
Name: 0, dtype: int64


In [None]:
#Select rows 0 and 1 together:

#passing a list of index labels to loc gives back a DataFrame:
wanted_rows = [0, 1]
print(df.loc[wanted_rows])

   calories  duration
0       420        50
1       380        40


# Named Indexes
With the index argument, you can name your own indexes.

In [None]:
#Give every row a name by passing a list of labels as the index:

import pandas as pd

data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)
row_names = ["day1", "day2", "day3"]

df = pd.DataFrame(data=data, index=row_names)
print(df)

      calories  duration
day1       420        50
day2       380        40
day3       390        45


# Load Files Into a DataFrame
If your data sets are stored in a file, Pandas can load them into a DataFrame.

In [None]:
import pandas as pd

#The file name is a relative path, so it is resolved against the current working directory.
csv_path = 'Que3.csv'
df = pd.read_csv(csv_path)

print(df)

     Zone       Date  Temperature ReportType
0       A   4/1/2019          3.3      FM-15
1       A   4/1/2019          5.0      SOD  
2       A   4/1/2019          5.0      SOM  
3       A   4/1/2019          2.2      FM-15
4       A   4/1/2019          1.1      FM-15
...   ...        ...          ...        ...
7341    B  4/30/2020          7.2      FM-15
7342    D  4/30/2020          7.2      FM-15
7343    B  4/30/2020          7.2      FM-15
7344    D  4/30/2020          7.2      FM-15
7345    B  4/30/2020          7.2      FM-15

[7346 rows x 4 columns]


# Read CSV Files
A simple way to store big data sets is to use CSV files (comma separated files).

CSV files contain plain text and are a well-known format that can be read by everyone, including Pandas.

In our examples we will be using a CSV file called 'data.csv'.

In [None]:
#Build a nested Python dictionary and load it into a DataFrame:
import pandas as pd

#Row labels shared by every column (strings, not integers).
row_labels = ["0", "1", "2", "3", "4", "5"]

#Column name -> values, listed in row-label order.
columns = {
    "Duration": [60, 60, 60, 45, 45, 60],
    "Pulse": [110, 117, 103, 109, 117, 102],
    "Maxpulse": [130, 145, 135, 175, 148, 127],
    "Calories": [409, 479, 340, 282, 406, 300],
}

#Pair each value with its row label: {column: {row label: value}}.
data = {
    name: dict(zip(row_labels, values))
    for name, values in columns.items()
}

df = pd.DataFrame(data)

print(df)

   Duration  Pulse  Maxpulse  Calories
0        60    110       130       409
1        60    117       145       479
2        60    103       135       340
3        45    109       175       282
4        45    117       148       406
5        60    102       127       300


# Viewing the Data
One of the most used methods for getting a quick overview of the DataFrame is the head() method.

The head() method returns the headers and a specified number of rows, starting from the top.

In [None]:
#Print the first 10 rows of the DataFrame for a quick overview:

import pandas as pd

csv_path = 'Que3.csv'
df = pd.read_csv(csv_path)

first_rows = df.head(10)
print(first_rows)

  Zone      Date  Temperature ReportType
0    A  4/1/2019          3.3      FM-15
1    A  4/1/2019          5.0      SOD  
2    A  4/1/2019          5.0      SOM  
3    A  4/1/2019          2.2      FM-15
4    A  4/1/2019          1.1      FM-15
5    A  4/1/2019          0.6      FM-15
6    A  4/1/2019          0.0      FM-15
7    A  4/1/2019         -0.6      FM-15
8    A  4/1/2019         -0.6      FM-15
9    A  4/1/2019          1.1      FM-15


# There is also a tail() method for viewing the last rows of the DataFrame.

The tail() method returns the headers and a specified number of rows, starting from the bottom.

In [None]:
#Grab the final 10 rows of the DataFrame and print them:
last_rows = df.tail(10)
print(last_rows)

     Zone       Date  Temperature ReportType
7336    D  4/30/2020          7.2      FM-15
7337    B  4/30/2020          7.2      FM-15
7338    D  4/30/2020          7.2      FM-16
7339    B  4/30/2020          7.2      FM-16
7340    D  4/30/2020          7.2      FM-15
7341    B  4/30/2020          7.2      FM-15
7342    D  4/30/2020          7.2      FM-15
7343    B  4/30/2020          7.2      FM-15
7344    D  4/30/2020          7.2      FM-15
7345    B  4/30/2020          7.2      FM-15


In [None]:
import pandas as pd

#Read the whole CSV file into a DataFrame, then show it.
csv_path = 'data.csv'
df = pd.read_csv(csv_path)
print(df)


     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
..        ...    ...       ...       ...
164        60    105       140     290.8
165        60    110       145     300.0
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4

[169 rows x 4 columns]


In [None]:
#Return up to the first 50 rows; as the last expression in the cell, the result is displayed by the notebook.
df.head(50)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
5,60,102,127,300.0
6,60,110,136,374.0
7,45,104,134,253.3
8,30,109,133,195.1
9,60,98,124,269.0
