# **Pandas Package (W3Schools)**

[Github: Pandas Package](https://github.com/pandas-dev/pandas)

# **Import Pandas**

In [None]:
import pandas  # plain import: the package is referenced by its full name

# Each key becomes a column; each list holds that column's values.
mydataset = {'cars': ["BMW", "Volvo", "Ford"], 'passings': [3, 7, 2]}

# Turn the dictionary into a table and display it.
myvar = pandas.DataFrame(data=mydataset)
print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [None]:
import pandas as pd  # `as` binds the package to the short alias `pd`

# Each key becomes a column; each list holds that column's values.
mydataset = {'cars': ["BMW", "Volvo", "Ford"], 'passings': [3, 7, 2]}

# Same table as before, this time built through the alias.
myvar = pd.DataFrame(data=mydataset)
print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [None]:
# Checking Pandas Version
# The installed release is exposed through the package's `__version__` attribute.

print(pd.__version__)

1.1.5


# **Pandas `Series`**

In [None]:
# A pandas Series behaves like a single column of a table:
# a one-dimensional array that can hold data of any type.

import pandas as pd

a = [1, 7, 2]
myvar = pd.Series(data=a)
print(myvar)

0    1
1    7
2    2
dtype: int64


In [None]:
# If nothing else is specified, the values are labeled with their index number: the first value has index 0, the second has index 1, and so on.
# This label can be used to access a specified value.

print(myvar[0])

1


In [None]:
# Create labels: the `index` argument assigns a custom label to each value.
a = [1, 7, 2]  # the values stored in the Series
myvar = pd.Series(
    a,
    index=["x", "y", "z"],  # one label per value, in the same order
)
print(myvar)

x    1
y    7
z    2
dtype: int64


In [None]:
# Once labels have been created, an item can be accessed by referring to its label.
print(myvar["y"])

7


In [None]:
# Building a Series from a dictionary: the keys become the labels.
calories = {
    "day1": 420,
    "day2": 380,
    "day3": 390,
}
myvar = pd.Series(data=calories)
myvar

day1    420
day2    380
day3    390
dtype: int64

In [None]:
# Pass `index` with only the keys you want; the Series keeps just those dictionary items.
myvar = pd.Series(data=calories, index=["day1", "day2"])
print(myvar)

day1    420
day2    380
dtype: int64


# **DataFrames**

Data sets in Pandas are usually multi-dimensional tables, called DataFrames.

A `Series` is like a single column; a `DataFrame` is the whole table.

In [None]:
import pandas as pd

# Column name -> list of values, one entry per row.
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}

# Build the table from the dictionary and display it.
myvar = pd.DataFrame(data=data)
print(myvar)

   calories  duration
0       420        50
1       380        40
2       390        45


# **Pandas DataFrames**
A Pandas DataFrame is a two-dimensional data structure, like a two-dimensional array or a table with rows and columns.

In [None]:
# Column name -> list of values, one entry per row.
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}

# Load the dictionary into a DataFrame object and display it.
df = pd.DataFrame(data=data)
print(df)

   calories  duration
0       420        50
1       380        40
2       390        45


In [None]:
# Locate Row

# Refer to the row index: loc[<label>] returns that one row as a Series.
print(df.loc[0])

calories    420
duration     50
Name: 0, dtype: int64


In [None]:
# Use a list of indexes to select several rows at once:
print(df.loc[[0, 1]]) 

# Note: when a list is passed inside loc[] (double brackets), the result is a pandas DataFrame.

   calories  duration
0       420        50
1       380        40


In [None]:
# The `index` argument lets you name the rows yourself.
#   dictionary keys -> column names
#   index entries   -> row labels

data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}

df = pd.DataFrame(data=data, index=["day1", "day2", "day3"])
print(df)

      calories  duration
day1       420        50
day2       380        40
day3       390        45


In [None]:
# Refer to the named index:
# NOTE(review): the recorded output of this cell is a NameError, which suggests it was
# executed before `df` (or `pd`) existed in the kernel. Run the cell that builds `df`
# with the "day1"/"day2"/"day3" index first, then re-run this one.
print(df.loc["day2"])

NameError: ignored

# **Load Files Into a DataFrame**

In [None]:
# Read the CSV file into a DataFrame; 'data.csv' is a relative path, so the file
# must be reachable from the notebook's working directory.
df = pd.read_csv('data.csv')

In [None]:
# to_string() renders the entire DataFrame as one string. Wrap it in print() so the
# table is shown with real line breaks; as a bare last expression the notebook would
# display the escaped string repr ('...\n...') instead of the table.
print(df.to_string())

In [None]:
# A simple way to store big data sets is to use CSV files (comma-separated values files).
# CSV files contain plain text and are a well-known format that can be read by everyone, including pandas.

df = pd.read_csv('data.csv')
print(df.to_string())  # use to_string() to print the entire DataFrame.

In [None]:
# JSON file: read_json() loads it into a DataFrame; to_string() prints the entire frame.

df = pd.read_json('data.json') 
print(df.to_string()) 

In [None]:
# Dictionary input
# A JSON object has the same shape as a Python dictionary, so when the data is
# already available as a dict it can be handed straight to the DataFrame constructor.
# Outer keys become the columns; inner keys become the row labels.

data = {
  "Duration": {"0": 60, "1": 60, "2": 60, "3": 45, "4": 45, "5": 60},
  "Pulse": {"0": 110, "1": 117, "2": 103, "3": 109, "4": 117, "5": 102},
  "Maxpulse": {"0": 130, "1": 145, "2": 135, "3": 175, "4": 148, "5": 127},
  "Calories": {"0": 409, "1": 479, "2": 340, "3": 282, "4": 406, "5": 300},
}

df = pd.DataFrame(data=data)
print(df)

In [None]:
# The head() method returns the headers and a specified number of rows, starting from the top.

df = pd.read_csv('data.csv')
print(df.head(10))  # Get a quick overview by printing the first 10 rows of the DataFrame.

In [None]:
df = pd.read_csv('data.csv')
print(df.head())  # Called without an argument, head() prints the first 5 rows of the DataFrame.

In [None]:
# There is also a tail() method for viewing the last rows of the DataFrame.
# The tail() method returns the headers and a specified number of rows, starting from the bottom.

print(df.tail())  # Print the last 5 rows of the DataFrame.

In [None]:
print(df.tail(9))  # Print the last 9 rows of the DataFrame.

In [None]:
# Info About the Data
# The DataFrame object has a method called info() that gives you more information about the data set.

# info() writes its summary directly to the output and returns None, so it is called
# on its own; wrapping it in print() would add a stray "None" line after the summary.
df.info()

In [None]:
# Null Values
# The info() method also tells us how many non-null values are present in each column; in our data set there appear to be 164 non-null values out of 169 in the "Calories" column.
# There are 5 rows with no value at all, in the "Calories" column, for whatever reason.
# Empty values, or Null values, can be bad when analyzing data, and you should consider removing rows with empty values.

In [None]:
# By default, the dropna() method returns a new DataFrame and does not change the original.

new_df = df.dropna()
print(new_df.to_string())

In [None]:
# If you want to change the original DataFrame, use the inplace = True argument.
# dropna(inplace = True) does NOT return a new DataFrame; instead it removes all rows containing NULL values from the original DataFrame.

df.dropna(inplace = True)
print(df.to_string())