In [44]:
import pandas as pd
import numpy as np

In [45]:
#Creating DataFrames
fruit_sales = pd.DataFrame({"Apples":[34, 41], "Bananas":[21, 34]}, index=["2017 Sales", "2018 Sales"])
fruit_sales

Unnamed: 0,Apples,Bananas
2017 Sales,34,21
2018 Sales,41,34


In [46]:
#Creating Series
ingredients = pd.Series(["4 cups", "1 cup", "2 large", "1 can"], index=["Flour", "Milk", "Eggs", "Spam"], name="Dinner")
ingredients

Flour     4 cups
Milk       1 cup
Eggs     2 large
Spam       1 can
Name: Dinner, dtype: object

In [47]:
dict = {
    'country': ["Brazil", "Russia", "India", "China", "South Africa"],
    'capital': ["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
    'area': [8.156, 17.10, 3.286, 9.597, 1.221],
    'population': [200.4, 143.5, 1252, 1357, 52.98]
}
dict

{'country': ['Brazil', 'Russia', 'India', 'China', 'South Africa'],
 'capital': ['Brasilia', 'Moscow', 'New Dehli', 'Beijing', 'Pretoria'],
 'area': [8.156, 17.1, 3.286, 9.597, 1.221],
 'population': [200.4, 143.5, 1252, 1357, 52.98]}

## Create from dictionary

In [69]:
brics = pd.DataFrame(dict)
#Set the row indexes
brics.index = ['BR', 'RU', 'IN', 'CH', 'SA'] 
brics

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Dehli,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


In [70]:
# Set the index to Country column, 
# drop, don't drop the new indexed column from DataFrame
# Inplace, instead of returning new DataFrame, modify the existing one inplace
# Append, append the new column with the current index

brics.set_index('country', drop=False, inplace=True, append=True)
brics
#help(brics.set_index)

Unnamed: 0_level_0,Unnamed: 1_level_0,country,capital,area,population
Unnamed: 0_level_1,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BR,Brazil,Brazil,Brasilia,8.156,200.4
RU,Russia,Russia,Moscow,17.1,143.5
IN,India,India,New Dehli,3.286,1252.0
CH,China,China,Beijing,9.597,1357.0
SA,South Africa,South Africa,Pretoria,1.221,52.98


In [71]:
#Reset the index back again
brics.index = ['BR', 'RU', 'IN', 'CH', 'SA']
brics

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Dehli,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


## Write DataFrame to a File

In [19]:
brics.to_csv("Data\\brics_from_program.csv")

## Read DataFrame from a csv file

In [20]:
#load from CSV file, first col is index for rows, first row is name of columns
brics = pd.read_csv("Data\\brics.csv", index_col=0)
brics

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Dehli,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


## Accessing values in the form of Series, DataFrame using indexes

In [21]:
#Accessing a column using square bracket [] return the values of that colums in a Series. This will return a Series.

c1 = brics['country']
print(c1.head())
print(type(c1))

BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object
<class 'pandas.core.series.Series'>


In [22]:
#If you want to get DataFrame returned use 2 square brackets [[]]. This will return a DataFrame.

c1 = brics[['country']]
print(c1.head())
print(type(c1))

         country
BR        Brazil
RU        Russia
IN         India
CH         China
SA  South Africa
<class 'pandas.core.frame.DataFrame'>


## Returning 2 or more columns from DataFrame

In [23]:
c1 = brics[['country', 'area']]
print(c1.head)
print(type(c1))

<bound method NDFrame.head of          country    area
BR        Brazil   8.156
RU        Russia  17.100
IN         India   3.286
CH         China   9.597
SA  South Africa   1.221>
<class 'pandas.core.frame.DataFrame'>


## Selecting Rows

In [24]:
r1 = brics[1:3]
print(type(r1))
print(r1)

<class 'pandas.core.frame.DataFrame'>
   country    capital    area  population
RU  Russia     Moscow  17.100       143.5
IN   India  New Dehli   3.286      1252.0


In [25]:
r1 = brics[2:]
print(r1)

         country    capital   area  population
IN         India  New Dehli  3.286     1252.00
CH         China    Beijing  9.597     1357.00
SA  South Africa   Pretoria  1.221       52.98


## Slicing using Rows i.e. loc and iloc
### loc returns based on labels of the rows
### iloc return based on position of the rows

In [26]:
#Get Row use loc function
r1 = brics.loc['RU']
print(type(r1))
r1

<class 'pandas.core.series.Series'>


country       Russia
capital       Moscow
area            17.1
population     143.5
Name: RU, dtype: object

In [27]:
#Use 2 square brackets to get Row data is DataFrame
r1 = brics.loc[['RU']]
print(type(r1))
r1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5


In [28]:
#returns multiple rows based on index or row labels
r1 = brics.loc[['RU', 'CH']]
print(type(r1))
r1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,capital,area,population
RU,Russia,Moscow,17.1,143.5
CH,China,Beijing,9.597,1357.0


## You can also return the selected rows and selected columns only. Starting with Rows first

In [29]:
r1 = brics.loc[['RU', 'CH'], ["country", "area"]]
print(type(r1))
r1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,area
RU,Russia,17.1
CH,China,9.597


In [30]:
#All rows but only selected columns
r1= brics.loc[:,["country", "area"]]
r1

Unnamed: 0,country,area
BR,Brazil,8.156
RU,Russia,17.1
IN,India,3.286
CH,China,9.597
SA,South Africa,1.221


In [31]:
#selected rows all columns
r1= brics.loc[['IN', 'SA'],:]
r1

Unnamed: 0,country,capital,area,population
IN,India,New Dehli,3.286,1252.0
SA,South Africa,Pretoria,1.221,52.98


In [32]:
#selected rows sekected columns
r1= brics.loc[['IN', 'SA'],['area', 'population']]
r1

Unnamed: 0,area,population
IN,3.286,1252.0
SA,1.221,52.98


In [33]:
# You can also get multiple rows based on rows location
r1 = brics.iloc[-3:]
print(type(r1))
r1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,capital,area,population
IN,India,New Dehli,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


In [34]:
# You can also get multiple rows based on rows location
r1 = brics.iloc[0:-1]
print(type(r1))
r1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Dehli,3.286,1252.0
CH,China,Beijing,9.597,1357.0


## Filtering DataFrames

In [35]:
is_huge = brics.area > 8
is_huge

BR     True
RU     True
IN    False
CH     True
SA    False
Name: area, dtype: bool

In [36]:
huge_countries = brics[is_huge]
huge_countries

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
RU,Russia,Moscow,17.1,143.5
CH,China,Beijing,9.597,1357.0


In [37]:
huge_countries = brics[brics.area > 8]
huge_countries

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
RU,Russia,Moscow,17.1,143.5
CH,China,Beijing,9.597,1357.0


In [76]:
huge_countries = brics[np.logical_and(brics["area"] > 8, brics["area"] < 10)]
huge_countries

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
CH,China,Beijing,9.597,1357.0


In [77]:
#Same thing can be done with a logical operator

huge_countries = brics[(brics.area > 8) & (brics.area < 10)]
huge_countries

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
CH,China,Beijing,9.597,1357.0


## IsIn method for selection

In [80]:
brics.loc[brics.country.isin(['Brazil', 'China'])]

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
CH,China,Beijing,9.597,1357.0


## Notnull based selection

In [81]:
brics.loc[brics.country.notnull()]

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.156,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Dehli,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


## Looping a DataFrame

In [39]:
#only prints column names
for val in brics:
    print(val)

country
capital
area
population


In [40]:
# You need to explicitly mention that you need to iterate over rows.

for lab, row in brics.iterrows():
    print(type(lab), lab, type(row), row, sep='\n')

<class 'str'>
BR
<class 'pandas.core.series.Series'>
country         Brazil
capital       Brasilia
area             8.156
population       200.4
Name: BR, dtype: object
<class 'str'>
RU
<class 'pandas.core.series.Series'>
country       Russia
capital       Moscow
area            17.1
population     143.5
Name: RU, dtype: object
<class 'str'>
IN
<class 'pandas.core.series.Series'>
country           India
capital       New Dehli
area              3.286
population       1252.0
Name: IN, dtype: object
<class 'str'>
CH
<class 'pandas.core.series.Series'>
country         China
capital       Beijing
area            9.597
population     1357.0
Name: CH, dtype: object
<class 'str'>
SA
<class 'pandas.core.series.Series'>
country       South Africa
capital           Pretoria
area                 1.221
population           52.98
Name: SA, dtype: object


In [41]:
# Add new columns to DataFrame using for loop
for lab, row in brics.iterrows():
    brics.loc[lab, 'name_length'] = len(row["country"])

print(brics)

         country    capital    area  population  name_length
BR        Brazil   Brasilia   8.156      200.40          6.0
RU        Russia     Moscow  17.100      143.50          6.0
IN         India  New Dehli   3.286     1252.00          5.0
CH         China    Beijing   9.597     1357.00          5.0
SA  South Africa   Pretoria   1.221       52.98         12.0


In [88]:
# We can do the above using apply
# In this example we are selecting a column name_length in brics DataFrame, which doesn't exists, so it'll be created.
# Then we are selecting the country columns from it and then we are applying the len function on each row of it.

brics["name_length"] = brics["country"].apply(len)
print(brics)

         country    capital    area  population  name_length
BR        Brazil   Brasilia   8.156      200.40            6
RU        Russia     Moscow  17.100      143.50            6
IN         India  New Dehli   3.286     1252.00            5
CH         China    Beijing   9.597     1357.00            5
SA  South Africa   Pretoria   1.221       52.98           12


# Summary Functions

In [82]:
brics.area.describe()

count     5.000000
mean      7.872000
std       6.194341
min       1.221000
25%       3.286000
50%       8.156000
75%       9.597000
max      17.100000
Name: area, dtype: float64

In [83]:
brics.country.describe()

count          5
unique         5
top       Brazil
freq           1
Name: country, dtype: object

In [93]:
print("Mean {}, Mode {}, Std {}".format(brics.area.mean(), brics.area.mode(), brics.area.std()),  sep="\n")

Mean 7.872, Mode 0     1.221
1     3.286
2     8.156
3     9.597
4    17.100
dtype: float64, Std 6.19434140970612


In [95]:
print(brics.name_length.unique())
print(brics.name_length.value_counts())

[ 6  5 12]
6     2
5     2
12    1
Name: name_length, dtype: int64


# Maps and Apply

In [97]:
#The below map method takes a function which accepts 1 parameter which is the value 
# of the column from each row on which it's called upon like in this case it is population column 
# The method than returns the how far the population is from it's mean

population_mean = brics.population.mean()
print(population_mean)
brics.population.map(lambda p: p - population_mean)

601.176


BR   -400.776
RU   -457.676
IN    650.824
CH    755.824
SA   -548.196
Name: population, dtype: float64

In [102]:
# Applies a method on each row of the DataFrame or Series.
population_mean = brics.population.mean()
def remean_population(row):
    row.population = row.population - population_mean
    return row

brics.apply(remean_population, axis='columns')

Unnamed: 0,country,capital,area,population,name_length
BR,Brazil,Brasilia,8.156,-400.776,6
RU,Russia,Moscow,17.1,-457.676,6
IN,India,New Dehli,3.286,650.824,5
CH,China,Beijing,9.597,755.824,5
SA,South Africa,Pretoria,1.221,-548.196,12
