# Pandas basics

In [2]:
import numpy as np
import pandas as pd

## Pandas Series

In [3]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
print(data[0])
print(data[1:2])

0.25
1    0.5
dtype: float64


### Series as a generalized array

In [7]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [8]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [9]:
data['a']

0.25

In [10]:
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [11]:
data[['c','a','b']]

c    0.75
a    0.25
b    0.50
dtype: float64

In [12]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [13]:
population['New York']

19651127

Exercise 1: Series Initialization and Indexing

1. Create a Pandas series containing the following data: [10, 20, 30, 40, 50] and assign it to a variable called my_series.
2. Print the series to the console.
3. Print the first element of the series.
4. Print the last element of the series.
5. Print the third and fourth elements of the series.
6. Use indexing to print the last three elements of the series.

In [14]:

# 1. Create a Pandas series
my_series = pd.Series([10, 20, 30, 40, 50])

# 2. Print the series to the console
print(my_series)

# 3. Print the first element of the series
print(my_series[0])

# 4. Print the last element of the series
print(my_series[len(my_series) - 1])

# 5. Print the third and fourth elements of the series
print(my_series[2:4])

# 6. Use indexing to print the last three elements of the series
print(my_series[-3:])

0    10
1    20
2    30
3    40
4    50
dtype: int64
10
50
2    30
3    40
dtype: int64
2    30
3    40
4    50
dtype: int64


## The Pandas DataFrame Object

In [15]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [16]:
states = pd.DataFrame({'population': population,
                       'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [17]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [18]:
states.columns

Index(['population', 'area'], dtype='object')

In [19]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### Initialization from numpy array

In [20]:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'])

Unnamed: 0,foo,bar
0,0.577818,0.064481
1,0.545095,0.769715
2,0.548875,0.644295


In [21]:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.575839,0.025289
b,0.74493,0.950822
c,0.381337,0.473722


### Initialization from dictionary

In [22]:
# Create a dictionary with data
data = {'Col1': np.array([1, 2, 3, 4]),
        'Col2': [5, 6, 7, 8],
        'Col3': ['A', 'B', 'C', 'D']}

# Create a dataframe from the dictionary
df = pd.DataFrame(data)

# Print the dataframe
print(df)

   Col1  Col2 Col3
0     1     5    A
1     2     6    B
2     3     7    C
3     4     8    D


## Exercises

Exercise 1: Creating a Pandas DataFrame

Create a Pandas DataFrame from scratch with the following data:

|Country|	Population|	GDP per capita|
|:---------|:---------|:------------|
|USA	|328,200,000|	62,606|
|China	|1,397,710,000|	10,262|
|Japan	|126,150,000|	40,162|

In [1]:
import pandas as pd

data = {'Country': ['USA', 'China', 'Japan'],
        'Population': [328200000, 1397710000, 126150000],
        'GDP per capita': [62606, 10262, 40162]}

df = pd.DataFrame(data)
print(df)

  Country  Population  GDP per capita
0     USA   328200000           62606
1   China  1397710000           10262
2   Japan   126150000           40162


## Data Indexing and Selection

In [32]:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [34]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [35]:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [36]:
data.area is data['area']

True

In [37]:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


### Data as a multidimensional array

In [38]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [39]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [40]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

Using the **iloc** indexer, we can index the underlying array as if it is a simple NumPy array (using the implicit Python-style index), but the DataFrame index and column labels are maintained in the result:

In [41]:
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


Similarly, using the **loc** indexer we can index the underlying data in an array-like style but using the explicit index and column names:

In [42]:
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [44]:
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [45]:
data.iloc[0, 2] = 90
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [47]:
data[data.density > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [49]:
data['Texas':'Florida']

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


## Problem

1. Create a Pandas dataframe with the following data:

|Name|	Age|Gender|
|:--------|:---------|:-------|
|John|	25|	Male|
|Sarah|	30|	Female|
|Michael|	40|	Male|
|Elizabeth|	35|	Female|

2. Use indexing to print the age of Sarah.
3. Use indexing to print the gender of Michael.
4. Use indexing to change the age of Elizabeth to 36.
5. Use indexing to print the name and age of the first three people in the dataframe.
6. Use indexing to select only people with age below 40.

In [25]:
import pandas as pd

data = {'Name': ['John', 'Sarah', 'Michael', 'Elizabeth'],
        'Age': [25, 30, 40, 35],
        'Gender': ['Male', 'Female', 'Male', 'Female']}

df = pd.DataFrame(data)


In [26]:
print(df.loc[df['Name'] == 'Sarah', 'Age'])

1    30
Name: Age, dtype: int64


In [27]:
print(df.loc[df['Name'] == 'Michael', 'Gender'])


2    Male
Name: Gender, dtype: object


In [28]:
df.loc[df['Name'] == 'Elizabeth', 'Age'] = 36
print(df)


        Name  Age  Gender
0       John   25    Male
1      Sarah   30  Female
2    Michael   40    Male
3  Elizabeth   36  Female


In [29]:
print(df.loc[0:2, ['Name', 'Age']])


      Name  Age
0     John   25
1    Sarah   30
2  Michael   40


## Exercises

Problem 2:

1. Suppose you have a DataFrame, df, containing information about different countries, including their names, populations, and GDPs. Write a Python code to perform the following tasks:

    - Create a new column called 'GDP per capita' that represents the GDP per capita for each country.
    - Find the country with the highest GDP per capita.
    - Find the average population for countries with a GDP per capita greater than 50000.
    - Create a new DataFrame called top_countries that contains the names and populations of the top 3 countries with the highest GDP per capita.


In [3]:
df = pd.DataFrame({
    'country': ['USA', 'China', 'Japan', 'Germany', 'India'],
    'population': [328200000, 1393000000, 126500000, 83020000, 1353000000],
    'GDP': [21300000, 14200000, 5085000, 4040000, 2999000]
})

df

Unnamed: 0,country,population,GDP
0,USA,328200000,21300000
1,China,1393000000,14200000
2,Japan,126500000,5085000
3,Germany,83020000,4040000
4,India,1353000000,2999000


In [4]:
# create a new column called 'GDP per capita'
df['GDP per capita'] = df['GDP'] / df['population']

# find the country with the highest GDP per capita
max_gdp_per_capita_country = df.loc[df['GDP per capita'].idxmax(), 'country']

# find the average population for countries with a GDP per capita greater than 50000
avg_population = df.loc[df['GDP per capita'] > 50000, 'population'].mean()

# create a new DataFrame called `top_countries` that contains the names and populations of the top 3 countries with the highest GDP per capita
top_countries = df[['country', 'population']].sort_values(by='population', ascending=False).head(3)

# print the resulting DataFrame and variables
print(df)
print(max_gdp_per_capita_country)
print(avg_population)
print(top_countries)

   country  population       GDP  GDP per capita
0      USA   328200000  21300000        0.064899
1    China  1393000000  14200000        0.010194
2    Japan   126500000   5085000        0.040198
3  Germany    83020000   4040000        0.048663
4    India  1353000000   2999000        0.002217
USA
nan
  country  population
1   China  1393000000
4   India  1353000000
0     USA   328200000
