## 6.1: Constructing dataframes

In [1]:
# Load the pandas and numpy packages
import numpy as np
import pandas as pd

In [4]:
# Build a dataframe with the pandas DataFrame() constructor:
# data holds one inner list per row, index labels the rows,
# and columns labels the columns
dataframe = pd.DataFrame(
    data=[
        ['abc', 3.3, 28, True],
        ['xyz', -0.55, 0, False],
    ],
    index=[0, 1],
    columns=['Label1', 'Label2', 'Label3', 'Label4'],
)

In [6]:
# Display the dataframe (a bare expression on the last line of a cell
# is rendered as the cell's output)
dataframe

Unnamed: 0,Label1,Label2,Label3,Label4
0,abc,3.3,28,True
1,xyz,-0.55,0,False


In [8]:
# Display the dataframe's shape as a (rows, columns) tuple
dataframe.shape

(2, 4)

In [10]:
# Create a 3-dimensional array from nested lists:
# 2 blocks, each holding 2 rows of 3 values
array = np.array(
    [
        [[1, 2, 3], [4, 5, 6]],
        [[11, 12, 13], [14, 15, 16]],
    ]
)
array

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[11, 12, 13],
        [14, 15, 16]]])

In [12]:
# Display the array's shape: one entry per dimension, outermost first
array.shape

(2, 2, 3)

## 6.2: Subsetting data.

In [15]:
# Load the country_subset.csv data into a dataframe
country = pd.read_csv('country_subset.csv')

In [17]:
# Display the country dataframe (long dataframes are shown truncated,
# with ... marking the omitted rows)
country

Unnamed: 0,Name,Continent,Population
0,Afghanistan,Asia,22720000
1,Albania,Europe,3401200
2,Algeria,Africa,31471000
3,American Samoa,Oceania,68000
4,Andorra,Europe,78000
...,...,...,...
234,Western Sahara,Africa,293000
235,Yemen,Asia,18112000
236,Yugoslavia,Europe,10640000
237,Zambia,Africa,9169000


In [19]:
# Select the 'Name' column; a single pair of brackets returns a Series
country['Name']

0         Afghanistan
1             Albania
2             Algeria
3      American Samoa
4             Andorra
            ...      
234    Western Sahara
235             Yemen
236        Yugoslavia
237            Zambia
238          Zimbabwe
Name: Name, Length: 239, dtype: object

In [21]:
# Put an extra pair of brackets around the column label (i.e. pass a list
# of labels) to return a dataframe instead of a Series
country[['Name']]

Unnamed: 0,Name
0,Afghanistan
1,Albania
2,Algeria
3,American Samoa
4,Andorra
...,...
234,Western Sahara
235,Yemen
236,Yugoslavia
237,Zambia


In [23]:
# Check the type returned by single brackets: a pandas Series
type(country['Name'])

pandas.core.series.Series

In [25]:
# Check the type returned by double brackets: a pandas DataFrame
type(country[['Name']])

pandas.core.frame.DataFrame

In [27]:
# Select the 'Name' and 'Continent' columns by passing a list of column labels
country[['Name', 'Continent']]

Unnamed: 0,Name,Continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,American Samoa,Oceania
4,Andorra,Europe
...,...,...
234,Western Sahara,Africa
235,Yemen,Asia
236,Yugoslavia,Europe
237,Zambia,Africa


In [29]:
# Select the element in row 0 and column 1
# (iloc uses integer positions, counted from 0)
country.iloc[0, 1]

'Asia'

In [31]:
# Select rows 0 and 1 of column 1
# (the slice 0:2 stops before position 2)
country.iloc[0:2, 1]

0      Asia
1    Europe
Name: Continent, dtype: object

In [33]:
# Select all rows before row 7 (rows 0 thru 6) and columns 1 thru 2
# (the column slice 1:3 stops before position 3)
country.iloc[:7, 1:3]

Unnamed: 0,Continent,Population
0,Asia,22720000
1,Europe,3401200
2,Africa,31471000
3,Oceania,68000
4,Europe,78000
5,Africa,12878000
6,North America,8000


In [35]:
# Select rows 10 thru 20 and all columns from column 1 onwards
# (the row slice stops before position 21; the open-ended column slice
# runs to the last column)
country.iloc[10:21, 1:]

Unnamed: 0,Continent,Population
10,Asia,3520000
11,North America,103000
12,Oceania,18886000
13,Europe,8091800
14,Asia,7734000
15,North America,307000
16,Asia,617000
17,Asia,129155000
18,North America,270000
19,Europe,10236000


In [37]:
# Select rows 10 thru 20 and the Continent and Population columns
# (loc selects by label, and a loc slice includes its end label, so
# 10:20 here covers the same rows as 10:21 does with iloc)
country.loc[10:20, ['Continent', 'Population']]

Unnamed: 0,Continent,Population
10,Asia,3520000
11,North America,103000
12,Oceania,18886000
13,Europe,8091800
14,Asia,7734000
15,North America,307000
16,Asia,617000
17,Asia,129155000
18,North America,270000
19,Europe,10236000


## 6.3: Subsetting data using relational operators

In [42]:
# Load the country.csv data
# (this rebinds country, replacing the dataframe loaded from country_subset.csv)
country = pd.read_csv('country.csv')

In [44]:
# Display the country dataframe to see the columns available for subsetting
country

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percentage
0,China,Asia,9572900,1277558000,133,1949-10-01,Mandarin,0.81
1,Bangladesh,Asia,143998,129155000,897,1905-05-24,Bengali,0.98
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese,0.98
3,India,Asia,3287263,1013662000,308,1905-04-30,Hindi,0.4
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian,0.97
5,United States,North America,9363520,278357000,30,1776-07-04,English,0.86


In [46]:
# Select rows where the Continent is South America
# (the == comparison is evaluated for every row; only matching rows are kept)
country[country['Continent'] == 'South America']

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percentage
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese,0.98


In [48]:
# Select rows where the Continent is not Asia
# (the != comparison keeps every row whose Continent differs from 'Asia')
country[country['Continent'] != 'Asia']

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percentage
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese,0.98
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian,0.97
5,United States,North America,9363520,278357000,30,1776-07-04,English,0.86


In [50]:
# Select rows where the Continent is Asia or Europe
# (| combines the two conditions row by row; each condition must be
# wrapped in its own parentheses)
country[(country['Continent'] == 'Asia') | (country['Continent'] == 'Europe')]

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percentage
0,China,Asia,9572900,1277558000,133,1949-10-01,Mandarin,0.81
1,Bangladesh,Asia,143998,129155000,897,1905-05-24,Bengali,0.98
3,India,Asia,3287263,1013662000,308,1905-04-30,Hindi,0.4
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian,0.97


In [52]:
# Select rows where the Continent is not Asia
# (~ negates the condition, giving the same rows as the != comparison)
country[~(country['Continent'] == 'Asia')]

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percentage
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese,0.98
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian,0.97
5,United States,North America,9363520,278357000,30,1776-07-04,English,0.86


### 6.1: Challenge Activity: Subsetting dataframes using pandas

## 1:
**This dataset contains information on taxi journeys during March 2019 in New York City. The data includes passengers, distance, fare, tip, tolls, and total.**

* **Create a dataframe subset containing the total column of the taxicabsNY dataframe.**
    
**The code contains all imports, loads the dataset, and prints the total column.**

In [None]:
# Loads necessary packages
import pandas as pd

# Loads the taxi.csv dataset
taxicabsNY = pd.read_csv('taxi.csv')

# Subset a column of the taxicabsNY dataframe by specifying the column name
# (double brackets return a dataframe; single brackets would return a Series)
total = taxicabsNY[['total']]

# Prints the total column
print(total)

## 2:

**This dataset contains information on taxi journeys during March 2019 in New York City. The data includes pickup, dropoff, passengers, distance, fare, tip, tolls, total, color, payment, pickup_zone, dropoff_zone, pickup_borough, and dropoff_borough.**

* **Using the iloc indexer, display the first 12 rows and the first 13 columns of the cabsNY dataframe.**

**The code contains all imports, loads the dataset, and prints the subset.**

In [None]:
# Loads necessary packages
import pandas as pd

# Loads the taxi.csv dataset
cabsNY = pd.read_csv('taxi.csv')

# Subset cabsNY using the iloc method
# (rows come first, then columns; :12 selects positions 0 thru 11 and
# :13 selects positions 0 thru 12)
df = cabsNY.iloc[:12, :13]

# Prints the subset
print(df)

## 3:

**This dataset contains information on taxi journeys during March 2019 in New York City. The data includes passengers, distance, fare, tip, tolls, and total.**

* **Using the loc indexer, display the total column for all rows from row 5 onward (row 5 included) of the cabsNY dataframe.**
    
**The code contains all imports, loads the dataset, and prints the column.**

In [None]:


# Loads necessary packages
import pandas as pd

# Loads the taxi.csv dataset
cabsNY = pd.read_csv('taxi.csv')

# Subset cabsNY by specifying columns and rows using the loc method
# (5: selects every row from label 5 onward; the list of labels keeps
# the result a dataframe)
df = cabsNY.loc[5:, ['total']]

# Prints the column
print(df)

## 4:

**This dataset contains information on taxi journeys during March 2019 in New York City. The data includes passengers, distance, fare, tip, tolls, and total.**

* **Display all rows of the taxicabsNY dataframe where the tolls column is >= 9.**

**The code contains all imports, loads the dataset, and prints the matching rows.**

In [None]:


# Loads necessary packages
import pandas as pd

# Loads the taxi.csv dataset
taxicabsNY = pd.read_csv('taxi.csv')

# Subset taxicabsNY using comparison operators
# (keeps every row whose tolls value is greater than or equal to 9)
df = taxicabsNY[taxicabsNY['tolls'] >= 9]

# Prints the matching rows
print(df)

## 5:

**This dataset contains information on taxi journeys during March 2019 in New York City. The data includes pickup, dropoff, pickup_zone, dropoff_zone, pickup_borough, and dropoff_borough.**

* **Display all rows of the taxisNY dataframe where dropoff_borough is Queens and pickup_zone is either Prospect-Lefferts Gardens or Sunnyside.**

**The code contains all imports, loads the dataset, and prints the matching rows.**

In [None]:
# Loads necessary packages
import pandas as pd

# Loads the taxi.csv dataset
taxisNY = pd.read_csv('taxi.csv')

# Subset taxisNY using comparison and logical operators
# (& requires both sides to hold, | requires at least one; each comparison
# is wrapped in parentheses, and the two pickup_zone alternatives are
# grouped so that the Queens condition applies to both of them)
df = taxisNY[
    (taxisNY['dropoff_borough'] == 'Queens')
    & (
        (taxisNY['pickup_zone'] == 'Prospect-Lefferts Gardens')
        | (taxisNY['pickup_zone'] == 'Sunnyside')
    )
]

# Prints the matching rows
print(df)

## 6.4: Dataframe methods in pandas.

In [3]:
# Load the country.csv data into the country dataframe
country = pd.read_csv('country.csv')

In [5]:
# Display the country dataframe before applying any dataframe methods
country

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percentage
0,China,Asia,9572900,1277558000,133,1949-10-01,Mandarin,0.81
1,Bangladesh,Asia,143998,129155000,897,1905-05-24,Bengali,0.98
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese,0.98
3,India,Asia,3287263,1013662000,308,1905-04-30,Hindi,0.4
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian,0.97
5,United States,North America,9363520,278357000,30,1776-07-04,English,0.86


In [60]:
# Return the value in row 0 of the Continent column
# (at takes a row label and a column label and returns a single value)
country.at[0, 'Continent']

'Asia'

In [62]:
# Use drop() to remove the Percentage column (axis=1 means the label
# refers to a column rather than a row)
# drop() returns a new dataframe; the result is not assigned here, so
# country itself still contains the Percentage column
country.drop(labels='Percentage', axis=1)

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage
0,China,Asia,9572900,1277558000,133,1949-10-01,Mandarin
1,Bangladesh,Asia,143998,129155000,897,1905-05-24,Bengali
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese
3,India,Asia,3287263,1013662000,308,1905-04-30,Hindi
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian
5,United States,North America,9363520,278357000,30,1776-07-04,English


In [9]:
# Show the built-in documentation for the drop() method
help(country.drop)

Help on method drop in module pandas.core.frame:

drop(labels: 'IndexLabel | None' = None, *, axis: 'Axis' = 0, index: 'IndexLabel | None' = None, columns: 'IndexLabel | None' = None, level: 'Level | None' = None, inplace: 'bool' = False, errors: 'IgnoreRaise' = 'raise') -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Drop specified labels from rows or columns.

    Remove rows or columns by specifying label names and corresponding
    axis, or by directly specifying index or column names. When using a
    multi-index, labels on different levels can be removed by specifying
    the level. See the :ref:`user guide <advanced.shown_levels>`
    for more information about the now unused levels.

    Parameters
    ----------
    labels : single label or list-like
        Index or column labels to drop. A tuple will be used as a single
        label and not treated as a list-like.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Whether to drop labels fr

In [64]:
# Use insert() to add a new column at position 7 (just before Percentage)
percent = [81, 98, 98, 40, 97, 86]
# insert() modifies country in place and raises a ValueError when the
# column already exists, so only insert if it is missing; this keeps the
# cell safe to run more than once
if 'Percent' not in country.columns:
    country.insert(loc=7, column='Percent', value=percent)
country

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percent,Percentage
0,China,Asia,9572900,1277558000,133,1949-10-01,Mandarin,81,0.81
1,Bangladesh,Asia,143998,129155000,897,1905-05-24,Bengali,98,0.98
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese,98,0.98
3,India,Asia,3287263,1013662000,308,1905-04-30,Hindi,40,0.4
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian,97,0.97
5,United States,North America,9363520,278357000,30,1776-07-04,English,86,0.86


In [66]:
# Use replace to change the country label
# (every cell equal to 'United States' is replaced with 'USA')
# replace() returns a new dataframe; the result is not assigned here, so
# country itself still contains 'United States'
country.replace(to_replace='United States', value='USA')

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percent,Percentage
0,China,Asia,9572900,1277558000,133,1949-10-01,Mandarin,81,0.81
1,Bangladesh,Asia,143998,129155000,897,1905-05-24,Bengali,98,0.98
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese,98,0.98
3,India,Asia,3287263,1013662000,308,1905-04-30,Hindi,40,0.4
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian,97,0.97
5,USA,North America,9363520,278357000,30,1776-07-04,English,86,0.86


In [68]:
# Use sort_values to arrange the rows by density, lowest first
# (each row keeps its original index label)
country.sort_values(by='Density')

Unnamed: 0,Country,Continent,SurfaceArea,Population,Density,IndependenceDate,OfficialLanguage,Percent,Percentage
4,Norway,Europe,385207,5379000,14,1905-05-17,Norwegian,97,0.97
2,Brazil,South America,8547403,170115000,20,1822-09-07,Portuguese,98,0.98
5,United States,North America,9363520,278357000,30,1776-07-04,English,86,0.86
0,China,Asia,9572900,1277558000,133,1949-10-01,Mandarin,81,0.81
3,India,Asia,3287263,1013662000,308,1905-04-30,Hindi,40,0.4
1,Bangladesh,Asia,143998,129155000,897,1905-05-24,Bengali,98,0.98
