Creating data

In [2]:
import pandas as pd
# pandas is built around two core objects: the DataFrame and the Series.
# A DataFrame is a table: each dictionary key below becomes a column.
pd.DataFrame(
    data={
        "Yes": [50, 21],
        "No": [131, 2],
    }
)

Unnamed: 0,Yes,No
0,50,131
1,21,2


In [3]:
# Entries in a DataFrame do not have to be integers; here every value is a string.
pd.DataFrame(
    data={
        "Bob": ["I liked it.", "It was awful."],
        "Sue": ["Pretty good.", "Bland."],
    }
)

Unnamed: 0,Bob,Sue
0,I liked it.,Pretty good.
1,It was awful.,Bland.


In [6]:
# We use the pd.DataFrame() constructor to generate these DataFrame objects.
# The syntax for declaring a new one is a dictionary whose keys are the column names (Bob and Sue in this example)
# and whose values are lists of entries, one list per column.

In [7]:
# The row labels of a DataFrame are known as its Index.
# Custom labels are supplied through the `index` parameter of the constructor.
pd.DataFrame(
    data={
        "Bob": ["I liked it.", "It was awful."],
        "Sue": ["Pretty good.", "Bland."],
    },
    index=["Product A", "Product B"],
)

Unnamed: 0,Bob,Sue
Product A,I liked it.,Pretty good.
Product B,It was awful.,Bland.


# Where a DataFrame is a table, a Series is a list: a single sequence of data values.
pd.Series(data=[1, 2, 3, 4, 5])
# In essence a Series is one column of a DataFrame: it can carry row labels
# through `index`, and it has a single overall `name` rather than column names.
pd.Series(
    data=[30, 35, 40],
    index=["2015 Sales", "2016 Sales", "2017 Sales"],
    name="Product A",
)

Reading data files

In [12]:
# Load the wine reviews CSV file into a DataFrame.
wine_reviews = pd.read_csv(
    filepath_or_buffer="../input/wine-reviews/winemag-data-130k-v2.csv"
)
# The shape attribute reports how large the resulting DataFrame is.
wine_reviews.shape
# Expected result, as noted by the author: (129971, 14)

FileNotFoundError: [Errno 2] No such file or directory: '../input/wine-reviews/winemag-data-130k-v2.csv'

In [None]:
# Inspect the contents of the resulting DataFrame with head(), which returns
# the first rows (five by default; spelled out here for clarity).
wine_reviews.head(n=5)

In [None]:
# This CSV file has a built-in index column, which pandas does not pick up automatically.
# Passing index_col=0 makes pandas use the first column as the index
# instead of creating a new one from scratch.
wine_reviews = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv", index_col=0)
# The indexing, selection and assignment cells further down refer to this
# DataFrame as `reviews`, a name that was never defined. Bind it here so those
# cells run on a fresh kernel. Both names refer to the same object (no copy),
# so columns assigned through `reviews` are visible through `wine_reviews` too.
reviews = wine_reviews

Indexing in pandas

In [1]:
# pandas has its own accessor operators, loc and iloc.
# iloc performs index-based selection: it selects data based on its numerical
# position in the data. Position 0 is the first row.
reviews.iloc[0]

In [None]:
# A notebook cell renders only its final expression automatically, so the
# earlier results are passed to display() (an IPython built-in) to show them too.

# To get a column with iloc, select all rows (:) and the column position:
display(reviews.iloc[:, 0])
# It's also possible to pass a list of positions (rows 0, 1, 2 of column 0):
display(reviews.iloc[[0, 1, 2], 0])
# Negative numbers can be used in selection; this slice takes the last five rows.
reviews.iloc[-5:]

In [None]:
# Label-based selection with loc: it is the data index value, not its
# position, that matters. Here: the 'country' entry of the row labelled 0.
reviews.loc[0, 'country']

Conditional selection

In [None]:
# The comparison produces a Series of True/False booleans based on the country
# of each record. display() (an IPython built-in) is needed to show it, because
# a cell only renders its final expression automatically.
display(reviews.country == 'Italy')
# That boolean Series can then be used inside loc to select the matching rows:
reviews.loc[reviews.country == 'Italy']

In [None]:
# Two conditions can be combined; each one must be wrapped in parentheses.
# display() (an IPython built-in) shows the first result, since a cell only
# renders its final expression automatically.

# AND condition: the ampersand (&) keeps rows where both conditions hold.
display(reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)])
# OR condition: the pipe (|) keeps rows where at least one condition holds.
reviews.loc[(reviews.country == 'Italy') | (reviews.points >= 90)]

In [None]:
# pandas comes with a few built-in conditional selectors.
# display() (an IPython built-in) shows the first result, since a cell only
# renders its final expression automatically.

# The first is isin, which lets you select data whose value "is in" a list of values.
display(reviews.loc[reviews.country.isin(['Italy', 'France'])])
# The second is isnull (and its companion notnull); here we keep rows whose price is not null.
reviews.loc[reviews.price.notnull()]

Assigning data

In [None]:
# Assigning data to a DataFrame is easy. Note that this modifies `reviews` in place.
# display() (an IPython built-in) shows the first column, since a cell only
# renders its final expression automatically.

# You can assign a constant value, which is used for every row:
reviews['critic'] = 'everyone'
display(reviews['critic'])
# Or an iterable of values, one per row.
# range(start, stop, step): counts down from len(reviews) to 1 (stop is exclusive).
reviews['index_backwards'] = range(len(reviews), 0, -1)
reviews['index_backwards']