# 1 Creating, reading & writing

In [1]:
import pandas as pd

## DataFrame

In [2]:
# Build a DataFrame from a dict: each key becomes a column name, each list holds that column's values.
pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]})

Unnamed: 0,Yes,No
0,50,131
1,21,2


In [3]:
# Cell values are not limited to numbers: here every entry is a string.
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'], 
              'Sue': ['Pretty good.', 'Bland.']})

Unnamed: 0,Bob,Sue
0,I liked it.,Pretty good.
1,It was awful.,Bland.


In [4]:
# `index=` supplies the row labels, replacing the default 0, 1, ... labels.
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'], 
              'Sue': ['Pretty good.', 'Bland.']},
             index=['Product A', 'Product B'])

Unnamed: 0,Bob,Sue
Product A,I liked it.,Pretty good.
Product B,It was awful.,Bland.


## Series

In [5]:
# A Series built from a plain list; the values get the default integer labels 0..4.
pd.Series([1, 2, 3, 4, 5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
# `index=` supplies the row labels and `name=` the name shown at the bottom of the output.
pd.Series([30, 35, 40], 
           index=['2015 Sales', '2016 Sales', '2017 Sales'],
           name='Product A')

2015 Sales    30
2016 Sales    35
2017 Sales    40
Name: Product A, dtype: int64

In [7]:
# Read the wine-review CSV; its first column becomes the row index.
# NOTE(review): this is an absolute path on one machine — adjust it to wherever the CSV lives.
csv_path = "/home/gluecksman/src/Data_and_ML/pandas/winemag-data-130k-v2.csv"
reviews = pd.read_csv(csv_path, index_col=0)

# Write the frame back out as CSV (relative path, so it lands in the current working directory).
reviews.to_csv("just_a_fancy_test_name.csv")

# 2 Indexing, selecting & assigning

In [22]:
# Reminder of the frame's layout: display the first two rows.
# NOTE(review): the saved output includes `critic` and `index_backwards`, which are only assigned
# in a later cell — on a fresh top-to-bottom run these two columns would not appear here yet.
reviews.head(2)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,critic,index_backwards
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,everyone,129971
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,everyone,129970


In [8]:
# A column can be read as an attribute ...
reviews.country
# ... or with dictionary-style brackets; both forms refer to the same column
reviews['country']


# a selected column can be indexed further:
reviews['country'][0] # the value stored under label 0 (the first row here)
reviews['country'][0:10] # the first ten rows (0 through 9); only this last expression is displayed

0       Italy
1    Portugal
2          US
3          US
4          US
5       Spain
6       Italy
7      France
8     Germany
9      France
Name: country, dtype: object

### index-based selection
With `iloc`, the first element of a range is included and the last one is excluded.

In [9]:
# index-based selection: iloc addresses rows and columns by integer position
reviews.iloc[0] # the whole first row (one value per column)

# to get a column with iloc
reviews.iloc[:, 0] # every row of the column at position 0

# the two lines below select the same thing
reviews.iloc[:3, 0] # rows at positions 0, 1 and 2 of column 0 (the end of the range is excluded)
reviews.iloc[[0, 1, 2], 0] # the same rows, given as an explicit list of positions

reviews.iloc[1:3, 0] # ONLY the 2nd and 3rd entries of column 0

reviews.iloc[-5:] # the last five rows, with all columns

sample_reviews = reviews.iloc[[1, 2, 3, 5, 8], :] # rows at positions 1, 2, 3, 5 and 8, with all columns

### Label-based selection
With `loc`, ranges are inclusive: the last label of a range is part of the selection.

In [64]:
# label-based selection
reviews.loc[:, ['taster_name', 'taster_twitter_handle', 'points']] # all rows, restricted to the three listed columns

# Manipulating the index
# set_index returns a new DataFrame indexed by 'title'; the result is not assigned here, so `reviews` keeps its index
reviews.set_index("title")

# Conditional selection
reviews.country == 'Italy' # a Series of True/False values, one per row, telling whether the country is 'Italy'

reviews.loc[reviews.country == 'Italy'] # rows where column 'country' == 'Italy'
reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)] # AND: country is 'Italy' and 'points' >= 90
reviews.loc[(reviews.country == 'Italy') | (reviews.points >= 90)] # OR: country is 'Italy' or 'points' >= 90

# isin lets you select the data whose value "is in" a list of values
reviews.loc[reviews.country.isin(['Italy', 'France'])]

# isnull/notnull
reviews.loc[reviews.price.notnull()] # rows that are non-null/non-NaN in the 'price' column
reviews.loc[reviews.price.isnull()] # rows that are null/NaN in the 'price' column

# assigning data to a df (these two lines DO modify `reviews`: each adds a column)
reviews['critic'] = 'everyone' # a single value is repeated on every row
reviews['index_backwards'] = range(len(reviews), 0, -1) # counts down from len(reviews) to 1; the step of -1 is required because start > stop

# 3 Summary functions & Maps

In [43]:
# Reminder of the frame's layout: display the first two rows.
reviews.head(2)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,critic,index_backwards
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,everyone,129971
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,everyone,129970


In [19]:
# description (of a column): summary statistics
reviews.taster_name.describe() # computed but not displayed: only the last expression of a cell is shown
reviews.points.describe() # for the numeric 'points' column: count, mean, std, min, quartiles and max

count    129971.000000
mean         88.447138
std           3.039730
min          80.000000
25%          86.000000
50%          88.000000
75%          91.000000
max         100.000000
Name: points, dtype: float64

In [44]:
# mean & median (of a column)
reviews.points.mean() # computed but not displayed: only the last expression of a cell is shown
reviews.points.median() # this is the value displayed below

88.0

In [18]:
# list unique values (of a column); the result is an array and, as the output shows, NaN counts as a value
reviews.taster_name.unique()

array(['Kerin O’Keefe', 'Roger Voss', 'Paul Gregutt',
       'Alexander Peartree', 'Michael Schachner', 'Anna Lee C. Iijima',
       'Virginie Boone', 'Matt Kettmann', nan, 'Sean P. Sullivan',
       'Jim Gordon', 'Joe Czerwinski', 'Anne Krebiehl\xa0MW',
       'Lauren Buzzeo', 'Mike DeSimone', 'Jeff Jenssen',
       'Susan Kostrzewa', 'Carrie Dykes', 'Fiona Adams',
       'Christina Pickard'], dtype=object)

In [21]:
# each distinct value of a column with its number of occurrences, most frequent first
# (the NaN entry reported by unique() does not appear in this listing)
reviews.taster_name.value_counts()

Roger Voss            25514
Michael Schachner     15134
Kerin O’Keefe         10776
Virginie Boone         9537
Paul Gregutt           9532
Matt Kettmann          6332
Joe Czerwinski         5147
Sean P. Sullivan       4966
Anna Lee C. Iijima     4415
Jim Gordon             4177
Anne Krebiehl MW       3685
Lauren Buzzeo          1835
Susan Kostrzewa        1085
Mike DeSimone           514
Jeff Jenssen            491
Alexander Peartree      415
Carrie Dykes            139
Fiona Adams              27
Christina Pickard         6
Name: taster_name, dtype: int64

### Maps
All of the standard Python operators (>, <, ==, and so on) are faster than map() or apply() because they use speed-ups built into pandas. However, they are not as flexible as map() or apply(), which can do more advanced things, like applying conditional logic.


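In the examples below, map() and apply() perform the same transformation (centring `points` on its mean): map() returns a new Series, while apply() returns a new DataFrame.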

In [25]:
# Reminder of the frame's layout: display the first two rows.
reviews.head(2)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,critic,index_backwards
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,everyone,129971
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,everyone,129970


In [26]:
# map: apply a function to every value of a Series
review_points_mean = reviews.points.mean() # kept as a module-level name; later cells read it too
reviews.points.map(lambda p: p - review_points_mean) # returns a new transformed Series. Does not modify the original data it is called on

0        -1.447138
1        -1.447138
2        -1.447138
3        -1.447138
4        -1.447138
            ...   
129966    1.552862
129967    1.552862
129968    1.552862
129969    1.552862
129970    1.552862
Name: points, Length: 129971, dtype: float64

In [80]:
# apply (returns a new transformed DataFrame)
# (axis='columns' calls the function once per row, so this takes time on a big DataFrame)
def remean_points(row):
    """Return a copy of `row` whose `points` value is centred on the dataset mean.

    Meant to be passed to `reviews.apply(..., axis='columns')`, which hands in one row at a time.
    Reads the module-level `review_points_mean`, which must already be defined.
    """
    # Work on a copy: functions that mutate the object handed to apply() are not supported by pandas.
    row = row.copy()
    row['points'] = row['points'] - review_points_mean
    return row

# Run the slow row-wise transform once and keep the result; `reviews` itself is not modified.
remean_points_df = reviews.apply(remean_points, axis='columns')
remean_points_df.head(2) # test

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,critic,index_backwards
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,-1.447138,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,everyone,129971
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,-1.447138,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,everyone,129970


In [31]:
# check that the original dataframe was not modified by either the map or the apply method:
reviews.head(2)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,critic,index_backwards
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,everyone,129971
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,everyone,129970


In [36]:
# an easier way to get the same centred points: subtract the mean from the whole column at once
review_points_mean = reviews.points.mean()
reviews.points - review_points_mean # the scalar is subtracted from every value of the column

0        -1.447138
1        -1.447138
2        -1.447138
3        -1.447138
4        -1.447138
            ...   
129966    1.552862
129967    1.552862
129968    1.552862
129969    1.552862
129970    1.552862
Name: points, Length: 129971, dtype: float64

In [41]:
# operators (do not modify the original dataframe when used on their own)
# the two columns are joined row by row; a row whose region_1 is missing comes out as NaN in the output
reviews.country + " - " + reviews.region_1

0                     Italy - Etna
1                              NaN
2           US - Willamette Valley
3         US - Lake Michigan Shore
4           US - Willamette Valley
                    ...           
129966                         NaN
129967                 US - Oregon
129968             France - Alsace
129969             France - Alsace
129970             France - Alsace
Length: 129971, dtype: object

In [42]:
# check that the original dataframe was not modified by the operators:
reviews.head(2)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,critic,index_backwards
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,everyone,129971
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,everyone,129970


##### Exercises from Kaggle

In [79]:
# 1/ "Best bargain": store in `bargain_wine` the title of the wine with the highest points-to-price ratio.
points_per_price = reviews.points / reviews.price
bargain_idx = points_per_price.idxmax() # row label at which the ratio is largest
bargain_wine = reviews.loc[bargain_idx, 'title']
bargain_wine # test

'Bandit NV Merlot (California)'

In [82]:
# 2/ "tropical" or "fruity"? Build a Series `descriptor_counts` holding, for each of the two words,
# the number of descriptions in which it occurs.
nb_trop = reviews.description.map(lambda text: "tropical" in text).sum()
nb_fruit = reviews.description.map(lambda text: "fruity" in text).sum()

# Dict keys become the index labels, in insertion order.
descriptor_counts = pd.Series({'tropical': nb_trop, 'fruity': nb_fruit})
descriptor_counts # test

tropical    3607
fruity      9090
dtype: int64

In [92]:
# 3/ A score of 95 or higher counts as 3 stars, a score of at least 85 but less than 95 is 2 stars. Any other score is 1 star. But any wines from Canada should automatically get 3 stars, regardless of points (mafia).

def star_attibuter(row):
    """Return the star rating (1, 2 or 3) for one review row.

    Rules, checked in this order:
      * any wine from Canada -> 3 stars, whatever its points
      * points >= 95         -> 3 stars
      * 85 <= points < 95    -> 2 stars
      * anything else        -> 1 star

    `row` must give access to 'country' and 'points' by key, as the rows passed by
    `DataFrame.apply(..., axis='columns')` do.
    """
    if row['country'] == "Canada":
        return 3
    points = row['points']
    if points >= 95:
        return 3
    # Chained comparison on purpose. The previous `points >= 85 & points < 95` was parsed as
    # `points >= (85 & points) < 95` because `&` binds tighter than comparison operators,
    # which gave 2 stars to scores below 85 (80 and 84, for example).
    if 85 <= points < 95:
        return 2
    return 1

star_ratings = reviews.apply(star_attibuter, axis='columns') # calls star_attibuter once per row; the result is a Series
star_ratings # test

0         2
1         2
2         2
3         2
4         2
         ..
129966    2
129967    2
129968    2
129969    2
129970    2
Length: 129971, dtype: int64

# 4 Grouping and Sorting

0         2
1         2
2         2
3         2
4         2
         ..
129966    2
129967    2
129968    2
129969    2
129970    2
Length: 129971, dtype: int64