# BASICS OF PANDA

In [74]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("data/gapminder.tsv", sep = '\t')

In [4]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [9]:
df.tail()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [19]:
df.index

RangeIndex(start=0, stop=1704, step=1)

In [20]:
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [22]:
df.values

array([['Afghanistan', 'Asia', 1952, 28.801, 8425333, 779.4453145],
       ['Afghanistan', 'Asia', 1957, 30.331999999999997, 9240934,
        820.8530296],
       ['Afghanistan', 'Asia', 1962, 31.997, 10267083, 853.1007099999999],
       ...,
       ['Zimbabwe', 'Africa', 1997, 46.809, 11404948, 792.4499602999999],
       ['Zimbabwe', 'Africa', 2002, 39.989000000000004, 11926563,
        672.0386227000001],
       ['Zimbabwe', 'Africa', 2007, 43.486999999999995, 12311143,
        469.70929810000007]], dtype=object)

In [27]:
type(df)

pandas.core.frame.DataFrame

In [28]:
df.shape

(1704, 6)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [32]:
df.country.notnull().count()

1704

In [36]:
df['country'].isna()

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
1674    False
1675    False
1676    False
1677    False
1678    False
1679    False
1680    False
1681    False
1682    False
1683    False
1684    False
1685    False
1686    False
1687    False
1688    False
1689    False
1690    False
1691    False
1692    False
1693    False
1694    False
1695    False
1696    False
1697    False
1698    False
1699    False
1700    False
1701    False
1702    False
1703    False
Name: country, Length: 1704, dtype: bool

In [37]:
country_df = df['country']
country_df.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [44]:
countries = country_df.unique()

In [45]:
type(countries)

numpy.ndarray

In [51]:
np.count_nonzero(countries)

142

In [52]:
subset = df[['country','continent','year']]

In [54]:
subset.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [55]:
pandas.__version__

'0.23.4'

### EXTRACTING ROWS 

#### There are 3 methods to do this. 

* loc gets rows (or columns) with particular labels from the index.
* iloc gets rows (or columns) at particular positions in the index (so it only takes integers).
* ix usually tries to behave like loc but falls back to behaving like iloc if a label is not present in the index.(Not adivsable to use)

In [62]:
df.loc[0]

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

In [57]:
df.loc[[1,2]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071


In [58]:
df.iloc[2]

country      Afghanistan
continent           Asia
year                1962
lifeExp           31.997
pop             10267083
gdpPercap        853.101
Name: 2, dtype: object

In [65]:
df.iloc[1:5]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [63]:
df.ix[1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap        820.853
Name: 1, dtype: object

In [66]:
subset = df.loc[:,['country','pop']]

In [69]:
subset.head()

Unnamed: 0,country,pop
0,Afghanistan,8425333
1,Afghanistan,9240934
2,Afghanistan,10267083
3,Afghanistan,11537966
4,Afghanistan,13079460


In [70]:
df.loc[df.year == 1967, ['country','year','pop']]

Unnamed: 0,country,year,pop
3,Afghanistan,1967,11537966
15,Albania,1967,1984060
27,Algeria,1967,12760499
39,Angola,1967,5247469
51,Argentina,1967,22934225
63,Australia,1967,11872264
75,Austria,1967,7376998
87,Bahrain,1967,202182
99,Bangladesh,1967,62821884
111,Belgium,1967,9556500


In [73]:
df.loc[(df['year'] == 1967) & (df['pop'] > 1_000_000),['country','year','pop']]

Unnamed: 0,country,year,pop
3,Afghanistan,1967,11537966
15,Albania,1967,1984060
27,Algeria,1967,12760499
39,Angola,1967,5247469
51,Argentina,1967,22934225
63,Australia,1967,11872264
75,Austria,1967,7376998
99,Bangladesh,1967,62821884
111,Belgium,1967,9556500
123,Benin,1967,2427334
