In [1]:
import numpy as np
import pandas as pd


### Creating a Pandas Series

In [2]:
arr = np.arange(12)

In [7]:
np.random.shuffle(arr)

In [8]:
s1 = pd.Series(arr)
print(s1)

0      2
1      5
2      1
3      9
4      4
5      6
6      3
7      8
8     11
9     10
10     7
11     0
dtype: int64


In [9]:
# Numeric Series: built from a plain Python list; pandas supplies the default 0..n-1 index.
s = pd.Series(data=[2, 4, 5, 6, 9])
# Show the values first, then the object's type, one per line.
print(s, type(s), sep='\n')

0    2
1    4
2    5
3    6
4    9
dtype: int64
<class 'pandas.core.series.Series'>


In [10]:
# String Series: the elements are text rather than numbers
# (the recorded output below reports dtype object).
char_series = pd.Series(data=["a", "b", "af"])
char_series

0     a
1     b
2    af
dtype: object

In [13]:
# 14 consecutive calendar days starting on 1 January 2018, written in ISO (YYYY-MM-DD) form.
# Despite the variable name, date_range returns a DatetimeIndex rather than a Series.
date_series = pd.date_range(start='2018-01-01', periods=14)
date_series

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10', '2018-01-11', '2018-01-12',
               '2018-01-13', '2018-01-14'],
              dtype='datetime64[ns]', freq='D')

#### Indexing Series

Indexing a Series works much like indexing a 1-D NumPy array: with the default index, numbering starts at 0.

In [17]:
# Fresh array holding 0..11, shuffled in place, then wrapped in a Series for the
# indexing examples that follow.
# NOTE: no random seed is set here, so the order differs from run to run.
arr = np.arange(0, 12)
np.random.shuffle(arr)
s1 = pd.Series(data=arr)
print(s1)

0      2
1      4
2      9
3      5
4      3
5     10
6     11
7      1
8      0
9      7
10     6
11     8
dtype: int64


In [18]:
s1[4]

3

In [19]:
s1[4:8]

4     3
5    10
6    11
7     1
dtype: int64

In [20]:
s1[[2, 4, 6]]

2     9
4     3
6    11
dtype: int64

#### Explicitly specifying indices

You might have noticed that when a series is created, pandas automatically indexes it from 0 to (n-1), where n is the number of rows. If we want, we can also set the index explicitly, using the `index` argument of `pd.Series()`.

In [23]:
pd.Series(arr, index = np.arange(1, 13))

1      2
2      4
3      9
4      5
5      3
6     10
7     11
8      1
9      0
10     7
11     6
12     8
dtype: int64

In [24]:
pd.Series([1, 2, 4], index=['a', 'b', 'c'])

a    1
b    2
c    4
dtype: int64

In [26]:
pd.Series(np.array(['a']*10), index = range(2,12))

2     a
3     a
4     a
5     a
6     a
7     a
8     a
9     a
10    a
11    a
dtype: object

In [27]:
# Two small Series whose labels only partly overlap: 'c' appears only in series1 and
# 'e' only in series2, which makes them useful for demonstrating label alignment.
series1 = pd.Series({'a': 5, 'b': 2, 'c': 3, 'd': 7})
series2 = pd.Series({'a': 1, 'b': 6, 'd': 4, 'e': 9})

In [30]:
series1

a    5
b    2
c    3
d    7
dtype: int64

In [31]:
series2

a    1
b    6
d    4
e    9
dtype: int64

In [None]:
# How series1.add(series2, fill_value=10) lines the two Series up by label.
# A label that exists in only one Series is treated as 10 in the other (marked *).
#
# label  series1  series2
#   a       5        1
#   b       2        6
#   c       3       10*
#   d       7        4
#   e      10*       9

In [114]:
# Label-aligned addition. fill_value=10 stands in for a label that only one of the two
# Series has ('c' and 'e' here), so those rows become 3 + 10 and 10 + 9 instead of NaN.
series1.add(series2, fill_value=10)

a     6.0
b     8.0
c    13.0
d    11.0
e    19.0
dtype: float64

### Dataframe

In [40]:
# DataFrame from a dict: every key becomes a column and its list supplies that column's values.
d = dict(names=['shivani', 'shivangi'], marks=[90, 95])
df = pd.DataFrame(d)

In [43]:
df

Unnamed: 0,names,marks
0,shivani,90
1,shivangi,95


In [49]:
# DataFrame from a 2-D NumPy array: the values 1..9 arranged as 3 rows by 3 named columns.
df2 = pd.DataFrame(data=np.arange(1, 10).reshape(3, 3), columns=list('abc'))

In [50]:
df2.head()

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [87]:
iris = pd.read_csv('iris.csv')

In [58]:
iris.head(10) # first 10 rows of the iris dataframe (head() with no argument shows 5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [60]:
iris.tail(7)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
143,6.8,3.2,5.9,2.3,virginica
144,6.7,3.3,5.7,2.5,virginica
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [62]:
iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [64]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
species              150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [65]:
iris.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'species'],
      dtype='object')

In [80]:
# Rename the label column from 'species' to 'target', rebinding the result rather than
# mutating the frame with inplace=True.
# NOTE(review): several later cells still refer to 'species' - confirm which name is intended.
iris = iris.rename(columns={'species': 'target'})

In [70]:
iris.shape

(150, 5)

In [81]:
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [93]:
# Use sepal length as the row index; the column moves out of the data columns.
# NOTE(review): these float labels repeat (5.0 and 4.6 each occur twice in the first eight
# rows shown further down), and the integer-label .loc cells near the end of the notebook do
# not match this index - confirm intent.
iris = iris.set_index('sepal length (cm)')

In [94]:
iris.head()

Unnamed: 0_level_0,sepal width (cm),petal length (cm),petal width (cm),species
sepal length (cm),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa


Convert to numpy array

In [89]:
# Pull the frame's underlying data out as a NumPy array.
# NOTE(review): the recorded output below still contains the sepal-length values as a data
# column, so this cell appears to have been run before the set_index call above - confirm
# the intended cell order.
npdata = iris.values

In [90]:
npdata

array([[5.1, 3.5, 1.4, 0.2, 'setosa'],
       [4.9, 3.0, 1.4, 0.2, 'setosa'],
       [4.7, 3.2, 1.3, 0.2, 'setosa'],
       [4.6, 3.1, 1.5, 0.2, 'setosa'],
       [5.0, 3.6, 1.4, 0.2, 'setosa'],
       [5.4, 3.9, 1.7, 0.4, 'setosa'],
       [4.6, 3.4, 1.4, 0.3, 'setosa'],
       [5.0, 3.4, 1.5, 0.2, 'setosa'],
       [4.4, 2.9, 1.4, 0.2, 'setosa'],
       [4.9, 3.1, 1.5, 0.1, 'setosa'],
       [5.4, 3.7, 1.5, 0.2, 'setosa'],
       [4.8, 3.4, 1.6, 0.2, 'setosa'],
       [4.8, 3.0, 1.4, 0.1, 'setosa'],
       [4.3, 3.0, 1.1, 0.1, 'setosa'],
       [5.8, 4.0, 1.2, 0.2, 'setosa'],
       [5.7, 4.4, 1.5, 0.4, 'setosa'],
       [5.4, 3.9, 1.3, 0.4, 'setosa'],
       [5.1, 3.5, 1.4, 0.3, 'setosa'],
       [5.7, 3.8, 1.7, 0.3, 'setosa'],
       [5.1, 3.8, 1.5, 0.3, 'setosa'],
       [5.4, 3.4, 1.7, 0.2, 'setosa'],
       [5.1, 3.7, 1.5, 0.4, 'setosa'],
       [4.6, 3.6, 1.0, 0.2, 'setosa'],
       [5.1, 3.3, 1.7, 0.5, 'setosa'],
       [4.8, 3.4, 1.9, 0.2, 'setosa'],
       [5.0, 3.0, 1.6, 0.

In [95]:
world_bank = pd.read_csv('world_ind_pop_data.csv')

In [99]:
world_bank.head()

Unnamed: 0,CountryName,CountryCode,Year,Total Population,Urban population (% of total)
0,Arab World,ARB,1960,92495900.0,31.285384
1,Caribbean small states,CSS,1960,4190810.0,31.59749
2,Central Europe and the Baltics,CEB,1960,91401580.0,44.507921
3,East Asia & Pacific (all income levels),EAS,1960,1042475000.0,22.471132
4,East Asia & Pacific (developing only),EAP,1960,896493000.0,16.917679


In [100]:
# Index the rows by country name; the column moves out of the data columns into the index.
world_bank = world_bank.set_index('CountryName')

In [101]:
world_bank.head()

Unnamed: 0_level_0,CountryCode,Year,Total Population,Urban population (% of total)
CountryName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arab World,ARB,1960,92495900.0,31.285384
Caribbean small states,CSS,1960,4190810.0,31.59749
Central Europe and the Baltics,CEB,1960,91401580.0,44.507921
East Asia & Pacific (all income levels),EAS,1960,1042475000.0,22.471132
East Asia & Pacific (developing only),EAP,1960,896493000.0,16.917679


### Indexing and Selecting Data

### Selecting rows

In [106]:
data = pd.read_csv('winequality-red.csv', sep=';')

In [107]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [108]:
data[2:7]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5


In [111]:
data[4::2].head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
10,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,5
12,5.6,0.615,0.0,1.6,0.089,16.0,59.0,0.9943,3.58,0.52,9.9,5
14,8.9,0.62,0.18,3.8,0.176,52.0,145.0,0.9986,3.16,0.88,9.2,5
16,8.5,0.28,0.56,1.8,0.092,35.0,103.0,0.9969,3.3,0.75,10.5,7
18,7.4,0.59,0.08,4.4,0.086,6.0,29.0,0.9974,3.38,0.5,9.0,4
20,8.9,0.22,0.48,1.8,0.077,29.0,60.0,0.9968,3.39,0.53,9.4,6
22,7.9,0.43,0.21,1.6,0.106,10.0,37.0,0.9966,3.17,0.91,9.5,5


### Selecting Columns

There are two simple ways to select a single column from a dataframe - ```df['column_name']``` and ```df.column_name```.

In [117]:
data['chlorides'].head()

0    0.076
1    0.098
2    0.092
3    0.075
4    0.076
Name: chlorides, dtype: float64

In [118]:
data.chlorides.head()

0    0.076
1    0.098
2    0.092
3    0.075
4    0.076
Name: chlorides, dtype: float64

In [119]:
# Both access styles return the same kind of object. The two results are put in one tuple
# so that both are displayed: a notebook cell only echoes its final expression, so a bare
# first line would be evaluated and silently discarded.
type(data.chlorides), type(data['chlorides'])

pandas.core.series.Series

#### Selecting Multiple Columns 

You can select multiple columns by passing the list of column names inside the ```[]```: ```df[['column_1', 'column_2', 'column_n']]```.

In [122]:
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [126]:
data[['sulphates', 'alcohol', 'citric acid']].shape

(1599, 3)

In [127]:
type(data[['sulphates', 'alcohol', 'citric acid']])

pandas.core.frame.DataFrame

### Selecting Subsets of Dataframes

Until now, you have seen selecting rows and columns using the following ways:
* Selecting rows: ```df[start:stop]```
* Selecting columns: ```df['column']``` or ```df.column``` or ```df[['col_x', 'col_y']]```
    * ```df['column']``` or ```df.column``` return a series
    * ```df[['col_x', 'col_y']]``` returns a dataframe

However, this is not the recommended way to index dataframes, because `df[...]` is ambiguous: a slice selects rows, while a single key selects a column. For instance, let's try to select the third row of the dataframe.

In [128]:
simple_df = pd.DataFrame(np.arange(1, 10).reshape(3,3), columns=['a', 'b', 'c'])

In [131]:
# df[key] with a single scalar key is a COLUMN lookup. simple_df has the columns 'a', 'b'
# and 'c' and no column labelled 2, so this raises KeyError. The error is caught and printed
# so the demonstration does not stop a "Run All" of the notebook.
try:
    simple_df[2]
except KeyError as err:
    print("KeyError:", err)

KeyError: 2

Pandas throws an error because ```df[2]``` is interpreted as a column *label*, not a row *position*, and this dataframe has no column labelled ```2```.

In [132]:
simple_df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [135]:
# Relabel the rows in reverse order so that labels no longer equal positions: the row at
# position 2 now carries label 0 (see the iloc[2] result below, whose Name is 0).
simple_df.index = [2, 1, 0]

In [136]:
simple_df

Unnamed: 0,a,b,c
2,1,2,3
1,4,5,6
0,7,8,9


In [140]:
simple_df.iloc[2]

a    7
b    8
c    9
Name: 0, dtype: int64

## Position and Label Based Indexing: ```df.iloc``` and ```df.loc```

### iloc

In [141]:
simple_df.iloc[2]

a    7
b    8
c    9
Name: 0, dtype: int64

In [144]:
iris.iloc[[2, 4, 8]]

Unnamed: 0_level_0,sepal width (cm),petal length (cm),petal width (cm),species
sepal length (cm),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4.7,3.2,1.3,0.2,setosa
5.0,3.6,1.4,0.2,setosa
4.4,2.9,1.4,0.2,setosa


In [145]:
iris.iloc[[2, 4, 8], :]

Unnamed: 0_level_0,sepal width (cm),petal length (cm),petal width (cm),species
sepal length (cm),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4.7,3.2,1.3,0.2,setosa
5.0,3.6,1.4,0.2,setosa
4.4,2.9,1.4,0.2,setosa


In [152]:
iris.iloc[4:8, 2:]

Unnamed: 0_level_0,petal width (cm),species
sepal length (cm),Unnamed: 1_level_1,Unnamed: 2_level_1
5.0,0.2,setosa
5.4,0.4,setosa
4.6,0.3,setosa
5.0,0.2,setosa


In [155]:
iris.head(8)

Unnamed: 0_level_0,sepal width (cm),petal length (cm),petal width (cm),species
sepal length (cm),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa


In [1]:
# Boolean-mask selection with iloc: keep the rows at even positions (0, 2, 4, ...).
# The mask is sized from the frame itself instead of a hard-coded 150, because iloc needs
# a boolean list with exactly one entry per row.
# NOTE: the NameError recorded below comes from running this cell in a fresh kernel
# (execution count 1) before iris had been loaded.
list1 = [i % 2 == 0 for i in range(len(iris))]

iris.iloc[list1]

NameError: name 'iris' is not defined

In [147]:
iris.head()

Unnamed: 0_level_0,sepal width (cm),petal length (cm),petal width (cm),species
sepal length (cm),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa


In [146]:
iris.columns

Index(['sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'species'], dtype='object')

Question. Select the single column 'petal length (cm)' using iloc

In [151]:
iris.iloc[:, 1].head()

sepal length (cm)
5.1    1.4
4.9    1.4
4.7    1.3
4.6    1.5
5.0    1.4
Name: petal length (cm), dtype: float64

Question. Retrieve all rows for columns 'petal length', 'petal width' using iloc

Question. Retrieve all columns for first 5 rows using iloc

Question. Retrieve rows 4, 8, 12 and columns density, pH, sulphates from the wine dataframe `data`

In [None]:
# Label-based indexing with .loc.
# Start from a freshly loaded frame: earlier cells renamed 'species' to 'target' and
# re-indexed the rows by sepal length, so on a top-to-bottom run neither the integer
# label 3 nor the 'species' column would be found.
iris = pd.read_csv('iris.csv')
iris.loc[3, 'species']

In [None]:
iris.loc[5]

In [None]:
iris.loc[5, :]

In [None]:
iris.loc[[3, 7, 8]]

In [None]:
iris.loc[[3, 7, 8], :]

In [None]:
# Boolean mask inside .loc: keeps only the rows whose 'species' value equals 'virginica'.
iris.loc[iris['species'] == 'virginica']

Question. Find rows with sepal width 3.0 

Question. Find rows with sepal width 3.0 and petal length 5.2