# Pandas Tutorial

Pandas makes use of two data structures:
 - `Series`: represents data in one-dimensional form
 - `DataFrame`: represents data in two-dimensional form

In [2]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (None = no column limit).
pd.options.display.max_columns = None

## 1. General

In [2]:
df = pd.read_csv('weather.csv')  # loads the .csv file into a pandas DataFrame named df

The default separator between values is a comma (,). If the values are separated by other means, we need to specify the separator, e.g. `pd.read_csv('file.txt', sep='\t')` for a tab-separated file.

In [3]:
df.head()   # returns the first five rows by default; pass a count (e.g. df.head(2)) for a different number

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
0,Jan,58,42,74,22,2.95
1,Feb,61,45,78,26,3.02
2,Mar,65,48,84,25,2.34
3,Apr,67,50,92,28,1.02
4,May,71,53,98,35,0.48


In [4]:
df.tail()   # returns the last five rows by default; pass a count for a different number

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
7,Aug,77,59,102,43,0.03
8,Sep,77,57,103,40,0.17
9,Oct,73,54,96,34,0.81
10,Nov,64,48,84,30,1.7
11,Dec,58,42,73,21,2.56


In [5]:
df.columns          # returns the column labels as an Index object

Index(['month', 'avg_high', 'avg_low', 'record_high', 'record_low',
       'avg_precipitation'],
      dtype='object')

In [6]:
# Returns the data of all rows and columns as a NumPy array (dtype=object here
# because the columns hold mixed types). pandas recommends df.to_numpy() for new code.
df.values           # returns the underlying values, not the column labels

array([['Jan', 58, 42, 74, 22, 2.95],
       ['Feb', 61, 45, 78, 26, 3.02],
       ['Mar', 65, 48, 84, 25, 2.34],
       ['Apr', 67, 50, 92, 28, 1.02],
       ['May', 71, 53, 98, 35, 0.48],
       ['Jun', 75, 56, 107, 41, 0.11],
       ['Jul', 77, 58, 105, 44, 0.0],
       ['Aug', 77, 59, 102, 43, 0.03],
       ['Sep', 77, 57, 103, 40, 0.17],
       ['Oct', 73, 54, 96, 34, 0.81],
       ['Nov', 64, 48, 84, 30, 1.7],
       ['Dec', 58, 42, 73, 21, 2.56]], dtype=object)

In [7]:
df.dtypes               # returns the data type of each column as a Series

month                 object
avg_high               int64
avg_low                int64
record_high            int64
record_low             int64
avg_precipitation    float64
dtype: object

In [8]:
df.index                # returns the row index (a RangeIndex from 0 to 11 for this file)

RangeIndex(start=0, stop=12, step=1)

In [9]:
# iterrows() yields (index, row) pairs; each row is a Series keyed by column name.
# Row-by-row iteration is slow: prefer vectorized / built-in operations when one exists.
for index,row in df.iterrows():                 # iterating through the rows of the dataframe
    print(index, row['month'], row['avg_high']) # print the row label and two of its fields

0 Jan 58
1 Feb 61
2 Mar 65
3 Apr 67
4 May 71
5 Jun 75
6 Jul 77
7 Aug 77
8 Sep 77
9 Oct 73
10 Nov 64
11 Dec 58


In [10]:
df.describe()               # returns summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,avg_high,avg_low,record_high,record_low,avg_precipitation
count,12.0,12.0,12.0,12.0,12.0
mean,68.583333,51.0,91.333333,32.416667,1.265833
std,7.366488,6.060303,12.323911,8.240238,1.186396
min,58.0,42.0,73.0,21.0,0.0
25%,63.25,47.25,82.5,25.75,0.155
50%,69.0,51.5,94.0,32.0,0.915
75%,75.5,56.25,102.25,40.25,2.395
max,77.0,59.0,107.0,44.0,3.02


In [11]:
df['record_low'].describe()   # returns the same summary statistics for a single column

count    12.000000
mean     32.416667
std       8.240238
min      21.000000
25%      25.750000
50%      32.000000
75%      40.250000
max      44.000000
Name: record_low, dtype: float64

In [12]:
# Rename a column. rename() returns a new DataFrame, which is rebound to df here;
# this avoids inplace=True, which prevents method chaining and offers no real benefit.
df = df.rename(columns={'avg_precipitation': 'avg_rain'})
df.columns  # display the column labels to confirm the rename

Index(['month', 'avg_high', 'avg_low', 'record_high', 'record_low',
       'avg_rain'],
      dtype='object')

In [13]:
# Keep only the rows whose month is one of the listed values (Jun, Jul, Aug).
summer_months = ['Jun', 'Jul', 'Aug']
is_summer = df['month'].isin(summer_months)  # boolean mask, one entry per row
cdf = df[is_summer]
cdf.head()

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_rain
5,Jun,75,56,107,41,0.11
6,Jul,77,58,105,44,0.0
7,Aug,77,59,102,43,0.03


In [14]:
df.to_csv('temp.csv')     # writes the DataFrame to a .csv file (the row index is written as well; pass index=False to omit it)

## 2. Accessing the data

In [15]:
df = pd.read_csv('weather.csv')  # reload the file: df has the original column names again
df.head()

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
0,Jan,58,42,74,22,2.95
1,Feb,61,45,78,26,3.02
2,Mar,65,48,84,25,2.34
3,Apr,67,50,92,28,1.02
4,May,71,53,98,35,0.48


In [16]:
df.avg_low     # returns the desired column as a Series (attribute access; the column name must be a valid Python identifier)

0     42
1     45
2     48
3     50
4     53
5     56
6     58
7     59
8     57
9     54
10    48
11    42
Name: avg_low, dtype: int64

In [17]:
df['avg_low']  # returns the desired column as a Series (bracket access works for any column name)

0     42
1     45
2     48
3     50
4     53
5     56
6     58
7     59
8     57
9     54
10    48
11    42
Name: avg_low, dtype: int64

In [18]:
df[['avg_low','avg_high']]  # returns multiple columns as a DataFrame, in the order listed (argument passed is a list)

Unnamed: 0,avg_low,avg_high
0,42,58
1,45,61
2,48,65
3,50,67
4,53,71
5,56,75
6,58,77
7,59,77
8,57,77
9,54,73


In [19]:
df[2:6]             # slices out rows 2,3,4,5 (the end of the slice is excluded)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
2,Mar,65,48,84,25,2.34
3,Apr,67,50,92,28,1.02
4,May,71,53,98,35,0.48
5,Jun,75,56,107,41,0.11


In [20]:
df.loc[2, ['avg_low']]  # list selector: returns a one-element Series for row 2; use df.loc[2, 'avg_low'] to get the scalar value

avg_low    48
Name: 2, dtype: object

In [21]:
df.loc[:, ['avg_high','record_low']] # selects all rows (:) and only the columns 'avg_high','record_low'

Unnamed: 0,avg_high,record_low
0,58,22
1,61,26
2,65,25
3,67,28
4,71,35
5,75,41
6,77,44
7,77,43
8,77,40
9,73,34


In [22]:
df.loc[2:6,['avg_high','record_low']] # label-based: rows 2,3,4,5,6 (a loc slice includes its end label) and columns 'avg_high','record_low'

Unnamed: 0,avg_high,record_low
2,65,25
3,67,28
4,71,35
5,75,41
6,77,44


In [23]:
df.loc[2:6,'avg_high':'record_low'] # rows 2,3,4,5,6 and every column from 'avg_high' through 'record_low' (both ends included)

Unnamed: 0,avg_high,avg_low,record_high,record_low
2,65,48,84,25
3,67,50,92,28
4,71,53,98,35
5,75,56,107,41
6,77,58,105,44


In [24]:
df.iloc[0,2]      # returns the value in the first row and third column (positions are zero-based)

42

In [25]:
df.iloc[2:6]      # slices out rows 2,3,4,5 (iloc is integer-location based indexing; the end position is excluded)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
2,Mar,65,48,84,25,2.34
3,Apr,67,50,92,28,1.02
4,May,71,53,98,35,0.48
5,Jun,75,56,107,41,0.11


In [26]:
df.iloc[2:6,[1,5]]  # slices out rows 2,3,4,5 and the columns at positions 1 and 5

Unnamed: 0,avg_high,avg_precipitation
2,65,2.34
3,67,1.02
4,71,0.48
5,75,0.11


In [27]:
df.iloc[2:6, 1:5]   # slices out rows 2,3,4,5 and the columns at positions 1 to 4 (end position 5 is excluded)

Unnamed: 0,avg_high,avg_low,record_high,record_low
2,65,48,84,25
3,67,50,92,28
4,71,53,98,35
5,75,56,107,41


## 3. Mathematical Operations

In [28]:
df = pd.read_csv('weather.csv')  # start this section from a fresh copy of the data
df.head()

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
0,Jan,58,42,74,22,2.95
1,Feb,61,45,78,26,3.02
2,Mar,65,48,84,25,2.34
3,Apr,67,50,92,28,1.02
4,May,71,53,98,35,0.48


In [29]:
# Overwrite a single cell: row label 1, column 'avg_precipitation'.
df.loc[1, 'avg_precipitation'] = 4.02
df.head(n=2)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
0,Jan,58,42,74,22,2.95
1,Feb,61,45,78,26,4.02


In [30]:
df.loc[0,'avg_precipitation'] = np.nan  # NaN marks missing or inappropriate data
df.head(2)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
0,Jan,58,42,74,22,
1,Feb,61,45,78,26,4.02


In [31]:
# Note: the column is overwritten, so re-running this cell adds 1 again.
df['avg_low'] = df['avg_low'] + 1     # adds 1 to each element of column avg_low
df.head(2)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
0,Jan,58,43,74,22,
1,Feb,61,46,78,26,4.02


In [32]:
# Note: the column is overwritten, so re-running this cell doubles it again.
df['avg_low'] = 2*df['avg_low']       # multiplies each element of column avg_low by 2
df.head(2)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
0,Jan,58,86,74,22,
1,Feb,61,92,78,26,4.02


In [33]:
# A scalar assigned through .loc is broadcast to every selected row,
# so there is no need to build a len(df)-long array of 5s first.
df.loc[:, 'avg_low'] = 5  # sets avg_low column to 5 for all rows
df.head(2)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation
0,Jan,58,5,74,22,
1,Feb,61,5,78,26,4.02


In [34]:
df['avg_record'] = (df.record_high + df.record_low)/2.0 # new column: mean of record_high & record_low
df.head(2)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation,avg_record
0,Jan,58,5,74,22,,48.0
1,Feb,61,5,78,26,4.02,52.0


In [35]:
df['avg_high'].unique()     # returns the distinct values of the column, in order of first appearance

array([58, 61, 65, 67, 71, 75, 77, 73, 64])

In [36]:
cdf = df.sort_values('record_high', ascending = False) # new frame with rows sorted by record_high, highest first (df itself keeps its order)
cdf.head()

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation,avg_record
5,Jun,75,5,107,41,0.11,74.0
6,Jul,77,5,105,44,0.0,74.5
8,Sep,77,5,103,40,0.17,71.5
7,Aug,77,5,102,43,0.03,72.5
4,May,71,5,98,35,0.48,66.5


In [37]:
df[df.avg_precipitation > 1.0] # returns rows where avg_precipitation > 1.0 (rows holding NaN are left out)

Unnamed: 0,month,avg_high,avg_low,record_high,record_low,avg_precipitation,avg_record
1,Feb,61,5,78,26,4.02,52.0
2,Mar,65,5,84,25,2.34,54.5
3,Apr,67,5,92,28,1.02,60.0
10,Nov,64,5,84,30,1.7,57.0
11,Dec,58,5,73,21,2.56,47.0


## 4. Everything Else

Creating series from a dictionary

In [3]:
# Build a Series from a dict: the keys become the index labels, the values become the data.
temp_dict = dict(a=1, b=2, c=3)
temp_series = pd.Series(data=temp_dict)
temp_series

a    1
b    2
c    3
dtype: int64

Creating series from lists: the first list holds the values and the second list holds the index labels

In [10]:
# The values and their index labels are passed as two parallel lists.
scores = [100, 90, 80]
players = ['Sachin', 'Sourav', 'Dravid']
temp = pd.Series(data=scores, index=players)
print(temp)            # the whole Series
print(temp['Sachin'])  # look up a single value by its index label

Sachin    100
Sourav     90
Dravid     80
dtype: int64
100


**Creating dataframe from a dictionary**

In [11]:
# Each key becomes a column label; each list holds that column's values.
temp_dict = {
    'Name': ['Aamir', 'Salman', 'Shahrukh'],
    'Movie': ['Rangeela', 'Saajan', 'Darr'],
    'Year': [1995, 1991, 1993],
}
temp_df = pd.DataFrame(data=temp_dict)  # build the frame from the dictionary
temp_df.head()

Unnamed: 0,Name,Movie,Year
0,Aamir,Rangeela,1995
1,Salman,Saajan,1991
2,Shahrukh,Darr,1993


## 5. File formats

Let's read a file in .txt format

In [12]:
df = pd.read_csv('boston.txt', sep = '\t')  # tab-separated text file, so the separator is passed explicitly
df.head()

Unnamed: 0,MV,INDUS,NOX,RM,TAX,PT,LSTAT
0,24.0,2.31,53.8,6.575,296,15.3,4.98
1,21.6,7.07,46.9,6.421,242,17.8,9.14
2,34.7,7.07,46.9,7.185,242,17.8,4.03
3,33.4,2.18,45.8,6.998,222,18.7,2.94
4,36.2,2.18,45.8,7.147,222,18.7,5.33


In [13]:
import html5lib

In [19]:
# read_html downloads the page (network access required) and returns a list of the tables it finds.
indian_states = pd.read_html('https://en.wikipedia.org/wiki/States_and_union_territories_of_India')
print(type(indian_states))  # confirms that the result is a list, not a single DataFrame

<class 'list'>


In [23]:
indian_states[3]  # NOTE(review): which position holds the states table depends on the live page layout and may change; verify the index

Unnamed: 0,State,ISO 3166-2:IN,Vehiclecode,Zone,Capital,Largest city,Statehood,Population[40],Area(km2),Officiallanguages[41],Additional officiallanguages[41]
0,Andhra Pradesh,IN-AP,AP,Southern,Hyderabad (de jure)Amaravati (de facto) Note 1...,Visakhapatnam,1 November 1956,49506799,160205,Telugu,—
1,Arunachal Pradesh,IN-AR,AR,North-Eastern,Itanagar,Itanagar,20 February 1987,1383727,83743,English,—
2,Assam,IN-AS,AS,North-Eastern,Dispur,Guwahati,26 January 1950,31205576,78550,Assamese,"Bengali, Bodo"
3,Bihar,IN-BR,BR,Eastern,Patna,Patna,26 January 1950,104099452,94163,Hindi,Urdu
4,Chhattisgarh,IN-CT,CG,Central,Naya Raipur,Raipur,1 November 2000,25545198,135194,Hindi,—
5,Goa,IN-GA,GA,Western,Panaji,Vasco da Gama,30 May 1987,1458545,3702,Konkani,"English, Marathi"
6,Gujarat,IN-GJ,GJ,Western,Gandhinagar,Ahmedabad,1 May 1960,60439692,196024,Gujarati,—
7,Haryana,IN-HR,HR,Northern,Chandigarh,Faridabad,1 November 1966,25351462,44212,Hindi,Punjabi[44][45]
8,Himachal Pradesh,IN-HP,HP,Northern,Shimla (Summer)Dharamshala (Winter)[46],Shimla,25 January 1971,6864602,55673,Hindi,Sanskrit[47]
9,Jharkhand,IN-JH,JH,Eastern,Ranchi,Jamshedpur,15 November 2000,32988134,74677,Hindi,Urdu[48]


In [24]:
type(indian_states[3])  # each element of the list is a DataFrame

pandas.core.frame.DataFrame