# Pandas Tutorial

Pandas is an open-source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

In [1]:
import pandas as pd
import numpy as np

DataFrame - a two-dimensional table made up of rows and columns. It presents the data much as it would look in an Excel sheet, and data is loaded into it in that same tabular form.

In [3]:
## Build a demo DataFrame
## The integers 0..19 are laid out as 5 rows x 4 columns, with explicit row and column labels

df = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=[f'Row{n}' for n in range(1, 6)],
    columns=[f'Column{n}' for n in range(1, 5)],
)

In [4]:
# Preview the top rows of the frame
df.head()

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [5]:
## CSV - Comma Separated Values
## Write the frame to test1.csv in the current working directory

df.to_csv('test1.csv')

In [8]:
## Accessing the elements
## Two indexers are shown in this notebook:
##   1. .loc  - used here with a row label
##   2. .iloc - used below with integer positions

df.loc['Row1']



Column1    0
Column2    1
Column3    2
Column4    3
Name: Row1, dtype: int32

In [9]:
## Selecting a single row returns a Series (a single column does too, as shown further down)
## A selection that spans several rows and several columns is a DataFrame
type(df.loc['Row1'])

pandas.core.series.Series

In [10]:
# Full slices on both axes: every row and every column
df.iloc[:,:]

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [11]:
## First three rows (positions 0-2) and first two columns (positions 0-1); slice ends are exclusive
df.iloc[0:3,0:2]

Unnamed: 0,Column1,Column2
Row1,0,1
Row2,4,5
Row3,8,9


In [12]:
## This is a DataFrame because the selection has more than one row and more than one column
type(df.iloc[0:3,0:2])

pandas.core.frame.DataFrame

In [13]:
# First two rows of the first column; a scalar column position yields a 1-D result
df.iloc[0:2,0]

Row1    0
Row2    4
Name: Column1, dtype: int32

In [15]:
# A single-column selection is a Series, not a DataFrame
type(df.iloc[0:2,0])

pandas.core.series.Series

In [17]:
## Convert a DataFrame selection into a NumPy array
## Every row, all columns from position 1 onward.
## to_numpy() is the accessor the pandas documentation recommends over .values.

df.iloc[:, 1:].to_numpy()

array([[ 1,  2,  3],
       [ 5,  6,  7],
       [ 9, 10, 11],
       [13, 14, 15],
       [17, 18, 19]])

In [18]:
# (rows, columns) of the selection that drops the first column
df.iloc[:, 1:].shape

(5, 3)

In [19]:
# After loading a dataset, a first check is which columns contain null values.
# isnull() marks each cell as null / not null, and sum() then adds those marks up per column,
# giving the count of null values for each column.
df.isnull().sum()

Column1    0
Column2    0
Column3    0
Column4    0
dtype: int64

In [20]:
# value_counts() lists each distinct value in Column1 together with how many times it occurs
# (it is a frequency table, not the number of unique values)
df['Column1'].value_counts()

0     1
4     1
8     1
12    1
16    1
Name: Column1, dtype: int64

In [23]:
# unique() returns the distinct values of a column as an array
df['Column1'].unique()

array([ 0,  4,  8, 12, 16])

In [3]:
# read_csv splits each line on `sep`. CSV files are comma separated, so ',' is the usual value;
# pass another separator (for example sep=';') only when the file really uses that delimiter.
# test1.csv was written by df.to_csv('test1.csv') with the comma delimiter, so it is read
# back with ','. Reading it with sep=';' would load every line as one single column.

test_df=pd.read_csv('test1.csv',sep=',')

In [4]:
# Preview the frame read back from test1.csv
test_df.head()

Unnamed: 0.1,Unnamed: 0,Column1,Column2,Column3,Column4
0,Row1,0,1,2,3
1,Row2,4,5,6,7
2,Row3,8,9,10,11
3,Row4,12,13,14,15
4,Row5,16,17,18,19


## CSV

In [5]:
# In-memory file-like objects: StringIO wraps text, BytesIO wraps bytes.
# (The previous spelling `BytesIOy` is not a name in `io` and raised ImportError.)
from io import BytesIO, StringIO

In [6]:
# Small in-memory CSV: one header line followed by three records (no trailing newline)
data = '\n'.join([
    'col1,col2,col3',
    'x,y,1',
    'a,b,2',
    'c,d,3',
])

In [7]:
# The CSV content is held in an ordinary Python string
type(data)

str

In [8]:
# Calling StringIO() with no argument creates a text buffer object; the cell output is just its repr
StringIO()

<_io.StringIO at 0x1c9a887a950>

In [10]:
# Wrap the string in StringIO so read_csv can consume it like a file
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [13]:
# usecols restricts the load to the listed columns; here col2 is left out

df=pd.read_csv(StringIO(data), usecols=['col1','col3'])

In [14]:
# Show the frame loaded with usecols
df

Unnamed: 0,col1,col3
0,x,1
1,a,2
2,c,3


In [15]:
# Save the current frame to Test.csv in the working directory
df.to_csv('Test.csv')

In [16]:
# Specifying column data types
# In-memory CSV with four numeric columns; every line, including the last, ends with a newline

data = ''.join(line + '\n' for line in (
    'a,b,c,d',
    '1,2,3,4',
    '5,6,7,8',
    '9,10,11,12',
))

In [17]:
# print() renders the embedded newlines, so the text appears line by line as it would in a file
print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11,12



In [18]:
# dtype=object is applied to every column, so the parsed values are kept as Python objects
df=pd.read_csv(StringIO(data),dtype=object)

In [19]:
# Show the frame loaded with dtype=object
df

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,5,6,7,8
2,9,10,11,12


In [20]:
# Column 'a' as a Series; note the object dtype reported at the bottom of the output
df['a']

0    1
1    5
2    9
Name: a, dtype: object

In [21]:
# Single cell: row label 1, column 'a'. With dtype=object the value comes back as the string '5'.
df.loc[1, 'a']

'5'

In [22]:
# dtype=int is applied to every column, so all values are parsed as integers
df=pd.read_csv(StringIO(data),dtype=int)

In [23]:
# Show the frame loaded with dtype=int
df

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,5,6,7,8
2,9,10,11,12


In [24]:
# Column 'a' again; the dtype line of the output now reports an integer type
df['a']

0    1
1    5
2    9
Name: a, dtype: int32

In [26]:
# Per-column dtypes: pass read_csv a {column name: dtype} mapping.
# Column 'd' is not listed in the mapping, so no dtype is forced on it.
df = pd.read_csv(
    StringIO(data),
    dtype={'a': 'Int64', 'b': int, 'c': float},
)

In [27]:
# Show the frame loaded with the per-column dtype mapping
df

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
1,5,6,7.0,8
2,9,10,11.0,12


In [28]:
# Single cell: row label 1, column 'a'. The value is now the integer 5 rather than a string.
df.loc[1, 'a']

5

In [29]:
# Check the resulting dtype of each column ('a', 'b', 'c' were requested explicitly; 'd' was not)
df.dtypes

a      Int64
b      int32
c    float64
d      int64
dtype: object