In [1]:
import numpy as np
import pandas as pd

### Creating a DataFrame with custom row and column names

In [2]:
# Build a 5x4 frame holding the integers 0..19, with explicit row and column labels.
row_labels = ['Row1','Row2','Row3','Row4','Row5']
column_labels = ['Column1','Column2','Column3','Column4']
df = pd.DataFrame(
    np.arange(20).reshape(5, 4),
    index=row_labels,
    columns=column_labels,
)
df.to_csv('Test.csv')   # write the frame (row labels included) to a CSV file
df.head()   # preview the first rows with their labels (head() shows 5 rows by default)

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


### Accessing the elements
- loc
- iloc

In [3]:
# .loc is label-based: a single label selects the row with that index label.
# A bare column label is not looked up this way; use df['Column1'] or df.loc[:, 'Column1'] for a column.
df.loc['Row1']

Column1    0
Column2    1
Column3    2
Column4    3
Name: Row1, dtype: int32

In [4]:
# Selecting one row by label yields a one-dimensional pandas Series.
type(df.loc['Row1'])

pandas.core.series.Series

In [5]:
# .iloc is position-based; a full slice on both axes selects every row and every column.
df.iloc[:,:]

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [6]:
# First two rows and first two columns (the slice end position is exclusive).
df.iloc[:2,:2]

Unnamed: 0,Column1,Column2
Row1,0,1
Row2,4,5


In [7]:
# Slicing both axes yields a DataFrame, regardless of how many rows/columns the slices cover.
type(df.iloc[:2,:2])

pandas.core.frame.DataFrame

In [8]:
# A row *slice* of length one: the result stays two-dimensional, i.e. a one-row DataFrame.
df.iloc[:1,:]

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3


In [9]:
# Still a DataFrame: a slice (:1) keeps the row axis even when it selects a single row.
# This is a common source of confusion; an integer position (0) would drop the axis instead.
type(df.iloc[:1,:])

pandas.core.frame.DataFrame

In [10]:
# An integer position (0) instead of a slice drops the row axis and returns that row as a Series.
df.iloc[0,:]

Column1    0
Column2    1
Column3    2
Column4    3
Name: Row1, dtype: int32

In [11]:
# A single integer row position gives a Series.
type(df.iloc[0,:])

pandas.core.series.Series

In [12]:
# The same applies to columns: an integer column position returns that column as a Series.
df.iloc[:,0]

Row1     0
Row2     4
Row3     8
Row4    12
Row5    16
Name: Column1, dtype: int32

In [13]:
# A single column selected by position is a Series.
type(df.iloc[:,0])

pandas.core.series.Series

In [14]:
# Plain bracket indexing with a single label selects a column (not a row) and returns a Series.
# print() is used because only the last expression of a cell is displayed automatically.
print(df['Column2'])
type(df['Column2'])

Row1     1
Row2     5
Row3     9
Row4    13
Row5    17
Name: Column2, dtype: int32


pandas.core.series.Series

In [15]:
# Bracket indexing with a list of labels selects several columns and returns a DataFrame.
print(df[ ['Column2','Column4'] ])
type(df[ ['Column2','Column4'] ])

      Column2  Column4
Row1        1        3
Row2        5        7
Row3        9       11
Row4       13       15
Row5       17       19


pandas.core.frame.DataFrame

### Converting a DataFrame into an Array

In [16]:
# Convert the DataFrame to a NumPy array. to_numpy() is the method the pandas documentation
# recommends over the older .values attribute; for this all-integer frame it returns the
# same 2-D integer array.
df.to_numpy()

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

### Important Dataframe methods

In [17]:
# Missing-value count per column: isna() flags each missing cell, sum() adds the flags column-wise.
df.isna().sum()

Column1    0
Column2    0
Column3    0
Column4    0
dtype: int64

In [18]:
# How many times each distinct value occurs in Column1.
df['Column1'].value_counts()

Column1
0     1
4     1
8     1
12    1
16    1
Name: count, dtype: int64

In [19]:
# Distinct values of Column1, returned as a NumPy array.
df['Column1'].unique()

array([ 0,  4,  8, 12, 16])

### Reading from a CSV file

In [20]:
# Read the CSV back with default settings. The row labels that to_csv wrote come back as an
# ordinary first column with the auto-generated header "Unnamed: 0", and a new integer index is added.
df = pd.read_csv('Test.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Column1,Column2,Column3,Column4
0,Row1,0,1,2,3
1,Row2,4,5,6,7
2,Row3,8,9,10,11
3,Row4,12,13,14,15
4,Row5,16,17,18,19


In [21]:
# index_col=0 uses the first CSV column as the index, restoring the Row1..Row5 labels.
df = pd.read_csv('Test.csv',index_col=0)
df.head()

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [22]:
# Suppose the file's separator had been changed from ',' to ';': sep tells read_csv which delimiter to split on.
# NOTE(review): sep has to match the delimiter actually used in the file; otherwise each line
# is read as a single unsplit field.
df = pd.read_csv('Test.csv',index_col=0,sep=';')
df.head()

"Row1,0,1,2,3"
"Row2,4,5,6,7"
"Row3,8,9,10,11"
"Row4,12,13,14,15"
"Row5,16,17,18,19"


In [23]:
# Load 'mercedesbenz.csv' (a relative path, so the file must be in the working directory)
# and preview its first rows. This rebinds df, replacing the small demo frame used above.
df = pd.read_csv('mercedesbenz.csv')
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Count how often each value occurs in column X15.
df['X15'].value_counts()

X15
0    4207
1       2
Name: count, dtype: int64

In [25]:
# Distinct values present in column X15.
df['X15'].unique()

array([0, 1], dtype=int64)

In [26]:
# Summary of the frame: index range, number of columns, dtype counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [27]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
# Select the 'y' column as a Series.
df['y']

0       130.81
1        88.53
2        76.26
3        80.62
4        78.02
         ...  
4204    107.39
4205    108.77
4206    109.22
4207     87.48
4208    110.85
Name: y, Length: 4209, dtype: float64

In [29]:
# Element-wise comparison: yields a boolean Series that is True where y exceeds 100.
df['y'] > 100

0        True
1       False
2       False
3       False
4       False
        ...  
4204     True
4205     True
4206     True
4207    False
4208     True
Name: y, Length: 4209, dtype: bool

In [30]:
# Boolean-mask filtering: keep only the rows whose 'y' value is greater than 100.
y_above_100 = df['y'] > 100
df[y_above_100]

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
6,24,128.76,al,r,e,f,d,f,h,s,...,0,0,0,0,0,0,0,0,0,0
8,27,108.67,w,s,as,e,d,f,i,h,...,1,0,0,0,0,0,0,0,0,0
9,30,126.99,j,b,aq,c,d,f,a,e,...,0,0,1,0,0,0,0,0,0,0
10,31,102.09,h,r,r,f,d,f,h,p,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4202,8402,123.34,ap,l,s,c,d,aa,d,r,...,0,0,0,0,0,0,0,0,0,0
4204,8405,107.39,ak,s,as,c,d,aa,d,q,...,1,0,0,0,0,0,0,0,0,0
4205,8406,108.77,j,o,t,d,d,aa,h,h,...,0,1,0,0,0,0,0,0,0,0
4206,8412,109.22,ak,v,r,a,d,aa,g,e,...,0,0,1,0,0,0,0,0,0,0


In [31]:
from io import StringIO, BytesIO

In [32]:
# In-memory CSV text: one header line followed by three data rows.
data = (
    'col1,col2,col3\n'
    'x,y,1\n'
    'a,b,2\n'
    'c,d,3'
)
# StringIO gives the text a file-like interface, so read_csv can parse it without a file on disk.
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer)
df.head()

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [33]:
# usecols restricts parsing to the listed columns, so col2 is left out of the resulting frame.
df = pd.read_csv(StringIO(data),usecols=['col1','col3'])
df.head()

Unnamed: 0,col1,col3
0,x,1
1,a,2
2,c,3


In [34]:
# dtype=object turns off numeric type inference, so every column keeps the raw text from the input.
df = pd.read_csv(StringIO(data),dtype=object)
# Single .loc lookup (row label 0, column 'col3') instead of the chained df['col3'][0].
# The value is the string '1', not an integer.
df.loc[0, 'col3']

'1'

In [35]:
# Read only col3 and parse it with an integer dtype.
df = pd.read_csv(StringIO(data),usecols=['col3'],dtype=int)
# Single .loc lookup instead of the chained df['col3'][0]; this time the value is an integer 1.
df.loc[0, 'col3']

1

In [36]:
# Per-column dtypes: col1 and col2 stay generic objects, col3 is parsed as floating point.
column_dtypes = {'col1': object, 'col2': object, 'col3': float}
df = pd.read_csv(StringIO(data), dtype=column_dtypes)
df.head()

Unnamed: 0,col1,col2,col3
0,x,y,1.0
1,a,b,2.0
2,c,d,3.0


In [37]:
# A second in-memory CSV: numeric first column, text in the other two.
data = (
    'a,b,c\n'
    '4,apple,bat\n'
    '8,orange,cow'
)
# Parse the text through a file-like StringIO wrapper.
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer)
df.head()

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [38]:
# A single JSON object held in a Python string.
data = '{"userId": 1,"id": 1,"title": "sunt aut facere repellat provident occaecati excepturi optio reprehenderit","body": "quia et suscipit\nsuscipit recusandae consequuntur expedita et cum\nreprehenderit molestiae ut ut quas totam\nnostrum rerum est autem sunt rem eveniet architecto"}'
# Wrap the text in StringIO: passing literal JSON straight to read_json is deprecated
# (pandas >= 2.1), while a file-like object is accepted by every version.
# orient='index' uses the object's keys as row labels; the scalar values land in one column.
df = pd.read_json(StringIO(data),orient='index')
df.head()

Unnamed: 0,0
userId,1
id,1
title,sunt aut facere repellat provident occaecati e...
body,quia et suscipit\nsuscipit recusandae consequu...
