# Pandas
- Open-source, BSD-licensed library
- Easy-to-use data structures and data analysis tools

## Agenda
- What is a DataFrame?
- What is a Series?
- Different operations in Pandas

In [63]:
import pandas as pd
import numpy as np

In [64]:
# Build a 5x4 DataFrame holding the integers 0 through 19 (np.arange excludes its stop value).
# Row labels are "Row1".."Row5"; column labels are "colum1".."colum4".

df = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),                     # values, filled row by row
    index=["Row1", "Row2", "Row3", "Row4", "Row5"],       # row labels
    columns=["colum1", "colum2", "colum3", "colum4"],     # column labels
)
df

Unnamed: 0,colum1,colum2,colum3,colum4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [65]:
# Selecting a single row by its index label with .loc returns a Series
# whose index is the DataFrame's column labels and whose name is the row label.
df.loc["Row3"]

colum1     8
colum2     9
colum3    10
colum4    11
Name: Row3, dtype: int64

In [66]:
# Confirm that a single-row label selection is a pandas Series
type(df.loc["Row3"])

pandas.core.series.Series

In [67]:
# Position-based slicing: the first three rows and the first four columns
# (the stop position of each slice is excluded).
df.iloc[:3, :4]

Unnamed: 0,colum1,colum2,colum3,colum4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11


In [68]:
# Slicing on both axes keeps the result two-dimensional, i.e. a DataFrame
type(df.iloc[:3, :4])

pandas.core.frame.DataFrame

In [69]:
# A row slice combined with a single integer column position yields a Series
# (here holding just the first row's value from the first column).
df.iloc[:1, 0]

Row1    0
Name: colum1, dtype: int64

In [70]:
# Selecting one column by integer position collapses the result to a Series
type(df.iloc[:1, 0])

pandas.core.series.Series

In [71]:
# Convert the whole DataFrame to a NumPy array.
# The full-frame iloc slice was redundant, and to_numpy() is the accessor the pandas
# documentation recommends in place of .values; the resulting array is the same.
df.to_numpy()

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [72]:
# Selecting one column by name returns a Series indexed by the row labels
df["colum1"]

Row1     0
Row2     4
Row3     8
Row4    12
Row5    16
Name: colum1, dtype: int64

### value_counts
Shows how many times each value occurs in the column.
E.g., 12 occurs 1 time, 4 occurs 1 time, etc.

In [73]:
# Count occurrences of each distinct value in colum1; the result is a Series
# whose index is the distinct values and whose values are the counts.
df["colum1"].value_counts()

12    1
4     1
16    1
8     1
0     1
Name: colum1, dtype: int64

In [74]:
# Count missing values in each column: isna() flags missing cells (isnull is an alias
# for the same check) and sum() adds up the flags column by column.
df.isna().sum()

colum1    0
colum2    0
colum3    0
colum4    0
dtype: int64

### Unique
Shows all the unique values present in `colum1`

In [75]:
# Distinct values of colum1, returned as a NumPy array
df["colum1"].unique()

array([ 0,  4,  8, 12, 16])

In [76]:
# Passing a list of column names selects several columns and returns a DataFrame
df[["colum1", "colum2"]]

Unnamed: 0,colum1,colum2
Row1,0,1
Row2,4,5
Row3,8,9
Row4,12,13
Row5,16,17


### Creating a CSV file from the data

In [77]:
# Write the DataFrame to test.csv. The row labels (index) are written too, as a first
# column with an empty header, which is why a plain read_csv of this file later shows
# an "Unnamed: 0" column.
df.to_csv("test.csv")

In [78]:
# Load the Mercedes-Benz dataset from CSV.
# Note: this rebinds `df`, so the small 5x4 frame built earlier is no longer reachable by that name.
df = pd.read_csv("mercedesbenz.csv")
# Preview the first five records
df.head(n=5)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [79]:
# Print a concise summary of the frame: index range, column count, dtype breakdown and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [80]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# Only the numeric columns appear in the result; the object-typed columns are left out.
df.describe()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [82]:
# Read back the CSV written earlier (this rebinds `df` once more).
# The file's first column holds the saved row labels under an empty header, so read_csv
# treats it as ordinary data named "Unnamed: 0" and adds a fresh 0..4 integer index.
# Passing index_col=0 to read_csv would use that column as the index instead.
df = pd.read_csv("test.csv")
df

Unnamed: 0.1,Unnamed: 0,colum1,colum2,colum3,colum4
0,Row1,0,1,2,3
1,Row2,4,5,6,7
2,Row3,8,9,10,11
3,Row4,12,13,14,15
4,Row5,16,17,18,19
