### Pandas

In [2]:
import numpy as np
import pandas as pd

In [2]:
# Series of five standard-normal draws, labelled 'a' through 'e'
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
s

a   -1.668983
b    1.082435
c   -0.243524
d    0.269498
e   -1.012729
dtype: float64

In [3]:
# A Series can also be instantiated from a dictionary (keys become the index labels)
d = {'b': 1, 'a': 0, 'c': 2}
pd.Series(d)

a    0
b    1
c    2
dtype: int64

In [4]:
# Five consecutive calendar days starting on 2020-01-01
dates = pd.date_range(start='2020-01-01', periods=5, freq='D')
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 5x4 frame of standard-normal draws: rows labelled by `dates`, columns A-D
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2020-01-01,-2.254408,-1.298883,-0.261542,-0.980511
2020-01-02,0.63693,-0.542786,-1.839379,1.574845
2020-01-03,-0.199895,0.092349,0.304934,-2.236054
2020-01-04,-0.48376,-0.5331,0.572895,1.990393
2020-01-05,-0.32958,0.668731,1.055914,0.379169


In [6]:
df.head(2) # first two rows; head() gives the first five entries if no number is passed

Unnamed: 0,A,B,C,D
2020-01-01,-2.254408,-1.298883,-0.261542,-0.980511
2020-01-02,0.63693,-0.542786,-1.839379,1.574845


In [7]:
df.tail(3) # last three rows; tail() gives the last five entries if no number is passed

Unnamed: 0,A,B,C,D
2020-01-03,-0.199895,0.092349,0.304934,-2.236054
2020-01-04,-0.48376,-0.5331,0.572895,1.990393
2020-01-05,-0.32958,0.668731,1.055914,0.379169


In [8]:
df.index # row labels of the DataFrame (the dates passed as index= when it was built)

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df.columns # column labels of the DataFrame

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
# Summary statistics per column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,5.0,5.0,5.0,5.0
mean,-0.526143,-0.322738,-0.033435,0.145568
std,1.059069,0.741762,1.116051,1.763587
min,-2.254408,-1.298883,-1.839379,-2.236054
25%,-0.48376,-0.542786,-0.261542,-0.980511
50%,-0.32958,-0.5331,0.304934,0.379169
75%,-0.199895,0.092349,0.572895,1.574845
max,0.63693,0.668731,1.055914,1.990393


In [11]:
df.T # Transpose

Unnamed: 0,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05
A,-2.254408,0.63693,-0.199895,-0.48376,-0.32958
B,-1.298883,-0.542786,0.092349,-0.5331,0.668731
C,-0.261542,-1.839379,0.304934,0.572895,1.055914
D,-0.980511,1.574845,-2.236054,1.990393,0.379169


In [12]:
df.sort_index(axis=0, ascending=False) # sort by row labels, descending (axis 0 is rows & axis 1 is columns)

Unnamed: 0,A,B,C,D
2020-01-05,-0.32958,0.668731,1.055914,0.379169
2020-01-04,-0.48376,-0.5331,0.572895,1.990393
2020-01-03,-0.199895,0.092349,0.304934,-2.236054
2020-01-02,0.63693,-0.542786,-1.839379,1.574845
2020-01-01,-2.254408,-1.298883,-0.261542,-0.980511


In [13]:
# Sort the rows by the values in column B, smallest first
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2020-01-01,-2.254408,-1.298883,-0.261542,-0.980511
2020-01-02,0.63693,-0.542786,-1.839379,1.574845
2020-01-04,-0.48376,-0.5331,0.572895,1.990393
2020-01-03,-0.199895,0.092349,0.304934,-2.236054
2020-01-05,-0.32958,0.668731,1.055914,0.379169


In [14]:
df['A'] # select a particular column (returned as a Series)

2020-01-01   -2.254408
2020-01-02    0.636930
2020-01-03   -0.199895
2020-01-04   -0.483760
2020-01-05   -0.329580
Freq: D, Name: A, dtype: float64

In [15]:
df[:2] # slicing with [] selects rows; here the first two

Unnamed: 0,A,B,C,D
2020-01-01,-2.254408,-1.298883,-0.261542,-0.980511
2020-01-02,0.63693,-0.542786,-1.839379,1.574845


In [16]:
df.loc[dates[0]] # select a particular row by its index label (returned as a Series)

A   -2.254408
B   -1.298883
C   -0.261542
D   -0.980511
Name: 2020-01-01 00:00:00, dtype: float64

In [17]:
df.loc[:, ['A', 'B', 'C']] # .loc[rows, columns]: all rows, columns A, B and C by label

Unnamed: 0,A,B,C
2020-01-01,-2.254408,-1.298883,-0.261542
2020-01-02,0.63693,-0.542786,-1.839379
2020-01-03,-0.199895,0.092349,0.304934
2020-01-04,-0.48376,-0.5331,0.572895
2020-01-05,-0.32958,0.668731,1.055914


In [18]:
print(df.iloc[3]) # access a row by integer position, 0-based like array indexing (0, 1, 2, ...)

A   -0.483760
B   -0.533100
C    0.572895
D    1.990393
Name: 2020-01-04 00:00:00, dtype: float64


In [19]:
# NOTE: a cell only displays the value of its last expression, so the result
# of the first slice below is computed but never shown in the output.
df.iloc[3:5, 0:2] # more slicing: rows at positions 3-4, columns at positions 0-1
df.iloc[1:3, 1:3] # rows at positions 1-2, columns at positions 1-2 (this one is displayed)

Unnamed: 0,B,C
2020-01-02,-0.542786,-1.839379
2020-01-03,0.092349,0.304934


In [20]:
df.iloc[1:3, :] # rows at positions 1 and 2, all columns

Unnamed: 0,A,B,C,D
2020-01-02,0.63693,-0.542786,-1.839379,1.574845
2020-01-03,-0.199895,0.092349,0.304934,-2.236054


In [21]:
df.iloc[:, 1:2] # all rows, column at position 1; a slice keeps the result a DataFrame

Unnamed: 0,B
2020-01-01,-1.298883
2020-01-02,-0.542786
2020-01-03,0.092349
2020-01-04,-0.5331
2020-01-05,0.668731


In [22]:
df.iloc[1, 1] # [row, column] by integer position; returns a single value

-0.5427859214578571

In [23]:
# .iat reads a single element by integer position (the same cell as df.iloc[2, 3])
df.iat[2, 3] # only for one element

-2.2360542313992102

In [24]:
df.mean() # column-wise mean calculation

A   -0.526143
B   -0.322738
C   -0.033435
D    0.145568
dtype: float64

In [25]:
df.mean(axis=1) # row-wise mean: one value per row, averaged across the columns

2020-01-01   -1.198836
2020-01-02   -0.042597
2020-01-03   -0.509667
2020-01-04    0.386607
2020-01-05    0.443559
Freq: D, dtype: float64

In [26]:
df.mean(axis=0) # axis=0 is the default: one mean per column

A   -0.526143
B   -0.322738
C   -0.033435
D    0.145568
dtype: float64

In [4]:
# Build a small two-column frame: col1 holds 0-2 and col2 holds 3-5
dfx = pd.DataFrame(
    {
        'col1': range(3),
        'col2': range(3, 6),
    }
)
dfx

Unnamed: 0,col1,col2
0,0,3
1,1,4
2,2,5


In [6]:
# Change column names. rename() returns a new DataFrame; the result is not
# assigned here, so dfx itself keeps its original column names.
dfx.rename(columns={'col1':'apples', 'col2':'oranges'})

Unnamed: 0,apples,oranges
0,0,3
1,1,4
2,2,5


In [7]:
# Print the built-in documentation for DataFrame.drop
help(pd.DataFrame.drop)

Help on function drop in module pandas.core.frame:

drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
    Drop specified labels from rows or columns.
    
    Remove rows or columns by specifying label names and corresponding
    axis, or by specifying directly index or column names. When using a
    multi-index, labels on different levels can be removed by specifying
    the level.
    
    Parameters
    ----------
    labels : single label or list-like
        Index or column labels to drop.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Whether to drop labels from the index (0 or 'index') or
        columns (1 or 'columns').
    index : single label or list-like
        Alternative to specifying axis (``labels, axis=0``
        is equivalent to ``index=labels``).
    
        .. versionadded:: 0.21.0
    columns : single label or list-like
        Alternative to specifying axis (``labels, axis=1``
        is equivalen

### Data Set Analysis

In [27]:
import pandas as pd

In [28]:
# Load diabetes.csv (path is relative to the working directory) into a DataFrame
da = pd.read_csv('diabetes.csv')

In [29]:
type(da) # confirm that read_csv produced a DataFrame

pandas.core.frame.DataFrame

In [30]:
da.shape # (number of rows, number of columns)

(768, 9)

In [31]:
da # display the frame; long output is elided in the middle ("...")

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [32]:
da.head() # first five rows

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [33]:
da.columns # column labels of the dataset

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [34]:
da.index # row labels of the dataset

RangeIndex(start=0, stop=768, step=1)

In [35]:
da.iloc[3, :] # row at position 3, all columns (returned as a single Series)

Pregnancies                  1.000
Glucose                     89.000
BloodPressure               66.000
SkinThickness               23.000
Insulin                     94.000
BMI                         28.100
DiabetesPedigreeFunction     0.167
Age                         21.000
Outcome                      0.000
Name: 3, dtype: float64

In [36]:
# Two positional slices: rows 3-4 of the first two columns,
# and every row of the columns at positions 2-4
x, y = da.iloc[3:5, :2], da.iloc[:, 2:5]
print(x, y, sep='\n')

   Pregnancies  Glucose
3            1       89
4            0      137
     BloodPressure  SkinThickness  Insulin
0               72             35        0
1               66             29        0
2               64              0        0
3               66             23       94
4               40             35      168
..             ...            ...      ...
763             76             48      180
764             70             27        0
765             72             23      112
766             60              0        0
767             70             31        0

[768 rows x 3 columns]


In [37]:
print(da['Glucose'].isnull().sum())   # number of missing values in Glucose
print(da['Glucose'].notnull().sum())  # number of non-missing values in Glucose

0
768


In [38]:
print(da['Pregnancies'].isnull().sum())   # number of missing values in Pregnancies
print(da['Pregnancies'].notnull().sum())  # number of non-missing values in Pregnancies

0
768


In [39]:
da.loc[:, ['Pregnancies', 'Glucose']] # all rows, two columns selected by label

Unnamed: 0,Pregnancies,Glucose
0,6,148
1,1,85
2,8,183
3,1,89
4,0,137
...,...,...
763,10,101
764,2,122
765,5,121
766,1,126


In [40]:
da.loc[10:15] # label-based slice: both endpoints are included (rows 10 through 15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
10,4,110,92,0,0,37.6,0.191,30,0
11,10,168,74,0,0,38.0,0.537,34,1
12,10,139,80,0,0,27.1,1.441,57,0
13,1,189,60,23,846,30.1,0.398,59,1
14,5,166,72,19,175,25.8,0.587,51,1
15,7,100,0,0,0,30.0,0.484,32,1


In [41]:
da.iloc[:4] # position-based slice: the end is excluded (rows 0 through 3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0


In [42]:
da.iloc[:3, 5:] # first three rows, columns from position 5 onwards

Unnamed: 0,BMI,DiabetesPedigreeFunction,Age,Outcome
0,33.6,0.627,50,1
1,26.6,0.351,31,0
2,23.3,0.672,32,1


In [43]:
da.Outcome.unique() # distinct values that occur in the Outcome column

array([1, 0], dtype=int64)