## PANDAS

## import pandas library

In [1]:
import os

import pandas as pd

In [2]:
import numpy as np

In [5]:
# Ten evenly spaced values from 0 to 1 (both endpoints included).
val = np.linspace(start=0, stop=1, num=10)

In [6]:
# One single-letter label per value, 'a' through 'j'.
ind = list('abcdefghij')

## Series

One-dimensional ndarray with axis labels (including time series).

In [7]:
# Build a labelled one-dimensional Series: values from `val`, labels from `ind`.
data = pd.Series(data=val, index=ind)
data

a    0.000000
b    0.111111
c    0.222222
d    0.333333
e    0.444444
f    0.555556
g    0.666667
h    0.777778
i    0.888889
j    1.000000
dtype: float64

In [8]:
# Row labels of the Series.
data.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

In [9]:
# Underlying values of the Series as a NumPy array.
data.values

array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])

## DataFrame

Two-dimensional size-mutable, potentially heterogeneous tabular data
structure with labeled axes (rows and columns). Can be thought of as a dict-like
container for Series objects. The primary pandas data structure.

In [10]:
# Column-oriented employee records: each key is a column name, each list holds
# that column's values.
# NOTE(review): the name `dict` shadows Python's built-in dict type for the rest
# of the session; it is kept because later cells refer to it by this name.
dict = {
    'Emp': ['John', 'Marry', 'Joseph'],
    'ID': [101, 102, 103],
    'Age': [22, 34, 29],
}

In [11]:
# Display the dictionary to confirm its contents before building a DataFrame from it.
dict

{'Emp': ['John', 'Marry', 'Joseph'],
 'ID': [101, 102, 103],
 'Age': [22, 34, 29]}

In [12]:
# Row labels for the three employee records (replaces the ten-label list used for the Series).
ind = list('abc')

In [13]:
# Build the employee table: columns come from the dictionary keys, row labels from `ind`.
df1 = pd.DataFrame(data=dict, index=ind)
df1

Unnamed: 0,Emp,ID,Age
a,John,101,22
b,Marry,102,34
c,Joseph,103,29


In [14]:
# Select a single column by name; the result is a Series.
df1['Emp']

a      John
b     Marry
c    Joseph
Name: Emp, dtype: object

In [15]:
# Select multiple columns by passing a list of column names; the result is a DataFrame.
df1[['Emp','ID']]

Unnamed: 0,Emp,ID
a,John,101
b,Marry,102
c,Joseph,103


In [16]:
# Select rows by position with a slice: 1:2 keeps only the row at position 1 (the end is exclusive).
df1[1:2]

Unnamed: 0,Emp,ID,Age
b,Marry,102,34


## slicing using loc
## The loc indexer selects rows and columns by label (name)

In [19]:
# All rows (:) and the 'Emp' column; passing the name inside a list keeps the result a DataFrame.
df1.loc[:, ['Emp']]

Unnamed: 0,Emp
a,John
b,Marry
c,Joseph


## The iloc indexer selects rows and columns by integer position instead of by label

In [22]:
# Extract all rows and all columns (a bare ':' on each axis selects everything).
# To stop before the column at position 2, use df1.iloc[:, :2] (the end is exclusive).
df1.iloc[:, :]

Unnamed: 0,Emp,ID,Age
a,John,101,22
b,Marry,102,34
c,Joseph,103,29


## Concatenation
## we can concatenate two DataFrames using function pd.concat()

In [23]:
# First frame for the concatenation example: the employee table labelled by `ind`.
df1 = pd.DataFrame(data=dict, index=ind)
df1

Unnamed: 0,Emp,ID,Age
a,John,101,22
b,Marry,102,34
c,Joseph,103,29


In [24]:
# Second employee table with the same columns as df1, labelled 'd' and 'e'.
df2 = pd.DataFrame(
    data={
        'Emp': ['Alan', 'Smith'],
        'ID': [201, 202],
        'Age': [32, 27],
    },
    index=list('de'),
)
df2

Unnamed: 0,Emp,ID,Age
d,Alan,201,32
e,Smith,202,27


In [26]:
# Stack the two frames vertically (row-wise); the row labels of both inputs are kept.
con = pd.concat(objs=[df1, df2], axis=0)
con

Unnamed: 0,Emp,ID,Age
a,John,101,22
b,Marry,102,34
c,Joseph,103,29
d,Alan,201,32
e,Smith,202,27


## Reading csv file

In [56]:
# Location of the insurance dataset. The default is the original machine-specific
# path; set the INSURANCE_CSV environment variable to point at your own copy
# instead of editing this cell.
INSURANCE_CSV = os.environ.get('INSURANCE_CSV', r'C:\Users\DELL\Downloads\insurance.csv')
# Load the CSV into a DataFrame.
df = pd.read_csv(INSURANCE_CSV)

In [57]:
# View the first five records.
# Any other range of rows can be inspected with e.g. df.iloc[10:20, :].
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0,yes,southwest,16884.924
1,18.0,male,33.77,1,no,southeast,1725.5523
2,28.0,male,33.0,3,no,southeast,4449.462
3,33.0,male,22.705,0,no,northwest,21984.47061
4,32.0,male,28.88,0,no,northwest,3866.8552


In [58]:
# Summary of the dataset: row count, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1334 non-null float64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 73.2+ KB


In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1334.0,1338.0,1338.0,1338.0
mean,39.233883,30.663397,1.094918,13270.422265
std,14.053403,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [49]:
# Count the missing (null) values in each column.
df.isnull().sum()

age         4
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [None]:
# Show the dataset with the rows that contain missing values removed.
# The result is not assigned, so df itself is left unchanged.
df.dropna()

In [61]:
# Replace every missing value with the constant 1 and keep the result as a new frame.
# NOTE(review): this rebinds df1, which held the employee table earlier in the notebook.
df1 = df.fillna(value=1)

In [64]:
# Confirm that no missing values remain after the fill.
df1.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [65]:
# (rows, columns) of the original frame.
df.shape

(1338, 7)

In [66]:
# Pairwise correlation between the numeric (continuous) variables.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently, so a bare df.corr() raises on the text columns
# (sex, smoker, region). On older versions the result is identical.
df.select_dtypes(include='number').corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.108932,0.043629,0.298006
bmi,0.108932,1.0,0.012759,0.198341
children,0.043629,0.012759,1.0,0.067998
charges,0.298006,0.198341,0.067998,1.0


In [67]:
# Column-wise maximum; for the text columns this is the alphabetically last value.
df.max()

age                64
sex              male
bmi             53.13
children            5
smoker            yes
region      southwest
charges       63770.4
dtype: object

## groupby()
## split the data into groups based on some criteria

In [72]:
# Group the rows by their value in the 'children' column.
gb = df.groupby(by='children')
# Printing a GroupBy object only shows its repr; no aggregation has been computed yet.
print(gb)

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x0000026D4926B080>


In [73]:
# head() returns the first five rows of every group, not one row per group.
# For a single row per group use gb.first() instead.
gb.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0,yes,southwest,16884.924
1,18.0,male,33.77,1,no,southeast,1725.5523
2,28.0,male,33.0,3,no,southeast,4449.462
3,33.0,male,22.705,0,no,northwest,21984.47061
4,32.0,male,28.88,0,no,northwest,3866.8552
5,31.0,female,25.74,0,no,southeast,3756.6216
6,46.0,female,33.44,1,no,southeast,8240.5896
7,37.0,female,27.74,3,no,northwest,7281.5056
8,37.0,male,29.83,2,no,northeast,6406.4107
9,60.0,female,25.84,0,no,northwest,28923.13692
