# Pandas Basics

In [2]:
import pandas as pd
from pandas import DataFrame, read_csv # General syntax to import specific functions in a library
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import os

## Intro to Data Structures
### Series

In [7]:
# A Series is a one-dimensional array whose entries carry labels (its index).

# Built from a NumPy array, with the labels supplied explicitly.
s = pd.Series(np.arange(10) ** 2, index=list(range(10)))
s.index          # the labels of the Series

# Built from a dict: the keys become the labels.
a = dict(a=1, b=2, c=3)
b = pd.Series(a)
# Labels requested in a chosen order; 'd' is not a key of the dict, so it gets NaN.
c = pd.Series(a, index=list('dcba'))

# Built from a scalar: the value is repeated once per label.
d = pd.Series(9, index=list('abcd'))

a    9
b    9
c    9
d    9
dtype: int64

## Accessing data from a pd.Series

In [32]:
# A Series can be used much like an ndarray. Positional access goes through
# .iloc: b, c and d carry string labels, and plain [] with integer keys on
# such a Series is deprecated in recent pandas (integers are to be read as
# labels), so the position-based intent is spelled out explicitly here.
b.iloc[0]            # first element
c.iloc[:4]           # first four elements
d.iloc[[2, 1, 3]]    # elements at positions 2, 1 and 3, in that order

# Accessing data by label
c['d']
c['a']

# NumPy functions can be applied to a Series
np.sqrt(d)
np.exp(d)

# Element-wise arithmetic; the NaN entry of c stays NaN in the result
e = c + c
print(e)

'a' in c    # checks whether a label exists in the index

c.get('a')  # label lookup through the dict-style get()

# Naming a Series
f = pd.Series([1, 2, 3, 4, 5, 6], name='one_six')
f.name

# Renaming a Series: the result is bound to g, f is not reassigned
g = f.rename('One_Six')
g.name

d    NaN
c    6.0
b    4.0
a    2.0
dtype: float64


'One_Six'

### DataFrame

In [49]:
# DataFrame built from a dict of Series: every key becomes a column.
# The second Series has an extra label 'd' that the first one lacks.
d = {
    'one': pd.Series([1., 2., 3.], index=list('abc')),
    'two': pd.Series([1., 2., 3., 4.], index=list('abcd')),
}
df = pd.DataFrame(d)

# Same data, but with the rows and the column order chosen explicitly.
df1 = pd.DataFrame(d, columns=['one', 'two'], index=list('abc'))
df1.index         # the row labels
df1.columns       # the column names

# DataFrame built from a dict of equal-length lists, with custom row labels.
e = dict(one=[1, 2, 3, 4, 5], two=[6, 7, 8, 9, 10])
edf = pd.DataFrame(e, index=list('qwert'))
edf

Unnamed: 0,one,two
q,1,6
w,2,7
e,3,8
r,4,9
t,5,10


## Column selection, addition, deletion

In [53]:
# New column computed from existing columns (element-wise product).
edf['three'] = edf['one'].mul(edf['two'])
# Boolean column: True where 'three' is below 8.
edf['flag'] = edf['three'].lt(8)
edf

# A column could be removed with:
# del edf['two']

# Assigning a scalar puts the same value in every row.
edf['five'] = 10

# Only the first two values of 'one' are assigned; the other rows of 'six'
# are left as NaN (see the rendered table).
edf['six'] = edf['one'].head(2)
edf

Unnamed: 0,one,two,three,flag,five,six
q,1,6,6,True,10,1.0
w,2,7,14,False,10,2.0
e,3,8,24,False,10,
r,4,9,36,False,10,
t,5,10,50,False,10,


## Indexing / Selection

In [57]:
# Column selection by name
edf.loc[:, 'two']

# Row selection by index label
edf.loc['e', :]

# Row selection by integer position (positions 2 and 3; the stop is excluded)
edf.iloc[2:4, :]

# Row selection with a boolean mask: keeps the rows whose 'flag' is True
edf.loc[edf['flag']]

Unnamed: 0,one,two,three,flag,five,six
q,1,6,6,True,10,1.0


## Viewing Data

In [63]:
edf.head()          # first 5 rows (5 is the default count)
edf.tail()          # last 5 rows (5 is the default count)
edf.index           # row labels
edf.columns         # column names
edf.to_numpy()      # every value, as a NumPy array

# Summary statistics
edf.describe()

# Sort along axis=1, i.e. order the columns by their names, descending
edf.sort_index(ascending=False, axis=1)

# Sort the rows by the values of column 'two'
edf.sort_values('two')

# Boolean indexing: rows where 'three' exceeds 20
edf.loc[edf['three'] > 20]

# Rows whose 'flag' value is one of the listed values
edf.loc[edf['flag'].isin([True])]

Unnamed: 0,one,two,three,flag,five,six
q,1,6,6,True,10,1.0


## Handling missing data

In [1]:
# These calls need `edf` from the cells above; none of the results is
# assigned back, so `edf` itself is not rebound here.
edf.dropna()        # drop every row containing at least one NA (how='any' is the default)

edf.fillna(5)       # replace NA entries with a particular value

edf.isna()          # boolean frame marking the NA entries

NameError: name 'edf' is not defined

## Apply

In [71]:
# Column-wise range (max - min). The frame is cast to float first because the
# boolean 'flag' column would otherwise subtract two NumPy booleans, which
# current NumPy rejects with a TypeError. The result is float64 for every
# column, matching the output recorded below.
edf.astype('float64').apply(lambda col: col.max() - col.min())

one       4.0
two       4.0
three    44.0
flag      1.0
five      0.0
six       1.0
dtype: float64

## Counting the unique values

In [74]:
# Draw 10 integers from 0..6 with a locally seeded generator, so the counts
# are identical on every run and NumPy's global random state is left alone.
s = pd.Series(np.random.default_rng(42).integers(0, 7, size=10))
s.value_counts()    # index holds the unique values, the data holds their counts

5    3
3    3
0    2
6    1
1    1
dtype: int64

## Converting 'ABC' to 'abc'

In [79]:
# Mixed-case strings plus one missing value (np.nan).
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
)
# Lower-case every string through the .str accessor.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Merge (concat)

In [88]:
# Cut a random 10x4 frame into three row-wise pieces, then stack them again.
df = pd.DataFrame(np.random.randn(10, 4))
x = [df.iloc[:3], df.iloc[3:6], df.iloc[6:]]
pd.concat(x)     # row-wise stacking, similar to np.vstack

Unnamed: 0,0,1,2,3
0,-1.001553,-0.238485,0.893561,2.886936
1,-0.312829,0.190385,1.735057,-0.730807
2,-0.233343,-0.359333,0.257148,-0.132271
3,-0.134713,0.405327,0.268533,-0.076236
4,-1.252267,0.186314,-0.698993,3.359102
5,-0.352296,0.034784,0.104255,0.532966
6,-2.255845,1.37631,-0.467503,-0.49265
7,-0.396973,-0.622461,0.126851,0.411275
8,1.52602,-0.148417,1.676227,0.219067
9,0.054267,-1.057787,-0.963126,-0.719789


## Join

In [85]:
# Two small frames that share the 'key' column.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))
# Join them on 'key'.
middle = left.merge(right, on='key')
middle

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Append

In [89]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
s = df.iloc[3]      # the row at position 3, as a Series labelled by column name

# DataFrame.append was removed in pandas 2.0. The row is turned into a
# one-row frame (to_frame() gives a single column, .T flips it into a row)
# and concatenated; ignore_index=True renumbers the result 0..8.
appended = pd.concat([df, s.to_frame().T], ignore_index=True)
appended

Unnamed: 0,A,B,C,D
0,1.217043,0.498129,-1.050399,0.181601
1,-0.175095,0.6613,1.203731,1.21141
2,1.96447,0.827645,0.738239,-2.283212
3,2.159575,-0.441985,-0.279173,-0.391175
4,-0.185842,1.602047,0.269888,0.427732
5,2.039794,-1.570256,-0.505026,-1.345123
6,0.252666,0.834449,0.745144,0.576684
7,0.436464,-0.009596,1.824732,0.581122
8,2.159575,-0.441985,-0.279173,-0.391175


## Grouping

In [92]:
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

# Sum per value of 'A'. numeric_only=True keeps the aggregation to 'C' and
# 'D'; without it, recent pandas also "sums" the string column 'B' by
# concatenating its values.
sum_by_a = df.groupby('A').sum(numeric_only=True)

# Mean per ('A', 'B') pair; both string columns are grouping keys here, so
# only the numeric columns 'C' and 'D' are aggregated.
df.groupby(['A', 'B']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.145828,0.070742
bar,three,-0.327266,-0.151545
bar,two,-0.965695,0.334142
foo,one,-0.189534,2.021002
foo,three,1.022765,1.087027
foo,two,-0.000977,0.867307


## Categorical

In [97]:
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df['grade'] = df['raw_grade'].astype('category')

# Rename the categories. Assigning to `.cat.categories` is no longer possible
# (the setter was removed in pandas 2.0); rename_categories returns a new
# Series, so the column is reassigned. With a list, the new names replace
# the existing categories ('a', 'b', 'e') position by position.
df['grade'] = df['grade'].cat.rename_categories(["very good", "good", "very bad"])
df['grade']

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]

## Excel

In [None]:
# Write the current frame to an Excel workbook
df.to_excel('foo.xlsx', sheet_name='Sheet1')

# Read the same sheet back from that workbook
pd.read_excel(
    'foo.xlsx',
    sheet_name='Sheet1',
    index_col=None,
    na_values=['NA'],
)



## Creating Data

In [99]:
# Each column starts out as a plain list
name = ['Alen', 'Bob', 'Carl', 'Don', 'Elan']
age = [22, 24, 28, 34, 36]
weight = [200, 150, 180, 172, 180]

# Zip the lists together into one tuple per row
data_set = list(zip(name, age, weight))

# Turn the row tuples into a data frame
df = pd.DataFrame(data=data_set, columns=['name', 'age', 'weight'])
df

# One relative path is used for writing, reading and deleting. Previously the
# file was written to the working directory but read and removed through a
# hard-coded absolute path on one particular machine, which fails elsewhere.
path = 'dataset.csv'

# Export the data as a CSV file (header row, no index column)
df.to_csv(path, index=False, header=True)

# Read the CSV file back
df1 = pd.read_csv(path)

# Delete the file that was just written
os.remove(path)

## Finding the data types and accessing the columns

In [100]:
df['name']            # a column selected by its name
df['age'].dtype       # data type of one particular column
df.dtypes             # data types of all the columns

name      object
age        int64
weight     int64
dtype: object

## Sorting

In [101]:
# Order the rows by 'age', largest first, then show only the top row.
Sorted = df.sort_values(by='age', ascending=False)
Sorted.iloc[:1]

Unnamed: 0,name,age,weight
4,Elan,36,180
