# Introduction to Pandas 

##### Pandas is an open source Python library for data analysis. It gives Python the ability to work with spreadsheet-like data for fast data loading, manipulating, aligning, and merging

### There are two main components of Pandas
### Series and DataFrame
The DataFrame represents your entire spreadsheet or rectangular data, whereas the Series is a single column of the DataFrame.

In [28]:
#import numpy, pandas and matplotlib libraies
import numpy as np
import pandas as pd
from pandas import *
import matplotlib.pyplot as plt
%matplotlib inline

# Series

In [3]:
#create a series 
ser=pd.Series([1,2,3,4,5])
ser

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
#access the series element values
ser.values

array([1, 2, 3, 4, 5])

In [6]:
#access the series indexes
ser.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
#series with custom index
sample_ser=pd.Series([32,88,76,75,19],index=['a','b','c','d','e'])
sample_ser

a    32
b    88
c    76
d    75
e    19
dtype: int64

In [8]:
#indexing of series
sample_ser['c']

76

In [9]:
#assigning values to series via indexes
sample_ser['d']=100

In [10]:
sample_ser

a     32
b     88
c     76
d    100
e     19
dtype: int64

In [16]:
#create a series with custom index passed as a list
states = ['California', 'Ohio', 'Oregon', 'Texas']
data=[100,200,300,400]
state_ser= pd.Series(data, index=states)


In [17]:
state_ser

California    100
Ohio          200
Oregon        300
Texas         400
dtype: int64

## Filtering Condtions

In [11]:
#fetch records based on condition
sample_ser[sample_ser >75]

b     88
c     76
d    100
dtype: int64

In [13]:
#validate if particular index in series
'e' in sample_ser

True

In [14]:
#converting series into dictionary
ser_dict=sample_ser.to_dict()
ser_dict

{'a': 32, 'b': 88, 'c': 76, 'd': 100, 'e': 19}

In [15]:
#converting dictionary to series
dict_ser=pd.Series(ser_dict)
dict_ser

a     32
b     88
c     76
d    100
e     19
dtype: int64

## Dataframes

#### Three ways you can create Dataframe Objects in Python

1. Reading data from webpage
2. Reading data from a file location ( local/cloud/database) 
3. Creating a dataframe locally by passing values using pandas DataFrame function

#### Case I : Reading from a webpage

#### Case II : Reading Data from a file lcoation 

In [48]:
#create a pandas dataframe using read_csv function 
df=pd.read_csv('nfl.csv')

In [49]:
#generic information about df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 10 columns):
Rank                32 non-null int64
Team                32 non-null int64
Won                 32 non-null object
Lost                32 non-null int64
Tied                32 non-null int64
Pct.                32 non-null int64
First NFL Season    32 non-null float64
Total Games         32 non-null int64
Division            32 non-null object
Zone                32 non-null object
dtypes: float64(1), int64(6), object(3)
memory usage: 2.6+ KB


In [50]:
#Generate statistics values of df
df.describe()

Unnamed: 0,Rank,Team,Lost,Tied,Pct.,First NFL Season,Total Games
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,15.5,16.5,448.4375,439.59375,13.25,0.500531,1956.625
std,9.380832,9.380832,156.415893,137.048816,12.492578,0.047509,22.511287
min,0.0,1.0,110.0,146.0,0.0,0.387,1920.0
25%,7.75,8.75,351.0,389.0,4.75,0.46,1936.0
50%,15.5,16.5,462.0,440.5,9.5,0.5025,1960.0
75%,23.25,24.25,550.75,543.25,20.25,0.5375,1966.25
max,31.0,32.0,749.0,740.0,42.0,0.573,2002.0


In [51]:
# Columns present in dataframe
df.columns

Index(['Rank', 'Team', 'Won', 'Lost', 'Tied', 'Pct.', 'First NFL Season',
       'Total Games', 'Division', 'Zone'],
      dtype='object')

In [52]:
#Wrong way of accessing columns  with space in names
df.Total Games

SyntaxError: invalid syntax (<ipython-input-52-103eb8238ae1>, line 2)

In [53]:
#correct way of accessing column values for such columns
df['Total Games']

0     1960
1     1921
2     1920
3     1966
4     1960
5     1961
6     1996
7     1925
8     1960
9     1950
10    1933
11    1960
12    1953
13    1960
14    1976
15    1932
16    1995
17    1960
18    1937
19    1950
20    1933
21    1960
22    1960
23    1930
24    1968
25    1960
26    1995
27    1967
28    1966
29    2002
30    1920
31    1976
Name: Total Games, dtype: int64

In [54]:
#sneak peek into the datframe , view only top few rows
df.head(3)

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First NFL Season,Total Games,Division,Zone
0,0,1,Dallas Cowboys,502,374,6,0.573,1960,882,NFC East
1,1,2,Green Bay Packers,737,562,37,0.565,1921,1336,NFC North
2,2,3,Chicago Bears,749,579,42,0.562,1920,1370,NFC North


In [55]:
#last few records of the dataframe
df.tail(2)

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First NFL Season,Total Games,Division,Zone
30,30,31,Arizona Cardinals,550,740,40,0.429,1920,1330,NFC West
31,31,32,Tampa Bay Buccaneers,255,404,1,0.387,1976,660,NFC South


In [56]:
#random sample view of records 
df.sample(3)

Unnamed: 0,Rank,Team,Won,Lost,Tied,Pct.,First NFL Season,Total Games,Division,Zone
4,4,5,New England Patriots[b],489,386,9,0.558,1960,884,AFC East
24,24,25,Cincinnati Bengals,351,417,4,0.457,1968,772,AFC North
18,18,19,Los Angeles Rams,555,559,21,0.498,1937,1135,NFC West


In [65]:
#select few columns 
df_small=df[['Rank', 'Team', 'Won', 'Lost','Division','Zone']]

In [66]:
df_small

Unnamed: 0,Rank,Team,Won,Lost,Division,Zone
0,0,1,Dallas Cowboys,502,882,NFC East
1,1,2,Green Bay Packers,737,1336,NFC North
2,2,3,Chicago Bears,749,1370,NFC North
3,3,4,Miami Dolphins,445,800,AFC East
4,4,5,New England Patriots[b],489,884,AFC East
5,5,6,Minnesota Vikings,470,870,NFC North
6,6,7,Baltimore Ravens,190,352,AFC North
7,7,8,New York Giants,687,1305,NFC East
8,8,9,Denver Broncos,470,884,AFC West
9,9,10,San Francisco 49ers,528,1002,NFC West


In [67]:
#shape & Size of data
df_small.shape

(32, 6)

In [69]:
#group by column values
df_small.Zone.value_counts()

NFC West     4
AFC South    4
AFC West     4
AFC East     4
NFC North    4
NFC East     4
NFC South    4
AFC North    4
Name: Zone, dtype: int64

In [70]:
#fetch particular row (first row)
df_small.iloc[0]

Rank                     0
Team                     1
Won         Dallas Cowboys
Lost                   502
Division               882
Zone              NFC East
Name: 0, dtype: object

In [74]:
#add new column with fixed value
df_small['new_col']=5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [75]:
df_small

Unnamed: 0,Rank,Team,Won,Lost,Division,Zone,new_col
0,0,1,Dallas Cowboys,502,882,NFC East,5
1,1,2,Green Bay Packers,737,1336,NFC North,5
2,2,3,Chicago Bears,749,1370,NFC North,5
3,3,4,Miami Dolphins,445,800,AFC East,5
4,4,5,New England Patriots[b],489,884,AFC East,5
5,5,6,Minnesota Vikings,470,870,NFC North,5
6,6,7,Baltimore Ravens,190,352,AFC North,5
7,7,8,New York Giants,687,1305,NFC East,5
8,8,9,Denver Broncos,470,884,AFC West,5
9,9,10,San Francisco 49ers,528,1002,NFC West,5


In [76]:
#Add new column with random values
df_small['new_col']=np.arange(32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [77]:
df_small

Unnamed: 0,Rank,Team,Won,Lost,Division,Zone,new_col
0,0,1,Dallas Cowboys,502,882,NFC East,0
1,1,2,Green Bay Packers,737,1336,NFC North,1
2,2,3,Chicago Bears,749,1370,NFC North,2
3,3,4,Miami Dolphins,445,800,AFC East,3
4,4,5,New England Patriots[b],489,884,AFC East,4
5,5,6,Minnesota Vikings,470,870,NFC North,5
6,6,7,Baltimore Ravens,190,352,AFC North,6
7,7,8,New York Giants,687,1305,NFC East,7
8,8,9,Denver Broncos,470,884,AFC West,8
9,9,10,San Francisco 49ers,528,1002,NFC West,9


In [78]:
#del the column
del df_small['new_col']

In [79]:
df_small

Unnamed: 0,Rank,Team,Won,Lost,Division,Zone
0,0,1,Dallas Cowboys,502,882,NFC East
1,1,2,Green Bay Packers,737,1336,NFC North
2,2,3,Chicago Bears,749,1370,NFC North
3,3,4,Miami Dolphins,445,800,AFC East
4,4,5,New England Patriots[b],489,884,AFC East
5,5,6,Minnesota Vikings,470,870,NFC North
6,6,7,Baltimore Ravens,190,352,AFC North
7,7,8,New York Giants,687,1305,NFC East
8,8,9,Denver Broncos,470,884,AFC West
9,9,10,San Francisco 49ers,528,1002,NFC West


In [80]:
#Rename columns
df_small = df_small.rename(columns={'Rank': 'Standings', 'Division': 'Section'})

In [81]:
df_small

Unnamed: 0,Standings,Team,Won,Lost,Section,Zone
0,0,1,Dallas Cowboys,502,882,NFC East
1,1,2,Green Bay Packers,737,1336,NFC North
2,2,3,Chicago Bears,749,1370,NFC North
3,3,4,Miami Dolphins,445,800,AFC East
4,4,5,New England Patriots[b],489,884,AFC East
5,5,6,Minnesota Vikings,470,870,NFC North
6,6,7,Baltimore Ravens,190,352,AFC North
7,7,8,New York Giants,687,1305,NFC East
8,8,9,Denver Broncos,470,884,AFC West
9,9,10,San Francisco 49ers,528,1002,NFC West


#### Case III : Create the dataframe locally by passing dictionary values

In [82]:
#create dictionary for dataframe
player_dict={'Player':['Federer','Murray','Nadal','Djokovic','Sharapova','Williams'],
       200,225 'Matches':[,300,150,275,325],
     'Rank':[1,2,3,4,5,6]}

In [84]:
#create the dataframe using dictionary values
player_df=pd.DataFrame(player_dict,index=np.arange(1,7))
player_df

Unnamed: 0,Matches,Player,Rank
1,200,Federer,1
2,225,Murray,2
3,300,Nadal,3
4,150,Djokovic,4
5,275,Sharapova,5
6,325,Williams,6


# Indexing objects

In [85]:
#fetch the 4th index for Matches column
player_df.Matches[4]

150

In [86]:
player_df.Player[4]

'Djokovic'

In [87]:
# Reindexing
player_df.reindex(np.arange(1,8))

Unnamed: 0,Matches,Player,Rank
1,200.0,Federer,1.0
2,225.0,Murray,2.0
3,300.0,Nadal,3.0
4,150.0,Djokovic,4.0
5,275.0,Sharapova,5.0
6,325.0,Williams,6.0
7,,,


## Drop rows & columns

In [88]:
#drop column ( axis=1)
player_df.drop('Rank',axis=1)

Unnamed: 0,Matches,Player
1,200,Federer
2,225,Murray
3,300,Nadal
4,150,Djokovic
5,275,Sharapova
6,325,Williams


In [89]:
#drop row ( axis=0)
player_df.drop(6,axis=0)

Unnamed: 0,Matches,Player,Rank
1,200,Federer,1
2,225,Murray,2
3,300,Nadal,3
4,150,Djokovic,4
5,275,Sharapova,5


## Conditions

In [90]:
player_df[player_df.Rank < 4]

Unnamed: 0,Matches,Player,Rank
1,200,Federer,1
2,225,Murray,2
3,300,Nadal,3


In [91]:
player_df.Rank < 4

1     True
2     True
3     True
4    False
5    False
6    False
Name: Rank, dtype: bool