## Pandas Series

In [2]:
# Pandas Series
# -------------
# A Series is a one-dimensional array whose elements are each tagged with a
# label (the index). When no index is given, labels 0, 1, 2, ... are used.
# Every Series has a single dtype; strings, mixed values and arbitrary Python
# objects are all stored under the catch-all `object` dtype.

import pandas as pd

my_list = list(range(1, 5))  # [1, 2, 3, 4]

# Build the Series straight from the Python list.
ser = pd.Series(data=my_list)
ser

0    1
1    2
2    3
3    4
dtype: int64

In [9]:
import numpy as np

# pandas Series from a numpy array
my_arr = np.array(['p', 'u', 'j', 'a', 'n'])
ser = pd.Series(my_arr)
print(ser, '\n')

# creating a pandas Series from a dictionary
my_dict = {'day1': 330, 'day2': 430, 'day3': 530}
ser = pd.Series(my_dict)
print(ser)

# day1    330
# day2    430
# day3    530
# The dictionary keys day1, day2 and day3 become the index labels that
# identify the values 330, 430 and 530 respectively.


# A Series can also hold values of mixed types, although in practice a Series
# is usually homogeneous.
# dtype=object is required here: without it numpy promotes every element to a
# string, so the integers 1 and 2 would be stored as '1' and '2'.
my_arr = np.array([1, 2, 'a', 'b'], dtype=object)
ser = pd.Series(my_arr)
ser

0    p
1    u
2    j
3    a
4    n
dtype: object 

day1    330
day2    430
day3    530
dtype: int64


0    1
1    2
2    a
3    b
dtype: object

In [36]:
# An element (or a run of elements) of a Series can be reached in two ways:
#   1. by position
#   2. by index label

# --- 1. Access by position ---
data = list('pujandahal')  # ['p', 'u', 'j', 'a', 'n', 'd', 'a', 'h', 'a', 'l']
ser = pd.Series(data)
ser[:5]
# 0    p
# 1    u
# 2    j
# 3    a
# 4    n
# (same slicing rules as a Python list)

ser[3:6]
# 3    a
# 4    n
# 5    d


# --- 2. Access by label (index) ---
# A Series behaves like a fixed-size dictionary: values can be read and
# written through their index label.

# Same ten letters, now labelled 11..20 instead of the default 0..9.
ser = pd.Series(data, index=list(range(11, 21)))
ser[16]  # 'd'

ser.head()
# 11    p
# 12    u
# 13    j
# 14    a
# 15    n

ser[12:15]  # Series([], dtype: object)
# Empty result: an integer slice inside plain [] is taken as positions, and a
# ten-element Series has nothing at positions 12..14.
# The labels supplied above (11..20) are called the explicit index.

# .loc[] slices by the explicit index instead,
# and the stop label is included in the result.
ser.loc[12:15]
# 12    u
# 13    j
# 14    a
# 15    n

12    u
13    j
14    a
15    n
dtype: object

In [38]:
# Load the CSV file into a DataFrame.
# (Only the last expression of a cell is displayed, so a preview such as
# df.head() belongs at the end of its own cell rather than in the middle here.)
df = pd.read_csv("nba.csv")

# Selecting a single column already yields a Series, so no pd.Series(...)
# wrapper is needed.
ser = df['Name']

# Keep the first ten names for the indexing examples.
data = ser.head(10)
data

0    Avery Bradley
1      Jae Crowder
2     John Holland
3      R.J. Hunter
4    Jonas Jerebko
5     Amir Johnson
6    Jordan Mickey
7     Kelly Olynyk
8     Terry Rozier
9     Marcus Smart
Name: Name, dtype: object

In [39]:

# Slicing with the plain indexing operator [].
# The result holds the rows labelled 3, 4 and 5; the stop value 6 is excluded.
data[3:6]
# 3      R.J. Hunter
# 4    Jonas Jerebko
# 5     Amir Johnson

3      R.J. Hunter
4    Jonas Jerebko
5     Amir Johnson
Name: Name, dtype: object

In [40]:
# Slicing a Series by label with .loc[].
# The result runs from label 3 through label 6: the stop label is included.
data.loc[3:6]

# 3      R.J. Hunter
# 4    Jonas Jerebko
# 5     Amir Johnson
# 6    Jordan Mickey

3      R.J. Hunter
4    Jonas Jerebko
5     Amir Johnson
6    Jordan Mickey
Name: Name, dtype: object

In [3]:
# .iloc[] is the position-based indexer: it selects by integer position,
# whatever the index labels happen to be.

import pandas as pd

df = pd.read_csv("nba.csv")

# A single selected column is already a Series; wrapping it in pd.Series()
# adds nothing.
ser = df['Name']
data = ser.head(10)
data.iloc[3:6]

# 3      R.J. Hunter
# 4    Jonas Jerebko
# 5     Amir Johnson

# The stop position (6) is not included here.

3      R.J. Hunter
4    Jonas Jerebko
5     Amir Johnson
Name: Name, dtype: object

In [17]:
# Binary operations between two Series.
# Values are matched by index label, not by position.

import pandas as pd

data = pd.Series({'a': 5, 'b': 2, 'c': 3, 'd': 7})

data1 = pd.Series({'a': 1, 'b': 6, 'd': 4, 'e': 9})

# Addition: a label present in only one Series ('c', 'e') takes fill_value
# for the missing side instead of producing NaN.
data.add(data1, fill_value=0)
# a     6.0
# b     8.0
# c     3.0
# d    11.0
# e     9.0

# Subtraction, with the same label matching and fill rule.
data.sub(data1, fill_value=0)
# a    4.0
# b   -4.0
# c    3.0
# d    3.0
# e   -9.0

a    4.0
b   -4.0
c    3.0
d    3.0
e   -9.0
dtype: float64

In [31]:
# Conversion operations on a Series

# Read the file once; both examples below derive from this untouched frame.
nba_raw = pd.read_csv("nba.csv")

# Drop every row that contains a null value, because astype(int) cannot
# convert NaN. dropna() returns a new frame, so nba_raw is left intact and
# re-running this cell always gives the same result.
nba_complete = nba_raw.dropna()

# dtypes before converting
nba_complete.dtypes
# Name         object
# Team         object
# Number      float64
# Position     object
# Age         float64
# Height       object
# Weight      float64
# College      object
# Salary      float64

# Convert both columns in one step; astype() returns a new frame.
data = nba_complete.astype({"Salary": int, "Number": str})

# dtypes after converting
data.dtypes
# Name         object
# Team         object
# Number       object
# Position     object
# Age         float64
# Height       object
# Weight      float64
# College      object
# Salary        int64



# Converting a Series to a list
# Rebind `data` to the unconverted null-free rows (Salary is still float64).
data = nba_raw.dropna()
type(data["Salary"]) # pandas.core.series.Series

salary_list = data["Salary"].tolist()
type(salary_list) # list

list

## DataFrame

In [42]:
# A DataFrame is a two-dimensional, size-mutable table whose columns may hold
# different dtypes.
# Like a Series it carries an index; when none is supplied a RangeIndex
# (0, 1, 2, 3, 4, ...) is used.

# --- From a dictionary: each key becomes a column ---

d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(d)
# print(df) would show:
#    col1  col2
# 0     1     3
# 1     2     4

df.dtypes
# col1    int64
# col2    int64
# dtype: object


import numpy as np
# --- Forcing one dtype onto every column ---
df = pd.DataFrame(d, dtype=np.int8)
df.dtypes
# col1    int8
# col2    int8
# dtype: object


# --- From a numpy ndarray, naming the columns explicitly ---
df2 = pd.DataFrame(
    data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    columns=list('abc'),
)
# print(df2) would show:
#    a  b  c
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9


In [44]:
# Read the NBA roster CSV into a DataFrame and preview its first five rows.
df = pd.read_csv("nba.csv")
df.head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
