# Data Analysis - Major League Baseball Data

## Series

In [114]:
import pandas as pd
import numpy as np

# Creating an empty series, will result in DeprecationWarning
#series = pd.Series()

# Passing dtype as a parameter to Series for an empty series to avoid DeprecationWarning
# Creating an empty series
series = pd.Series(dtype='float64')
# Newline to separate series print statements
print('{}\n'.format(series))


Series([], dtype: float64)



In [115]:
series = pd.Series(5)
print('{}\n'.format(series))

0    5
dtype: int64



In [116]:
series = pd.Series([1, 2, 3])
print('{}\n'.format(series))

0    1
1    2
2    3
dtype: int64



In [117]:
series = pd.Series([1, 2.2]) # upcasting
print('{}\n'.format(series))

0    1.0
1    2.2
dtype: float64



In [118]:
arr = np.array([1, 2])
series = pd.Series(arr, dtype=np.float32)
print('{}\n'.format(series))

0    1.0
1    2.0
dtype: float32



In [119]:
series = pd.Series([[1, 2], [3, 4]])
print('{}\n'.format(series))

series = pd.Series(["Vee","Pree","Fluffy"])
print('{}\n'.format(series))

0    [1, 2]
1    [3, 4]
dtype: object

0       Vee
1      Pree
2    Fluffy
dtype: object



In [120]:
# custom indexing
series = pd.Series(["Albert","Duke","Jerry"],index=[1,2,3])
print('{} \n'.format(series))

1    Albert
2      Duke
3     Jerry
dtype: object 



In [121]:
#index can be made of any static type
series = pd.Series([1, 2, 3], index=['a', 8, 0.3])
print('{}\n'.format(series))

a      1
8      2
0.3    3
dtype: int64



### Exercise

In [122]:
s1 = pd.Series([1,3,5.2])
print('{}\n'.format(s1))

s2 = s1 * pd.Series([0.1,0.2,0.3])
print('{}\n'.format(s2))

s3 = pd.Series([1,3,8,np.nan],index = ['a', 'b', 'c', 'd'])
print('{}\n'.format(s3))

s4 = pd.Series({'a':0 , 'b':1 , 'c':2})
print('{}\n'.format(s4))


0    1.0
1    3.0
2    5.2
dtype: float64

0    0.10
1    0.60
2    1.56
dtype: float64

a    1.0
b    3.0
c    8.0
d    NaN
dtype: float64

a    0
b    1
c    2
dtype: int64



## DataFrame

In [123]:
df = pd.DataFrame()

In [124]:
df = pd.DataFrame([1,2,3])
print('{}\n'.format(df))

   0
0  1
1  2
2  3



In [125]:
df = pd.DataFrame([[1,2,3]])
print('{}\n'.format(df))

   0  1  2
0  1  2  3



In [126]:
df = pd.DataFrame([[1,2],[5,6]],columns =['c1','c2'] , index =['r1','r2'])
print('{}\n'.format(df))

    c1  c2
r1   1   2
r2   5   6



In [127]:
df = pd.DataFrame({'c1':[1,2],'c2':[4,5]},index=['r1','r2'])
print('{}\n'.format(df))

    c1  c2
r1   1   4
r2   2   5



### Upcasting Dataframe

In [128]:
upcast = pd.DataFrame([[5, 6], [1.2, 3]])

In [129]:
print('{}\n'.format(upcast))
print('\n',upcast.dtypes)

     0  1
0  5.0  6
1  1.2  3


 0    float64
1      int64
dtype: object


In [130]:
df = pd.DataFrame([[5, 6], [1.2, 3]])
ser = pd.Series([0, 0], name='r3')

#df_app = df.append(ser)
#print('{}\n'.format(df_app))

### Concatinating DataFrame

In [131]:

df = pd.DataFrame([[5, 6], [1.2, 3]])
ser = pd.Series([0, 0], name='r3')

df_app = pd.concat([df,ser],ignore_index = True)
print('{}\n'.format(df_app))

     0    1
0  5.0  6.0
1  1.2  3.0
2  0.0  NaN
3  0.0  NaN



In [132]:
df2 = pd.DataFrame([[0,0],[9,9]])
df_app = pd.concat([df_app,df2])
print('{}\n'.format(df_app))



     0    1
0  5.0  6.0
1  1.2  3.0
2  0.0  NaN
3  0.0  NaN
0  0.0  0.0
1  9.0  9.0



### Dropping Data

In [133]:
df = pd.DataFrame({'c1': [1, 2], 'c2': [3, 4],
                   'c3': [5, 6]},
                  index=['r1', 'r2'])
print('{}\n'.format(df))


    c1  c2  c3
r1   1   3   5
r2   2   4   6



In [134]:
df_drop = df.drop(labels='r1')
print('{}\n'.format(df_drop))


    c1  c2  c3
r2   2   4   6



In [135]:

# Drop columns c1, c3
df_drop = df.drop(labels=['c1', 'c3'], axis=1)
print('{}\n'.format(df_drop))


    c2
r1   3
r2   4



In [136]:


df_drop = df.drop(columns='c2')
print('{}\n'.format(df_drop))


    c1  c3
r1   1   5
r2   2   6



In [137]:

df.drop(index='r2', columns='c2')
print('{}\n'.format(df_drop))

    c1  c3
r1   1   5
r2   2   6



In [138]:
df =pd.DataFrame({'c1':[0,1,2,3],'c2':[5,6,7,8]},index=['r1', 'r2', 'r3', 'r4'])
print('{}\n'.format(df))

    c1  c2
r1   0   5
r2   1   6
r3   2   7
r4   3   8



In [139]:
row_df =  pd.DataFrame([[9,9]],columns=['c1', 'c2'],index=['r5'])
print('{}\n'.format(row_df))

    c1  c2
r5   9   9



In [140]:
#df_app = df.append(row_df)
df_app = pd.concat([df,row_df])
df_drop = df_app.drop(labels='r2')

print('{}\n'.format(df_app))
print('{}\n'.format(df_drop))

    c1  c2
r1   0   5
r2   1   6
r3   2   7
r4   3   8
r5   9   9

    c1  c2
r1   0   5
r3   2   7
r4   3   8
r5   9   9



## Combining

In [143]:
df1 = pd.DataFrame({'c1':[1,2], 'c2':[3,4]},
                   index=['r1','r2'])
df2 = pd.DataFrame({'c1':[5,6], 'c2':[7,8]},
                   index=['r1','r2'])
df3 = pd.DataFrame({'c1':[5,6], 'c2':[7,8]})


In [144]:

concat = pd.concat([df1, df2], axis=1)
# Newline to separate print statements
print('{}\n'.format(concat))

concat = pd.concat([df2, df1, df3])
print('{}\n'.format(concat))

concat = pd.concat([df1, df3], axis=1)
print('{}\n'.format(concat))

    c1  c2  c1  c2
r1   1   3   5   7
r2   2   4   6   8

    c1  c2
r1   5   7
r2   6   8
r1   1   3
r2   2   4
0    5   7
1    6   8

     c1   c2   c1   c2
r1  1.0  3.0  NaN  NaN
r2  2.0  4.0  NaN  NaN
0   NaN  NaN  5.0  7.0
1   NaN  NaN  6.0  8.0



### Merging

In [145]:
mlb_df1 = pd.DataFrame({'name': ['john doe', 'al smith', 'sam black', 'john doe'],
                        'pos': ['1B', 'C', 'P', '2B'],
                        'year': [2000, 2004, 2008, 2003]})
mlb_df2 = pd.DataFrame({'name': ['john doe', 'al smith', 'jack lee'],
                        'year': [2000, 2004, 2012],
                        'rbi': [80, 100, 12]})
  

In [149]:
                      
print('{}\n'.format(mlb_df1))
print('{}\n'.format(mlb_df1))

mlb_merged = pd.merge(mlb_df1, mlb_df2)
print('{}\n'.format(mlb_merged))

        name pos  year
0   john doe  1B  2000
1   al smith   C  2004
2  sam black   P  2008
3   john doe  2B  2003

        name pos  year
0   john doe  1B  2000
1   al smith   C  2004
2  sam black   P  2008
3   john doe  2B  2003

       name pos  year  rbi
0  john doe  1B  2000   80
1  al smith   C  2004  100



In [150]:
def concat_rows(df1, df2):
  row_concat = pd.concat([df1,df2])
  return row_concat

In [151]:
def concat_cols(df1, df2):
  col_concat = pd.concat([df1,df2],axis = 1)
  return col_concat

In [164]:
def merge_dfs(df1, df2):
  merged_df = pd.merge(df1,df2)
  return merged_df

## Indexing

### Direct Indexing

In [165]:
df = pd.DataFrame({'name':['albert','rob','timothy'],'age':[30,26,55]})
print('{}\n'.format(df))

      name  age
0   albert   30
1      rob   26
2  timothy   55



In [166]:
col1 = df['name']
#output is a series
print('{}\n'.format(col1))


0     albert
1        rob
2    timothy
Name: name, dtype: object



In [167]:

col1_df = df[['age']]
#output is a dataframe
print('{}\n'.format(col1_df))


   age
0   30
1   26
2   55



In [168]:

col23 = df[['name', 'age']]
print('{}\n'.format(col23))

      name  age
0   albert   30
1      rob   26
2  timothy   55



In [169]:
df = pd.DataFrame({'c1': [1, 2, 3], 'c2': [4, 5, 6],
                   'c3': [7, 8, 9]}, index=['r1', 'r2', 'r3'])


In [170]:
first_two_rows = df[0:2]
print('{}\n'.format(first_two_rows))

    c1  c2  c3
r1   1   4   7
r2   2   5   8



In [171]:
last_two_rows = df['r2':'r3']
print('{}\n'.format(last_two_rows))

    c1  c2  c3
r2   2   5   8
r3   3   6   9



In [172]:
df = pd.DataFrame({'c1': [1, 2, 3], 'c2': [4, 5, 6],
                   'c3': [7, 8, 9]}, index=['r1', 'r2', 'r3'])
                   
print('{}\n'.format(df))


    c1  c2  c3
r1   1   4   7
r2   2   5   8
r3   3   6   9



In [175]:

print('{}\n'.format(df.iloc[1])) #row 1

print('{}\n'.format(df.iloc[[0, 2]])) #row 0,2

bool_list = [False, True, True]
print('{}\n'.format(df.iloc[bool_list])) #row 0-> False , row1 -> true , row2 -> true

c1    2
c2    5
c3    8
Name: r2, dtype: int64

    c1  c2  c3
r1   1   4   7
r3   3   6   9

    c1  c2  c3
r2   2   5   8
r3   3   6   9



In [176]:
df = pd.DataFrame({'c1': [1, 2, 3], 'c2': [4, 5, 6],
                   'c3': [7, 8, 9]}, index=['r1', 'r2', 'r3'])
                   
print('{}\n'.format(df))


    c1  c2  c3
r1   1   4   7
r2   2   5   8
r3   3   6   9



In [177]:

print('{}\n'.format(df.loc['r2']))

bool_list = [False, True, True]
print('{}\n'.format(df.loc[bool_list]))

single_val = df.loc['r1', 'c2']
print('Single val: {}\n'.format(single_val))

print('{}\n'.format(df.loc[['r1', 'r3'], 'c2']))

df.loc[['r1', 'r3'], 'c2'] = 0
print('{}\n'.format(df))

c1    2
c2    5
c3    8
Name: r2, dtype: int64

    c1  c2  c3
r2   2   5   8
r3   3   6   9

Single val: 4

r1    4
r3    6
Name: c2, dtype: int64

    c1  c2  c3
r1   1   0   7
r2   2   5   8
r3   3   0   9

