In [2]:
#The DataFrame data structure is the heart of the Panda's library. It's a primary object that you'll be working with in data
#analysis and cleaning tasks.

#The DataFrame is conceptually a two-dimensional series object, where there's an index and multiple columns of content,
#with each column having a label. In fact, the distinction between a column and a row is really only a conceptual distinction.
#And you can think of the DataFrame itself as simply a two-axes labeled array.

In [3]:
#Pandas is a popular Python package for data science, and with good reason: it offers powerful, expressive and flexible data
#structures that make data manipulation and analysis easy, among many other things. The DataFrame is one of these structures.


In [5]:
import pandas as pd

#Lets create three school records for students and their class grades.
#I'll create each as a series which has a student name, the class name, and the score. 

record1 = pd.Series({'Name': 'Alice',
                        'Class': 'Physics',
                        'Score': 85})
record2 = pd.Series({'Name': 'Jack',
                        'Class': 'Chemistry',
                        'Score': 82})
record3 = pd.Series({'Name': 'Helen',
                        'Class': 'Biology',
                        'Score': 90})

# Like a Series, the DataFrame object is index. Here I'll use a group of series, where each series 
# represents a row of data. Just like the Series function, we can pass in our individual items
# in an array, and we can pass in our index values as a second arguments

df = pd.DataFrame([record1, record2, record3],
                  index=['school1', 'school2','school1'])
# And just like the Series we can use the head() function to see the first several rows of the
# dataframe, including indices from both axes, and we can use this to verify the columns and the rows
df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [4]:
# An alternative method is that you could use a list of dictionaries, where each dictionary 
# represents a row of data.
import pandas as pd
students = [{'class' : 11,
             'name' : 'Tuba',
             'ID' : 1,
             'subject' : 'CSE'},
            {'class' : 12,
             'name' : 'Tara',
             'ID' : 4,
             'subject' : 'EEE'},
            {'class' : 9,
             'name' : 'Kali',
             'ID' : 6,
             'subject' : 'ETT'},
            
            
           ]
df = pd.DataFrame(students,index=['school1', 'school2','school1'])
df

Unnamed: 0,class,name,ID,subject
school1,11,Tuba,1,CSE
school2,12,Tara,4,EEE
school1,9,Kali,6,ETT


In [16]:
# Similar to the series, we can extract data using the .iloc and .loc attributes. Because the 
# DataFrame is two-dimensional, passing a single value to the loc indexing operator will return 
# the series if there's only one row to return.

df.loc['school2']

class        12
name       Tara
ID            4
subject     EEE
Name: school2, dtype: object

In [17]:
df.iloc[1]

class        12
name       Tara
ID            4
subject     EEE
Name: school2, dtype: object

In [18]:
# You'll note that the name of the series is returned as the index value, while the column 
# name is included in the output.

# We can check the data type of the return using the python type function.
type(df.loc['school2'])

pandas.core.series.Series

In [20]:
#In this example, we see two records for school1 as different rows.
# If we use a single value with the DataFrame lock attribute, multiple rows of the DataFrame will 
# return, not as a new series, but as a new DataFrame.

df.loc['school1']

Unnamed: 0,class,name,ID,subject
school1,11,Tuba,1,CSE
school1,9,Kali,6,ETT


In [21]:
# For instance, if you wanted to just list the student names for school1, you would supply two 
# parameters to .loc, one being the row index and the other being the column name.

# For instance, if we are only interested in school1's student names
df.loc['school1','name']

school1    Tuba
school1    Kali
Name: name, dtype: object

In [6]:
# What would we do if we just wanted to select a single column though? Well, there are a few
# mechanisms. Firstly, we could transpose the matrix. This pivots all of the rows into columns
# and all of the columns into rows, and is done with the T attribute

df.T

Unnamed: 0,school1,school2,school1.1
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,82,90


In [7]:
df.T.loc['Name']

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object

In [8]:
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [9]:
df['Class']  #for getting column

school1      Physics
school2    Chemistry
school1      Biology
Name: Class, dtype: object

In [10]:
# Note too that the result of a single column projection is a Series object
type(df['Name'])

pandas.core.series.Series

In [11]:
# Since the result of using the indexing operator is either a DataFrame or Series, you can chain 
# operations together. For instance, we can select all of the rows which related to school1 using
# .loc, then project the name column from just those rows

df.loc['school1']['Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [12]:
# As we saw, .loc does row selection, and it can take two parameters, 
# the row index and the list of column names. The .loc attribute also supports slicing.

# If we wanted to select all rows, we can use a colon to indicate a full slice from beginning to end. 
# This is just like slicing characters in a list in python. Then we can add the column name as the 
# second parameter as a string. If we wanted to include multiple columns, we could do so in a list. 
# and Pandas will bring back only the columns we have asked for.

# Here's an example, where we ask for all the names and scores for all schools using the .loc operator.



df.loc[:,['Name', 'Score']]

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,82
school1,Helen,90


In [13]:
# Take a look at that again. The colon means that we want to get all of the rows, and the list
# in the second argument position is the list of columns we want to get back

In [14]:
# It's easy to delete data in Series and DataFrames, and we can use the drop function to do so. 
# This function takes a single parameter, which is the index or row label, to drop. This is another 
# tricky place for new users -- the drop function doesn't change the DataFrame by default! Instead,
# the drop function returns to you a copy of the DataFrame with the given rows removed.

df.drop('school1')

Unnamed: 0,Name,Class,Score
school2,Jack,Chemistry,82


In [15]:
# But if we look at our original DataFrame we see the data is still intact.
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [23]:
# Drop has two interesting optional parameters. The first is called inplace, and if it's 
# set to true, the DataFrame will be updated in place, instead of a copy being returned. 
# The second parameter is the axes, which should be dropped. By default, this value is 0, 
# indicating the row axis. But you could change it to 1 if you want to drop a column.

# For example, lets make a copy of a DataFrame using .copy()

copy_df = df.copy()
copy_df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [24]:
# df_copy.drop("Name", inplace=True, axis=1)
del copy_df['Class']
copy_df

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,82
school1,Helen,90


In [25]:
# Finally, adding a new column to the DataFrame is as easy as assigning it to some value using
# the indexing operator. For instance, if we wanted to add a class ranking column with default 
# value of None, we could do so by using the assignment operator after the square brackets.
# This broadcasts the default value to the new column immediately.

df['ClassRanking'] = None
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,82,
school1,Helen,Biology,90,


In [27]:
df['ClassRanking']['school1']='high'  #assigning value
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ClassRanking']['school1']='high'


Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,high
school2,Jack,Chemistry,82,
school1,Helen,Biology,90,high
