In [13]:
import pandas as pd

In [14]:
# Each student record is a Series whose index labels are Name, Class and Score.
# Scores are stored as integers rather than strings so they can take part in
# numeric operations (comparisons, sums, means) instead of string operations.
record1 = pd.Series(
    {'Name': 'Alice',
     'Class': 'phy',
     'Score': 85}
)

record2 = pd.Series(
    {'Name': 'jack',
     'Class': 'chem',
     'Score': 82}
)

record3 = pd.Series(
    {'Name': 'vanessa',
     'Class': 'bio',
     'Score': 90}
)

In [15]:
# combine the three Series into one DataFrame: each Series becomes a row,
# and the index argument supplies the label for each of those rows
df = pd.DataFrame(
    data=[record1, record2, record3],
    index=['school1', 'school2', 'school3'],
)
df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,phy,85
school2,jack,chem,82
school3,vanessa,bio,90


In [16]:
import pandas as pd

# An alternative way to build the same kind of DataFrame: a list of dictionaries,
# where each dictionary is one row and its keys are the column names.
# Scores are integers (not strings) so the Score column is numeric and supports
# arithmetic such as sum() and mean().
students = [
    {'Name': 'alice',
     'Class': 'phy',
     'Score': 89},
    {'Name': 'jack',
     'Class': 'chem',
     'Score': 80},
    {'Name': 'helen',
     'Class': 'bio',
     'Score': 87},
]

# the index argument labels the rows in the same order as the list above
df = pd.DataFrame(students, index=['school1', 'school2', 'school3'])
df.head()

Unnamed: 0,Name,Class,Score
school1,alice,phy,89
school2,jack,chem,80
school3,helen,bio,87


In [17]:
# .loc with a single row label selects that row from the DataFrame
df.loc['school2']

Name     jack
Class    chem
Score      80
Name: school2, dtype: object

In [18]:
# we can also check the type of the object that .loc returned
type(df.loc['school2'])

pandas.core.series.Series

In [19]:
# .loc also accepts a row label and a column label together, which selects a single value
df.loc['school1','Name']

'alice'

In [20]:
# to extract just a single column, there are a few mechanisms. firstly, we could transpose the DataFrame.
# this pivots all of the rows into columns & all of the columns into rows, & is done with the T attribute

df.T

Unnamed: 0,school1,school2,school3
Name,alice,jack,helen
Class,phy,chem,bio
Score,89,80,87


In [21]:
# then we call .loc on the transpose to get the student names only
# (after transposing, 'Name' is a row label, so .loc can select it)
df.T.loc['Name']

school1    alice
school2     jack
school3    helen
Name: Name, dtype: object

In [23]:
# the indexing operator [] with a column label projects that column;
# the result of a single column projection is a Series object
type(df['Name'])

pandas.core.series.Series

In [24]:
# since the result of using the indexing operator is either a DataFrame or a Series, you can chain operations together.
# for instance, we can select the row labelled school1 using the .loc attribute and then project the Name value from just that row
df.loc['school1']['Name']

'alice'

In [25]:
# inspect each step of the chain: the row selected by .loc, then the value projected from that row
print(type(df.loc['school1']))
print(type(df.loc['school1']['Name']))

<class 'pandas.core.series.Series'>
<class 'str'>


DATAFRAME INDEXING

In [26]:
# the jupyter notebook uses ipython as the kernel underneath, which provides convenient ways to integrate lower-level shell commands,
# i.e. programs that are run by the underlying operating system

In [27]:
# "cat", for concatenation, which just outputs the contents of a file. in ipython, we prepend the line w an exclamation mark 
# it'll execute the remainder of the line as a shell command

!cat Admission_Predict.csv

cat: Admission_Predict.csv: No such file or directory


In [29]:
import pandas as pd
# read the CSV file into a DataFrame and preview the first rows
df=pd.read_csv('resources/Admission_Predict.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [31]:
# the default index here starts with 0 while the students' serial numbers start from 1. if you compare this with the
# CSV contents you'll deduce that pandas has created a new index.
# instead, we can use the serial no. as the index by passing its column position via the index_col parameter

df=pd.read_csv("resources/Admission_Predict.csv", index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [32]:
# renaming the columns: rename takes a dictionary mapping existing column labels to new labels
# and the result is assigned to a new variable, new_df
new_df = df.rename(columns={
    'GRE Score': 'GRE Score', 
    'SOP': 'Statement of Purpose',
    'LOR': 'Letter of Recommendation'
})
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of Purpose,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [33]:
# LOR isn't changed, so let's inspect the exact column labels
new_df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'Statement of Purpose',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [34]:
# If we look at the output closely, we can see that there is actually a space right after "LOR" and a space
# right after "Chance of Admit". Sneaky, huh? So this is why our rename dictionary does not work for LOR,
# because the key we used was just three characters, instead of "LOR "

# There are a couple of ways we could address this. One way would be to change a column by including the space
# in the name

new_df=new_df.rename(columns={'LOR ':'Letter of Recommendation'})
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of Purpose,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [35]:
# another way is to create some function that does the cleaning and then tell rename to apply that function across all of the labels.
# we use "strip()" for this.
# when we pass this in to rename we pass the function as the mapper parameter, & then indicate whether the axis should be the columns or the index (row labels)

new_df=new_df.rename(mapper=str.strip,axis='columns')
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of Purpose,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [36]:
# We can also assign a list of column names to the df.columns attribute, which renames the
# columns directly. This modifies the original dataframe itself. Because the new names are
# computed from whatever labels are currently there, this technique is not affected by subtle
# errors in the column names, a problem that we just encountered. With a list, you can use the
# list index to change a certain value or use a list comprehension to change all of the values.

# lower-case every existing label and trim surrounding whitespace in a single pass
cols = [label.lower().strip() for label in df.columns]
df.columns = cols
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65
