In [57]:
# The DataFrame data structure is the heart of the pandas library.
# The DataFrame is conceptually a two-dimensional Series object, where there's an index and multiple columns of content, with each column having a label.
# You can think of the DataFrame itself as simply a two-axes labeled array.

In [58]:
import pandas as pd

In [59]:
# Each student record is a Series; its keys ('Name', 'Class', 'Score') label the values.
# Scores are stored as integers rather than quoted strings so they stay numeric:
# adding string scores would concatenate the text instead of summing the values.
record1=pd.Series(
    {'Name':'Alice',
     'Class':'phy',
     'Score':85}
)

record2=pd.Series(
    {'Name':'jack',
     'Class':'chem',
     'Score':82
    }
)

record3=pd.Series(
    {'Name':'vanessa',
     'Class':'bio',
     'Score':90
     }
)

In [60]:
# Stack the three Series as the rows of a DataFrame; the labels passed as
# `index` name the rows in the same order as the records are listed.
df = pd.DataFrame(
    data=[record1, record2, record3],
    index=['school1', 'school2', 'school3'],
)
df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,phy,85
school2,jack,chem,82
school3,vanessa,bio,90


In [61]:
import pandas as pd
# An alternative construction: a list of dictionaries, one dictionary per row.
# Scores are stored as integers rather than quoted strings so the Score column
# is numeric (summing string scores would concatenate the text instead).
students=[
    {'Name':'alice',
     'Class':'phy',
     'Score':89
    },
    {'Name':'jack',
     'Class':'chem',
     'Score':80
    },
    {'Name':'helen',
     'Class':'bio',
     'Score':87
    }
]

# the index labels are applied to the rows in list order
df=pd.DataFrame(students, index=['school1','school2','school3'])
df.head()

Unnamed: 0,Name,Class,Score
school1,alice,phy,89
school2,jack,chem,80
school3,helen,bio,87


In [62]:
# .loc with a single row label returns that row as a Series whose index holds the column names
df.loc['school2']

Name     jack
Class    chem
Score      80
Name: school2, dtype: object

In [63]:
# we can also check the Python type of the returned row: it is a pandas Series
type(df.loc['school2'])

pandas.core.series.Series

In [64]:
# passing both a row label and a column label to .loc selects a single value
df.loc['school1','Name']

'alice'

In [65]:
# To extract just a single column, there are a few mechanisms. Firstly, we could transpose the matrix.
# This pivots all of the rows into columns and all of the columns into rows, and is done with the T attribute.

df.T

Unnamed: 0,school1,school2,school3
Name,alice,jack,helen
Class,phy,chem,bio
Score,89,80,87


In [66]:
# Then we call .loc on the transpose to get the student names only.
# The result is a Series indexed by the original row labels (school1, school2, school3).
df.T.loc['Name']

school1    alice
school2     jack
school3    helen
Name: Name, dtype: object

In [67]:
# The .loc attribute, just like for Series, works at the row level.
# If you pass it a column label, the label is looked up among the row labels and a KeyError is raised.
# The error is caught and printed so the demonstration does not abort a Restart & Run All.
try:
    df.loc['Name']
except KeyError as err:
    print("KeyError:", err)

KeyError: 'Name'

In [28]:
# the result of a single column projection (indexing the DataFrame with a column label) is a Series object
type(df['Name'])

pandas.core.series.Series

In [31]:
# Since the result of using the indexing operator is either a DataFrame or a Series, you can chain operations together.
# For instance, we can select the row labeled school1 using the .loc attribute, then project the Name entry from just that row.
# This returns the same value as the single call df.loc['school1','Name'].
df.loc['school1']['Name']

'alice'

In [33]:
# the first step of the chain (.loc with a row label) yields a Series ...
print(type(df.loc['school1']))
# ... and indexing that Series with 'Name' yields the stored value itself, a plain Python string
print(type(df.loc['school1']['Name']))

<class 'pandas.core.series.Series'>
<class 'str'>


DATAFRAME INDEXING

In [34]:
# The Jupyter notebook uses IPython as the kernel underneath, which provides convenient ways to integrate lower-level shell commands,
# which are programs run in the underlying operating system.

In [None]:
# "cat" (short for concatenate) just outputs the contents of a file. In IPython, when we prepend a line with an exclamation mark,
# it executes the remainder of the line as a shell command.
# NOTE(review): this prints the whole file, and "cat" presumably requires a Unix-like shell - confirm for your platform.

!cat Admission_Predict.csv

Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR ,CGPA,Research,Chance of Admit 
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4,4.5,8.87,1,0.76
3,316,104,3,3,3.5,8,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2,3,8.21,0,0.65
6,330,115,5,4.5,3,9.34,1,0.9
7,321,109,3,3,4,8.2,1,0.75
8,308,101,2,3,4,7.9,0,0.68
9,302,102,1,2,1.5,8,0,0.5
10,323,108,3,3.5,3,8.6,0,0.45
11,325,106,3,3.5,4,8.4,1,0.52
12,327,111,4,4,4.5,9,1,0.84
13,328,112,4,4,4.5,9.1,1,0.78
14,307,109,3,4,3,8,1,0.62
15,311,104,3,3.5,2,8.2,1,0.61
16,314,105,3,3.5,2.5,8.3,0,0.54
17,317,107,3,4,3,8.7,0,0.66
18,319,106,3,4,3,8,1,0.65
19,318,110,3,4,3,8.8,0,0.63
20,303,102,3,3.5,3,8.5,0,0.62
21,312,107,3,3,2,7.9,1,0.64
22,325,114,4,3,2,8.4,0,0.7
23,328,116,5,5,5,9.5,1,0.94
24,334,119,5,5,4.5,9.7,1,0.95
25,336,119,5,4,3.5,9.8,1,0.97
26,340,120,5,4.5,4.5,9.6,1,0.94
27,322,109,5,4.5,3.5,8.8,0,0.76
28,298,98,2,1.5,2.5,7.5,1,0.44
29,295,93,1,2,2,7.2,0,0.46
30,310,99,2,1.5,2,7.3,0,0.54
31,300,97,2,3,3,8.1,1,0.65
32,327,103,3,

In [53]:
import pandas as pd
# Load the CSV file into a DataFrame. This rebinds df, replacing the students table built earlier.
df=pd.read_csv('Admission_Predict.csv')
# show the first rows; the leftmost labels 0, 1, 2, ... are the row index, not a column of the file
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [84]:
# The default index here starts at 0 while the students' serial numbers start at 1. If you jump back to the CSV output,
# you'll deduce that pandas has created a new index.
# Instead, we can use the serial number as the index by passing the position of that column via the index_col parameter.

df=pd.read_csv("Admission_Predict.csv", index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [87]:
# Renaming the columns: rename takes a dictionary mapping existing column labels to new ones.
# The result is stored in new_df; df itself keeps its original labels.
new_df = df.rename(columns={
    'GRE Score': 'GRE Score',  # maps the label to itself, so this column keeps its name
    'SOP': 'Statement of Purpose',
    'LOR': 'Letter of Recommendation'  # only takes effect if a column is named exactly 'LOR'
})
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of Purpose,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [89]:
# LOR isn't changed, so inspect the exact column labels
new_df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'Statement of Purpose',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [92]:
# If we look at the output closely, we can see that there is actually a space right after "LOR" and a space
# right after "Chance of Admit". Sneaky, huh? So this is why our rename dictionary does not work for LOR,
# because the key we used was just three characters, instead of "LOR ".

# There are a couple of ways we could address this. One way would be to change a column by including the space
# in the name.

new_df=new_df.rename(columns={'LOR ':'Letter of Recommendation'})
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of Purpose,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [94]:
# Another way is to create some function that does the cleaning and then tell rename to apply that function across all of the labels.
# We use the string method strip() for this.
# When we pass this in to rename, we pass the function as the mapper parameter, and then indicate whether the axis should be the columns or the index (row labels).

new_df=new_df.rename(mapper=str.strip,axis='columns')
new_df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of Purpose,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [96]:
# A third option is to assign a list of names straight to the df.columns attribute.
# Unlike rename, this modifies the existing DataFrame rather than producing a new one.
# Because the new labels are derived from the current ones, subtle differences in the old
# names (such as the trailing spaces seen above) cannot cause a silent mismatch.
# A list comprehension rewrites every label at once; a list index could change just one.

cols = [label.lower().strip() for label in df.columns]
df.columns = cols
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65
