# The Series Data Structure

In [1]:
import pandas as pd

In [2]:
student = ['Alice','Jack','Molly']

In [3]:
pd.Series(student)

0    Alice
1     Jack
2    Molly
dtype: object

In [4]:
number = [1,2,3,4]

In [5]:
pd.Series(number)

0    1
1    2
2    3
3    4
dtype: int64

In [6]:
student = ['Alice','Jack',None]

In [7]:
pd.Series(student)

0    Alice
1     Jack
2     None
dtype: object

In [8]:
number = [1,2,3,None]

In [9]:
pd.Series(number)

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [10]:
import numpy as np


In [11]:
np.nan==None

False

In [12]:
np.nan==np.nan

False

In [13]:
np.isnan(np.nan)

True

In [14]:
studen_score = {
    'Alice':'Physics',
    'Jack' : 'Math',
    'Molly' : 'English'
}

In [15]:
s = pd.Series(studen_score)

In [16]:
s

Alice    Physics
Jack        Math
Molly    English
dtype: object

In [17]:
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [18]:
student = [('Alice','Brown'),('Jack','White'),('Jack','Molly')]

In [19]:
student

[('Alice', 'Brown'), ('Jack', 'White'), ('Jack', 'Molly')]

In [20]:
s1 = pd.Series(student)

In [21]:
s1.index

RangeIndex(start=0, stop=3, step=1)

In [22]:
s2 = pd.Series(['Physics','Chemistry','English'], index=['Alice','Jack','Molly'])

In [23]:
s2

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [24]:
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [25]:
s2.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [26]:
student_score ={
    'Alice':'Physics',
    'Jack':'Chemistry',
    'Molly':'English'
}

In [27]:
student_score

{'Alice': 'Physics', 'Jack': 'Chemistry', 'Molly': 'English'}

In [28]:
s3 = pd.Series(student_score,index=['Alice','Jack','Shohel'])

In [29]:
s3

Alice       Physics
Jack      Chemistry
Shohel          NaN
dtype: object

In [30]:
s3.index

Index(['Alice', 'Jack', 'Shohel'], dtype='object')

In [31]:
s3 = pd.Series(student_score,index=['Alice','Jack','Shohel','Molly'])

In [32]:
s3

Alice       Physics
Jack      Chemistry
Shohel          NaN
Molly       English
dtype: object

# Querying a Series

In [33]:
students_classes = {
    'Alice':'Physics',
    'Jack':'Chemistry',
    'Molly':'English',
    'Sam':'History'
}

In [34]:
s = pd.Series(students_classes)

In [35]:
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [36]:
s.iloc[3]

'History'

In [37]:
s.iloc[2]

'English'

In [38]:
s.loc['Molly']

'English'

In [39]:
s[2]

'English'

In [40]:
s['Alice']

'Physics'

In [41]:
class_codes = {
    100:'Physics',
    101: 'Chemistry',
    102: 'English',
    103: 'History'
}

In [42]:
s4 = pd.Series(class_codes)

In [43]:
s4

100      Physics
101    Chemistry
102      English
103      History
dtype: object

In [44]:
#s4[0]

In [45]:
grades = pd.Series([90,80,70,60])

In [46]:
total = 0
for i in grades:
    total += i

In [47]:
total

300

In [48]:
total/len(grades)

75.0

In [49]:
total = np.sum(grades)

In [50]:
total

300

In [51]:
total/len(grades)

75.0

In [52]:
numbers = pd.Series(np.random.randint(10,1000,100000))

In [53]:
numbers.head()

0    743
1    523
2    127
3    875
4    993
dtype: int64

In [54]:
len(numbers)

100000

In [55]:
%%timeit -n 100
total = 0
for i in numbers:
    total += i
    
total/len(numbers)

16.7 ms ± 2.08 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [56]:
%%timeit -n 100
total = np.sum(numbers)
total/len(numbers)

144 µs ± 21.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [57]:
numbers +=2

In [58]:
numbers.head(3)

0    745
1    525
2    129
dtype: int64

In [59]:
for label,value in numbers.iteritems():
    
    numbers.at[label]=value+2

In [60]:
numbers.head(3)

0    747
1    527
2    131
dtype: int64

In [61]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,1000))

for label,value in s.iteritems():
    s.loc[label] = value+2


36.5 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [62]:
%%timeit -n 10
s1 = pd.Series(np.random.randint(0,1000,1000))
s1 += 2

255 µs ± 134 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [63]:
s = pd.Series([1,2,3])
s.loc['History'] = 5

In [64]:
s

0          1
1          2
2          3
History    5
dtype: int64

In [65]:
students_classes =pd.Series({
    'Alice':'Physics',
    'Jack':'Chemistry',
    'Molly':'English',
    'Sam':'History'
})

In [66]:
kally_classes = pd.Series(['Philosopy','Arts','CSE'], index=['kally','kally','kally'])

In [67]:
kally_classes

kally    Philosopy
kally         Arts
kally          CSE
dtype: object

In [68]:
all_Student_classes = students_classes.append(kally_classes)

  all_Student_classes = students_classes.append(kally_classes)


In [69]:
all_Student_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
kally    Philosopy
kally         Arts
kally          CSE
dtype: object

In [70]:
all_Student_classes.loc['kally']

kally    Philosopy
kally         Arts
kally          CSE
dtype: object

# DataFrame Data Structure

In [71]:
record1 = pd.Series({
    'name':'Alice',
    'class': 'Physics',
    'score': 85
})

In [72]:
record2 = pd.Series({
    'name':'Jack',
    'class': 'Chemistry',
    'score':83
})

In [73]:
record3 = pd.Series({
    'name': 'Halen',
    'class': 'Biology',
    'score':80
})

In [74]:
df1 = pd.DataFrame([record1,record2, record3])

In [75]:
df1

Unnamed: 0,name,class,score
0,Alice,Physics,85
1,Jack,Chemistry,83
2,Halen,Biology,80


In [76]:
df2 = pd.DataFrame([record1,record2, record3],index=['school1','school2','school4'])

In [77]:
df2

Unnamed: 0,name,class,score
school1,Alice,Physics,85
school2,Jack,Chemistry,83
school4,Halen,Biology,80


In [78]:
df2.head()

Unnamed: 0,name,class,score
school1,Alice,Physics,85
school2,Jack,Chemistry,83
school4,Halen,Biology,80


In [79]:
df2.iloc[0]

name       Alice
class    Physics
score         85
Name: school1, dtype: object

In [80]:
df2['name']

school1    Alice
school2     Jack
school4    Halen
Name: name, dtype: object

In [81]:
students = [
    {'name':'Alice', 'class':'Physics','score':86},
    {'name':'Jack', 'class':'Chemistry','score':85},
    {'name':'Halen', 'class':'Biology','score':82},


]

In [82]:
df3 = pd.DataFrame(students)

In [83]:
df3

Unnamed: 0,name,class,score
0,Alice,Physics,86
1,Jack,Chemistry,85
2,Halen,Biology,82


In [84]:
df4= pd.DataFrame(students, index=['school1','school2','school3'])

In [85]:
df4


Unnamed: 0,name,class,score
school1,Alice,Physics,86
school2,Jack,Chemistry,85
school3,Halen,Biology,82


In [86]:
students_2  = {
    'name':['Shohel','Shakil','Shakib'],
    'class': ['cse','eee','civil'],
    'score': [90,80,70]
}

In [87]:
df5 = pd.DataFrame(students_2)

In [88]:
df5.head()

Unnamed: 0,name,class,score
0,Shohel,cse,90
1,Shakil,eee,80
2,Shakib,civil,70


In [89]:
df6 = pd.DataFrame(students_2,index = ['school1','school2','school3'])

In [90]:
df6

Unnamed: 0,name,class,score
school1,Shohel,cse,90
school2,Shakil,eee,80
school3,Shakib,civil,70


In [91]:
df6.loc['school1']

name     Shohel
class       cse
score        90
Name: school1, dtype: object

In [92]:
type(df6.loc['school3'])

pandas.core.series.Series

In [93]:
df7 = pd.DataFrame(students_2,index = ['school1','school2','school1'])

In [94]:
df7.loc['school1']

Unnamed: 0,name,class,score
school1,Shohel,cse,90
school1,Shakib,civil,70


In [95]:
type(df7.loc['school1'])

pandas.core.frame.DataFrame

In [96]:
type(df7.loc['school2'])

pandas.core.series.Series

In [97]:
df6.loc['school2','name']

'Shakil'

In [98]:
df7.loc['school1','name']

school1    Shohel
school1    Shakib
Name: name, dtype: object

In [99]:
df7.T

Unnamed: 0,school1,school2,school1.1
name,Shohel,Shakil,Shakib
class,cse,eee,civil
score,90,80,70


In [100]:
df7.T.loc['name']

school1    Shohel
school2    Shakil
school1    Shakib
Name: name, dtype: object

In [101]:
df7['name']

school1    Shohel
school2    Shakil
school1    Shakib
Name: name, dtype: object

In [102]:
type(df7['name'])

pandas.core.series.Series

In [103]:
df7.loc['school1']['name']

school1    Shohel
school1    Shakib
Name: name, dtype: object

In [104]:
df7.loc[:,['name','score']]

Unnamed: 0,name,score
school1,Shohel,90
school2,Shakil,80
school1,Shakib,70


In [105]:
df7.drop('school1')

Unnamed: 0,name,class,score
school2,Shakil,eee,80


In [106]:
df7_new = df7.copy()

In [107]:
df7_new.drop('name', inplace=True,axis=1)

In [108]:
df7_new

Unnamed: 0,class,score
school1,cse,90
school2,eee,80
school1,civil,70


In [109]:
df7_new['classranking'] = None

In [110]:
df7_new

Unnamed: 0,class,score,classranking
school1,cse,90,
school2,eee,80,
school1,civil,70,


In [111]:
del df7_new['class']

In [112]:
df7_new

Unnamed: 0,score,classranking
school1,90,
school2,80,
school1,70,


In [113]:
df7_new['name'] = ['Shohel','Riyad','Hidoy']

In [114]:
df7_new

Unnamed: 0,score,classranking,name
school1,90,,Shohel
school2,80,,Riyad
school1,70,,Hidoy


# DataFrame Indexing and Loading

In [115]:
df = pd.read_csv('admission.csv', index_col=0)

In [116]:
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [117]:
df.shape

(400, 8)

In [118]:
df.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            391, 392, 393, 394, 395, 396, 397, 398, 399, 400],
           dtype='int64', name='Serial No.', length=400)

In [119]:
df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [120]:
new_df = df.rename(columns={'GRE Score':'GRE Score','TOEFL Score':'TOEFL Score','University Rating':'University Rating',
                            'SOP':'Statement of purpose','LOR ':'Letter Of recommendation','CGPA':'CGPA','Research':'Research',
                            'Chance of Admit':'Chance of Admit'})

In [121]:
df

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91
399,312,103,3,3.5,4.0,8.78,0,0.67


In [122]:
new_df

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of purpose,Letter Of recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91
399,312,103,3,3.5,4.0,8.78,0,0.67


In [123]:
df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [124]:
new_df = new_df.rename(mapper=str.strip,axis='columns')

In [125]:
new_df

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of purpose,Letter Of recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91
399,312,103,3,3.5,4.0,8.78,0,0.67


In [126]:
new_df = df.rename(mapper=str.strip,axis='columns')

In [127]:
new_df

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91
399,312,103,3,3.5,4.0,8.78,0,0.67


In [128]:
cols = list(df.columns)
cols = [x.lower().strip() for x in cols]

In [129]:
cols

['gre score',
 'toefl score',
 'university rating',
 'sop',
 'lor',
 'cgpa',
 'research',
 'chance of admit']

In [130]:
df

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91
399,312,103,3,3.5,4.0,8.78,0,0.67


# Querying a DataFrame

In [131]:
df2 = pd.read_csv('admission.csv',index_col=0)

In [132]:
df2.columns = [x.lower().strip() for x in df.columns]

In [133]:
df2

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91
399,312,103,3,3.5,4.0,8.78,0,0.67


In [134]:
admin_mask = df2['chance of admit'] > 0.7

In [135]:
admin_mask

Serial No.
1       True
2       True
3       True
4       True
5      False
       ...  
396     True
397     True
398     True
399    False
400     True
Name: chance of admit, Length: 400, dtype: bool

In [136]:
df2.where(admin_mask).head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,316.0,104.0,3.0,3.0,3.5,8.0,1.0,0.72
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.8
5,,,,,,,,


In [137]:
df2.where(admin_mask).dropna()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,316.0,104.0,3.0,3.0,3.5,8.00,1.0,0.72
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.80
6,330.0,115.0,5.0,4.5,3.0,9.34,1.0,0.90
...,...,...,...,...,...,...,...,...
395,329.0,111.0,4.0,4.5,4.0,9.23,1.0,0.89
396,324.0,110.0,3.0,3.5,3.5,9.04,1.0,0.82
397,325.0,107.0,3.0,3.0,3.5,9.11,1.0,0.84
398,330.0,116.0,4.0,5.0,4.5,9.45,1.0,0.91


In [138]:
df2.reset_index()

Unnamed: 0,Serial No.,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
395,396,324,110,3,3.5,3.5,9.04,1,0.82
396,397,325,107,3,3.0,3.5,9.11,1,0.84
397,398,330,116,4,5.0,4.5,9.45,1,0.91
398,399,312,103,3,3.5,4.0,8.78,0,0.67


In [139]:
df2[ df2['chance of admit'] > 0.7].head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
6,330,115,5,4.5,3.0,9.34,1,0.9


In [140]:
df2['gre score'].head()

Serial No.
1    337
2    324
3    316
4    322
5    314
Name: gre score, dtype: int64

In [141]:
df2[['gre score','toefl score']].head()

Unnamed: 0_level_0,gre score,toefl score
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,337,118
2,324,107
3,316,104
4,322,110
5,314,103


In [142]:
df2[df2['gre score'] > 320].head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
4,322,110,3,3.5,2.5,8.67,1,0.8
6,330,115,5,4.5,3.0,9.34,1,0.9
7,321,109,3,3.0,4.0,8.2,1,0.75


In [143]:
(df2['chance of admit'] > 0.7) and (df2['chance of admit'] < 0.9)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [144]:
(df2['chance of admit'] > 0.7) & (df2['chance of admit'] < 0.9)

Serial No.
1      False
2       True
3       True
4       True
5      False
       ...  
396     True
397     True
398    False
399    False
400    False
Name: chance of admit, Length: 400, dtype: bool

In [145]:
df2['chance of admit'].gt(0.7) & df2['chance of admit'].lt(0.9) 

Serial No.
1      False
2       True
3       True
4       True
5      False
       ...  
396     True
397     True
398    False
399    False
400    False
Name: chance of admit, Length: 400, dtype: bool

In [146]:
df2['chance of admit'].gt(0.7).lt(0.9)

Serial No.
1      False
2      False
3      False
4      False
5       True
       ...  
396    False
397    False
398    False
399     True
400    False
Name: chance of admit, Length: 400, dtype: bool

# Indexing Dataframes

In [147]:
df3 = pd.read_csv('admission.csv', index_col=0)

In [148]:
df3

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91
399,312,103,3,3.5,4.0,8.78,0,0.67


In [149]:
df3['Serial No.'] = df3.index

In [150]:
df3 = df3.set_index('Chance of Admit ')

In [151]:
df3

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial No.
Chance of Admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.92,337,118,4,4.5,4.5,9.65,1,1
0.76,324,107,4,4.0,4.5,8.87,1,2
0.72,316,104,3,3.0,3.5,8.00,1,3
0.80,322,110,3,3.5,2.5,8.67,1,4
0.65,314,103,2,2.0,3.0,8.21,0,5
...,...,...,...,...,...,...,...,...
0.82,324,110,3,3.5,3.5,9.04,1,396
0.84,325,107,3,3.0,3.5,9.11,1,397
0.91,330,116,4,5.0,4.5,9.45,1,398
0.67,312,103,3,3.5,4.0,8.78,0,399


In [152]:
df3 = df3.reset_index()

In [153]:
df3

Unnamed: 0,Chance of Admit,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial No.
0,0.92,337,118,4,4.5,4.5,9.65,1,1
1,0.76,324,107,4,4.0,4.5,8.87,1,2
2,0.72,316,104,3,3.0,3.5,8.00,1,3
3,0.80,322,110,3,3.5,2.5,8.67,1,4
4,0.65,314,103,2,2.0,3.0,8.21,0,5
...,...,...,...,...,...,...,...,...,...
395,0.82,324,110,3,3.5,3.5,9.04,1,396
396,0.84,325,107,3,3.0,3.5,9.11,1,397
397,0.91,330,116,4,5.0,4.5,9.45,1,398
398,0.67,312,103,3,3.5,4.0,8.78,0,399


In [154]:
census = pd.read_csv('census.csv')
census

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.592270,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.832960,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.500690,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,50,4,8,56,37,Wyoming,Sweetwater County,43806,43806,43593,...,1.072643,16.243199,-5.339774,-14.252889,-14.248864,1.255221,16.243199,-5.295460,-14.075283,-14.070195
3189,50,4,8,56,39,Wyoming,Teton County,21294,21294,21297,...,-1.589565,0.972695,19.525929,14.143021,-0.564849,0.654527,2.408578,21.160658,16.308671,1.520747
3190,50,4,8,56,41,Wyoming,Uinta County,21118,21118,21102,...,-17.755986,-4.916350,-6.902954,-14.215862,-12.127022,-18.136812,-5.536861,-7.521840,-14.740608,-12.606351
3191,50,4,8,56,43,Wyoming,Washakie County,8533,8533,8545,...,-11.637475,-0.827815,-2.013502,-17.781491,1.682288,-11.990126,-1.182592,-2.250385,-18.020168,1.441961


In [155]:
census['SUMLEV'].unique()

array([40, 50])

In [156]:
census[census['SUMLEV']==50].head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [157]:
columns_to_keep = ['STNAME','CTYNAME', 'BIRTHS2010', 'BIRTHS2011', 'BIRTHS2012', 'BIRTHS2013', 'BIRTHS2014', 'BIRTHS2015',
                  'POPESTIMATE2010','POPESTIMATE2011','POPESTIMATE2012','POPESTIMATE2013','POPESTIMATE2014','POPESTIMATE2015']
census1 =census[columns_to_keep]
census1.head()

Unnamed: 0,STNAME,CTYNAME,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
0,Alabama,Alabama,14226,59689,59062,57938,58334,58305,4785161,4801108,4816089,4830533,4846411,4858979
1,Alabama,Autauga County,151,636,615,574,623,600,54660,55253,55175,55038,55290,55347
2,Alabama,Baldwin County,517,2187,2092,2160,2186,2240,183193,186659,190396,195126,199713,203709
3,Alabama,Barbour County,70,335,300,283,260,269,27341,27226,27159,26973,26815,26489
4,Alabama,Bibb County,44,266,245,259,247,253,22861,22733,22642,22512,22549,22583


In [158]:
# multi-level indexing
census2 = census1.set_index(['STNAME','CTYNAME'])
census2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alabama,Alabama,14226,59689,59062,57938,58334,58305,4785161,4801108,4816089,4830533,4846411,4858979
Alabama,Autauga County,151,636,615,574,623,600,54660,55253,55175,55038,55290,55347
Alabama,Baldwin County,517,2187,2092,2160,2186,2240,183193,186659,190396,195126,199713,203709
Alabama,Barbour County,70,335,300,283,260,269,27341,27226,27159,26973,26815,26489
Alabama,Bibb County,44,266,245,259,247,253,22861,22733,22642,22512,22549,22583


In [159]:
census2.loc['Michigan', 'Washtenaw County']

  census2.loc['Michigan', 'Washtenaw County']


Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Michigan,Washtenaw County,977,3826,3780,3662,3683,3709,345563,349048,351213,354289,357029,358880


In [160]:
census2.loc[ [('Michigan','Washtenaw County'),
        ('Michigan','Wayne County')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Michigan,Washtenaw County,977,3826,3780,3662,3683,3709,345563,349048,351213,354289,357029,358880
Michigan,Wayne County,5918,23819,23270,23377,23607,23586,1815199,1801273,1792514,1775713,1766008,1759335


# Missing Values

In [161]:
grade = pd.read_csv('class-grades.csv',on_bad_lines='skip')

In [162]:
grade

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.50
1,8,95.05,105.49,67.50,99.07,68.33
2,8,83.70,83.17,30.00,63.15,48.89
3,7,81.22,96.06,49.38,105.93,80.56
4,8,91.32,93.64,95.00,107.41,73.89
...,...,...,...,...,...,...
91,8,96.73,103.71,45.00,93.52,61.94
92,7,85.34,80.54,41.25,93.70,39.72
93,8,89.94,102.77,87.50,90.74,87.78
94,7,95.60,76.13,66.25,99.81,85.56


In [163]:
mask = df.isnull()

In [164]:
mask.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False


In [165]:
grade.dropna().head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,30.0,63.15,48.89
3,7,81.22,96.06,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61
10,7,80.44,90.2,75.0,91.48,39.72


In [166]:
grade.isnull().sum()

Prefix        0
Assignment    0
Tutorial      0
Midterm       0
TakeHome      1
Final         0
dtype: int64

In [167]:
grade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Prefix      96 non-null     int64  
 1   Assignment  96 non-null     float64
 2   Tutorial    96 non-null     float64
 3   Midterm     96 non-null     float64
 4   TakeHome    95 non-null     float64
 5   Final       96 non-null     float64
dtypes: float64(5), int64(1)
memory usage: 4.6 KB


In [168]:
grade.fillna(0, inplace=True)

In [169]:
grade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Prefix      96 non-null     int64  
 1   Assignment  96 non-null     float64
 2   Tutorial    96 non-null     float64
 3   Midterm     96 non-null     float64
 4   TakeHome    96 non-null     float64
 5   Final       96 non-null     float64
dtypes: float64(5), int64(1)
memory usage: 4.6 KB


In [170]:
log = pd.read_csv('log.csv')

In [171]:
log.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [172]:
log = log.set_index('time')

In [173]:
log

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974454,cheryl,intro.html,6,,
1469974544,cheryl,intro.html,9,,
1469974574,cheryl,intro.html,10,,
1469977514,bob,intro.html,1,,
1469977544,bob,intro.html,1,,
1469977574,bob,intro.html,1,,
1469977604,bob,intro.html,1,,
1469974604,cheryl,intro.html,11,,
1469974694,cheryl,intro.html,14,,


In [174]:
log = log.sort_index()

In [175]:
log

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [176]:
log = log.reset_index()

In [177]:
log = log.set_index(['time','user'])

In [178]:
log

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [179]:
log = log.fillna(method='ffill')

In [180]:
log

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,False,10.0
1469974454,sue,advanced.html,24,False,10.0
1469974484,cheryl,intro.html,7,False,10.0
1469974514,cheryl,intro.html,8,False,10.0
1469974524,sue,advanced.html,25,False,10.0
1469974544,cheryl,intro.html,9,False,10.0
1469974554,sue,advanced.html,26,False,10.0
1469974574,cheryl,intro.html,10,False,10.0


In [181]:
df6 = pd.DataFrame({
    'A' : [1,2,4,5,6],
    'B': [1,5,6,7,3],
    'C': ['a','b','c','d','e']
})

In [182]:
df6

Unnamed: 0,A,B,C
0,1,1,a
1,2,5,b
2,4,6,c
3,5,7,d
4,6,3,e


In [183]:
df6.replace(1,20)

Unnamed: 0,A,B,C
0,20,20,a
1,2,5,b
2,4,6,c
3,5,7,d
4,6,3,e


In [184]:
df6 = df6.replace([1,3],[100,300])

In [185]:
df6

Unnamed: 0,A,B,C
0,100,100,a
1,2,5,b
2,4,6,c
3,5,7,d
4,6,300,e


In [186]:
log1 = pd.read_csv('log.csv')

In [187]:
log1

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


In [188]:
log1.replace(to_replace='.*.html$',value='webpage',regex=True)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,webpage,5,False,10.0
1,1469974454,cheryl,webpage,6,,
2,1469974544,cheryl,webpage,9,,
3,1469974574,cheryl,webpage,10,,
4,1469977514,bob,webpage,1,,
5,1469977544,bob,webpage,1,,
6,1469977574,bob,webpage,1,,
7,1469977604,bob,webpage,1,,
8,1469974604,cheryl,webpage,11,,
9,1469974694,cheryl,webpage,14,,


# Example: Manipulating DataFrame

In [189]:
df = pd.read_csv('datasets/Week2/presidents.csv')

In [190]:
df.head()

Unnamed: 0,Presidency,President,Wikipedia Entry,Took office,Left office,Party,Portrait,Thumbnail,Home State
0,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,30/04/1789[a],4/03/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia
1,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,4/03/1797,4/03/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts
2,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,4/03/1801,4/03/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia
3,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,4/03/1809[a],4/03/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia
4,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,4/03/1817,4/03/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia


In [194]:
df['First'] = df['President ']

In [197]:
df['First'] = df['First'].replace("[ ].*",'',regex=True)

In [198]:
df.head()

Unnamed: 0,Presidency,President,Wikipedia Entry,Took office,Left office,Party,Portrait,Thumbnail,Home State,First
0,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,30/04/1789[a],4/03/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia,George
1,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,4/03/1797,4/03/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts,John
2,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,4/03/1801,4/03/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia,Thomas
3,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,4/03/1809[a],4/03/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia,James
4,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,4/03/1817,4/03/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia,James


In [199]:
del(df['First'])

In [200]:
df.head()

Unnamed: 0,Presidency,President,Wikipedia Entry,Took office,Left office,Party,Portrait,Thumbnail,Home State
0,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,30/04/1789[a],4/03/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia
1,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,4/03/1797,4/03/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts
2,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,4/03/1801,4/03/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia
3,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,4/03/1809[a],4/03/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia
4,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,4/03/1817,4/03/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia


In [201]:
def split_name(row):
    row['First'] = row['President '].split(' ')[0]
    row['Last']  = row['President '].split(' ')[1]
    return row

In [202]:
df = df.apply(split_name, axis='columns')

In [208]:
df.head()

Unnamed: 0,Presidency,President,Wikipedia Entry,Took office,Left office,Party,Portrait,Thumbnail,Home State
0,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,30/04/1789[a],4/03/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia
1,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,4/03/1797,4/03/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts
2,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,4/03/1801,4/03/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia
3,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,4/03/1809[a],4/03/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia
4,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,4/03/1817,4/03/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia


In [205]:
del(df['First'])
del(df['Last'])

In [207]:
df.head()

Unnamed: 0,Presidency,President,Wikipedia Entry,Took office,Left office,Party,Portrait,Thumbnail,Home State
0,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,30/04/1789[a],4/03/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia
1,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,4/03/1797,4/03/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts
2,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,4/03/1801,4/03/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia
3,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,4/03/1809[a],4/03/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia
4,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,4/03/1817,4/03/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia


In [209]:
pattern = "(^[\w]+)(?:. )([\w]*$)"

In [210]:
df['President '].str.extract(pattern)

Unnamed: 0,0,1
0,Georg,Washington
1,Joh,Adams
2,Thoma,Jefferson
3,Jame,Madison
4,Jame,Monroe
5,,
6,Andre,Jackson
7,,
8,,
9,Joh,Tyler


In [211]:
pattern ="(?P<First>^[\w]*)(?:.* )(?P<Last>[\w]*$)"


In [212]:
names=df["President "].str.extract(pattern)
names

Unnamed: 0,First,Last
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe
5,John,Adams
6,Andrew,Jackson
7,Martin,Buren
8,William,Harrison
9,John,Tyler


In [213]:

df["Took office "] = df["Took office "].str.extract("([\w]{1,2}/[\w]{2}/[\w]{4})")
df["Took office "].head()

0    30/04/1789
1     4/03/1797
2     4/03/1801
3     4/03/1809
4     4/03/1817
Name: Took office , dtype: object

In [214]:
df

Unnamed: 0,Presidency,President,Wikipedia Entry,Took office,Left office,Party,Portrait,Thumbnail,Home State
0,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,30/04/1789,4/03/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia
1,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,4/03/1797,4/03/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts
2,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,4/03/1801,4/03/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia
3,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,4/03/1809,4/03/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia
4,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,4/03/1817,4/03/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia
5,6,John Quincy Adams,http://en.wikipedia.org/wiki/John_Quincy_Adams,4/03/1825,4/03/1829,Democratic-Republican/National Republican,JohnQuincyAdams.gif,thmb_JohnQuincyAdams.gif,Massachusetts
6,7,Andrew Jackson,http://en.wikipedia.org/wiki/Andrew_Jackson,4/03/1829,4/03/1837,Democratic,Andrew_jackson_head.gif,thmb_Andrew_jackson_head.gif,Tennessee
7,8,Martin Van Buren,http://en.wikipedia.org/wiki/Martin_Van_Buren,4/03/1837,4/03/1841,Democratic,MartinVanBuren.gif,thmb_MartinVanBuren.gif,New York
8,9,William Henry Harrison,http://en.wikipedia.org/wiki/William_Henry_Har...,4/03/1841,4/04/1841,Whig,WilliamHenryHarrison.gif,thmb_WilliamHenryHarrison.gif,Ohio
9,10,John Tyler,http://en.wikipedia.org/wiki/John_Tyler,4/04/1841,4/03/1845,Whig,JohnTyler.jpg,thmb_JohnTyler.jpg,Virginia
