### Here are a bunch more ways to use pandas to explore some interesting datasets. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
plt.style.use('ggplot')
% matplotlib inline

In [2]:
#import multiple dataframes
drinks = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/data/master/alcohol-consumption/drinks.csv')
users = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user', sep='|', index_col='user_id')
ufo = pd.read_csv('https://raw.githubusercontent.com/planetsig/ufo-reports/master/csv-data/ufo-scrubbed-geocoded-time-standardized.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#UFOs doesn't have column headings so we'll need to sort that out
ufo.head()

Unnamed: 0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit,4/27/2004,29.8830556,-97.9411111
0,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
1,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
2,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
3,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611
4,10/10/1961 19:00,bristol,tn,us,sphere,300,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595,-82.188889


In [4]:
#one way to do that is to create a list with the column names...
columns = ['Date', 'City', 'State', 'Country', 'Shape', 'Duration (seconds)', 'Duration (hours/mins)', 'Description', 'Date_posted', 'Latitude', 'Longitude']

In [5]:
#...and specify that that list should be your column names when importing 
ufo = pd.read_csv('https://raw.githubusercontent.com/planetsig/ufo-reports/master/csv-data/ufo-scrubbed-geocoded-time-standardized.csv', names=columns)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
#that's better
ufo.head()

Unnamed: 0,Date,City,State,Country,Shape,Duration (seconds),Duration (hours/mins),Description,Date_posted,Latitude,Longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [15]:
#pandas makes it easy for you to take a look at multiple columns at once
ufo[['City', 'State']]         

Unnamed: 0,City,State
0,san marcos,tx
1,lackland afb,tx
2,chester (uk/england),
3,edna,tx
4,kaneohe,hi
5,bristol,tn
6,penarth (uk/wales),
7,norwalk,ct
8,pell city,al
9,live oak,fl


In [16]:
#there are also a bunch of ways to slice and dice a pandas dataframe to get the information you need. (and here's a great stackoverflow page on the merits of .loc vs. .iloc: https://stackoverflow.com/questions/31593201/pandas-iloc-vs-ix-vs-loc-explanation)
#you can slice the rows to by specifying which ones you want to see. Here we're looking at the cities in rows 3 through 6 
ufo.loc[3:6, 'City']              

3                  edna
4               kaneohe
5               bristol
6    penarth (uk/wales)
Name: City, dtype: object

In [21]:
#and here are the cities and states for rows three through six 
ufo.loc[3:6, ['City','State']]

Unnamed: 0,City,State
3,edna,tx
4,kaneohe,hi
5,bristol,tn
6,penarth (uk/wales),


In [22]:
# you can also do quite a bit of pre-processing in pandas
#for example: mapping existing values to a different set of values
users['is_male'] = users.gender.map({'F':0, 'M':1})

In [23]:
users

Unnamed: 0_level_0,age,gender,occupation,zip_code,is_male
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,24,M,technician,85711,1
2,53,F,other,94043,0
3,23,M,writer,32067,1
4,24,M,technician,43537,1
5,33,F,other,15213,0
6,42,M,executive,98101,1
7,57,M,administrator,91344,1
8,36,M,administrator,05201,1
9,29,M,student,01002,1
10,53,M,lawyer,90703,1


In [34]:
# when dealing with categorical data, you can also encode strings as integer values using .factorize (this automatically starts at 0)
users.occupation.factorize()

(array([ 0,  1,  2,  0,  1,  3,  4,  4,  5,  6,  1,  1,  7,  8,  7,  9, 10,
         1, 11, 12,  2,  2, 13, 13, 14, 14, 11,  2, 10,  5, 13,  5,  5,  4,
        12,  5,  5,  1,  9,  8, 14,  4, 11,  0, 10, 15, 15,  4,  5,  2,  7,
         5, 10,  3, 10, 11, 16, 10,  7, 17, 14,  4, 15,  7,  7,  5,  5,  5,
        14, 14,  8,  4,  5,  8,  9,  5,  0,  4,  4,  4,  5, 10,  1,  3,  7,
         4,  4, 11,  4,  7, 15,  9,  3,  5,  4, 13, 13,  3,  5,  3,  5, 10,
         5,  5, 14, 18,  8,  7,  1,  5, 14, 19,  3, 10, 14, 17,  5,  4, 10,
         1, 11,  2, 13,  5,  6,  6, 16, 15, 15, 16,  4,  1, 14, 10,  5,  1,
         7, 20,  5,  5, 10,  1,  0, 10,  9, 13, 11, 14, 15, 13,  4,  7,  5,
         5,  1,  7, 14,  7,  5, 10,  6, 13,  4, 17,  1,  7,  1,  1,  1, 17,
         7, 15,  1,  4,  8,  8, 10,  1,  9,  4,  3, 10,  8, 11, 11,  3,  7,
         5, 13,  4,  4,  7,  5,  4,  8,  2,  0,  5,  2, 10,  2,  7,  5, 11,
         6,  5, 15, 14,  7, 14, 19,  7,  3, 11, 10, 14,  1,  4, 10, 11,  5,
        10, 

In [36]:
#pandas lets you take a look at unique values in a column
print(users.occupation.nunique())      # count the number of unique values
users.occupation.unique()       # return what those unique values are

21


array(['technician', 'other', 'writer', 'executive', 'administrator',
       'student', 'lawyer', 'educator', 'scientist', 'entertainment',
       'programmer', 'librarian', 'homemaker', 'artist', 'engineer',
       'marketing', 'none', 'healthcare', 'retired', 'salesman', 'doctor'], dtype=object)

In [37]:
# you can also do some data cleaning using pandas, like replacing all instances of a value in a column
#here we're capitalizing TX
ufo.State.replace('tx', 'TX', inplace=True)

In [38]:
ufo

Unnamed: 0,Date,City,State,Country,Shape,Duration (seconds),Duration (hours/mins),Description,Date_posted,Latitude,Longitude
0,10/10/1949 20:30,san marcos,TX,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,TX,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,TX,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611
5,10/10/1961 19:00,bristol,tn,us,sphere,300,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.5950000,-82.188889
6,10/10/1965 21:00,penarth (uk/wales),,gb,circle,180,about 3 mins,penarth uk circle 3mins stayed 30ft above m...,2/14/2006,51.434722,-3.180000
7,10/10/1965 23:45,norwalk,ct,us,disk,1200,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175000,-73.408333
8,10/10/1966 20:00,pell city,al,us,disk,180,3 minutes,Strobe Lighted disk shape object observed clos...,3/19/2009,33.5861111,-86.286111
9,10/10/1966 21:00,live oak,fl,us,disk,120,several minutes,Saucer zaps energy from powerline as my pregna...,5/11/2005,30.2947222,-82.984167


In [39]:
#often, the data that needs the most cleaning are strings. you can access string methods using 'str'
#so let's convert every state abbreviation to upper case
ufo.State.str.upper()                               

0         TX
1         TX
2        NaN
3         TX
4         HI
5         TN
6        NaN
7         CT
8         AL
9         FL
10        CA
11        NC
12        NY
13        KY
14        NC
15        KY
16        MI
17        CT
18       NaN
19        MA
20       NaN
21        KS
22        SC
23        WA
24       NaN
25        TX
26        KY
27        NY
28        NY
29        AB
        ... 
80302     NC
80303     NY
80304     TN
80305     CA
80306     UT
80307     IN
80308     NY
80309     SC
80310     CA
80311     SC
80312     GA
80313     VA
80314     NY
80315     OH
80316     CA
80317     TX
80318     AB
80319     NJ
80320     AL
80321     TN
80322    NaN
80323     FL
80324     NC
80325     ON
80326     GA
80327     TN
80328     ID
80329     CA
80330     VA
80331     OK
Name: State, Length: 80332, dtype: object

In [46]:
#you can also use 'str' to query information
#here we're checking the substrings within the 'Description' column
ufo[ufo['Description'].str.contains('red')==True] 

Unnamed: 0,Date,City,State,Country,Shape,Duration (seconds),Duration (hours/mins),Description,Date_posted,Latitude,Longitude
0,10/10/1949 20:30,san marcos,TX,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
7,10/10/1965 23:45,norwalk,ct,us,disk,1200,20 minutes,A bright orange color changing to reddish colo...,10/2/1999,41.1175000,-73.408333
11,10/10/1968 19:00,brevard,nc,us,fireball,180,3 minutes,silent red /orange mass of energy floated by t...,6/12/2008,35.2333333,-82.734444
28,10/10/1979 00:00,poughkeepsie,ny,us,chevron,900,15 minutes,1/4 moon-like&#44 its &#39chord&#39 or flat s...,4/16/2005,41.7002778,-73.921389
34,10/10/1980 23:30,manchester,nh,us,light,300,3 to 5 min,A red glowing sphere stopped and watched me.,11/21/2010,42.9955556,-71.455278
57,10/10/1993 23:00,carthage,tn,us,other,60,less than 1 min,1 object with green and red lights,3/21/2003,36.2522222,-85.951667
82,10/10/1999 00:01,martinez,ca,us,changing,3600,1 hour,Bright objects&#44 red and green flashing ligh...,10/19/1999,38.0194444,-122.133056
87,10/10/1999 20:35,hayward,ca,us,circle,90,60-90 sec,Many different colored glowing objects,11/20/2001,37.6688889,-122.079722
88,10/10/1999 21:00,rachel,nv,us,light,10800,3 hours,Bright lights with incredible agility seen fro...,5/24/2005,37.6447222,-115.742778
102,10/10/2001 03:00,rockwell city,ia,us,triangle,240,4 min.s,Large&#44silent&#44slow&#44low to the ground d...,7/1/2002,42.3952778,-94.633611


In [54]:
#you can also change information in one column based on something in another (this is particularly useful when you're doing some string cleaning)
ufo.loc[ufo.Country == 'gb', 'State'] = "some county idk probably the countryside"

In [55]:
ufo

Unnamed: 0,Date,City,State,Country,Shape,Duration (seconds),Duration (hours/mins),Description,Date_posted,Latitude,Longitude,Year,Month
0,10/10/1949 20:30,san marcos,TX,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,2004-04-27,29.8830556,-97.941111,2004,4
1,10/10/1949 21:00,lackland afb,TX,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,-98.581082,2005,12
2,10/10/1955 17:00,chester (uk/england),some county idk probably the countryside,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,-2.916667,2008,1
3,10/10/1956 21:00,edna,TX,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.9783333,-96.645833,2004,1
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.4180556,-157.803611,2004,1
5,10/10/1961 19:00,bristol,tn,us,sphere,300,5 minutes,My father is now 89 my brother 52 the girl wit...,2007-04-27,36.5950000,-82.188889,2007,4
6,10/10/1965 21:00,penarth (uk/wales),some county idk probably the countryside,gb,circle,180,about 3 mins,penarth uk circle 3mins stayed 30ft above m...,2006-02-14,51.434722,-3.180000,2006,2
7,10/10/1965 23:45,norwalk,ct,us,disk,1200,20 minutes,A bright orange color changing to reddish colo...,1999-10-02,41.1175000,-73.408333,1999,10
8,10/10/1966 20:00,pell city,al,us,disk,180,3 minutes,Strobe Lighted disk shape object observed clos...,2009-03-19,33.5861111,-86.286111,2009,3
9,10/10/1966 21:00,live oak,fl,us,disk,120,several minutes,Saucer zaps energy from powerline as my pregna...,2005-05-11,30.2947222,-82.984167,2005,5


In [47]:
#you can also convert a string to the datetime format
#FUN FACT: Pandas was created to help people handle stock information and that's why it has pretty gerat date time functionalities
ufo['Date_posted'] = pd.to_datetime(ufo['Date_posted'])

In [48]:
#you can also pull out information like the year in the date time column
ufo['Year'] = ufo.Date_posted.dt.year

In [49]:
#or the month
ufo['Month'] = ufo.Date_posted.dt.month

In [50]:
ufo.head()

Unnamed: 0,Date,City,State,Country,Shape,Duration (seconds),Duration (hours/mins),Description,Date_posted,Latitude,Longitude,Year,Month
0,10/10/1949 20:30,san marcos,TX,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,2004-04-27,29.8830556,-97.941111,2004,4
1,10/10/1949 21:00,lackland afb,TX,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,-98.581082,2005,12
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,-2.916667,2008,1
3,10/10/1956 21:00,edna,TX,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.9783333,-96.645833,2004,1
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.4180556,-157.803611,2004,1


In [51]:
#you can use groupby statements to split data into groups based on some criteria, apply a function to that group, then combine the results of that function into a data structure
#in this example, we're splitting information into years and countries with UFO sightings and then counting how many sightings there were in each year-country unit
ufo.groupby('Year').Country.value_counts()

Year  Country
1998  us          756
      au            7
      gb            4
1999  us         3688
      gb           16
      au           14
      ca            5
      de            4
2000  us         2323
      ca          229
      gb           69
      au           21
      de            2
2001  us         2719
      gb          145
      ca          141
      au           45
      de            7
2002  us         3203
      ca          190
      gb          152
      au           59
      de            4
2003  us         3584
      ca          330
      gb          165
      au           56
      de            7
2004  us         3952
      ca          330
                 ... 
2009  us         4806
      gb          237
      ca          152
      au           19
      de           11
2010  us         3656
      ca          152
      gb          133
      au           12
      de            6
2011  us         4773
      ca          151
      gb           62
      au          

In [52]:
#you can also sort pandas dataframes by their index
ufo.State.value_counts().sort_index()

TX    3677
ab     333
ak     354
al     691
ar     666
az    2689
bc     788
ca    9655
co    1505
ct     968
dc      99
de     183
fl    4200
ga    1347
hi     353
ia     707
id     554
il    2645
in    1386
ks     653
ky     914
la     598
ma    1358
mb     155
md     911
me     633
mi    2071
mn    1081
mo    1576
ms     415
      ... 
nj    1512
nm     815
ns     143
nt      20
nv     905
ny    3219
oh    2425
ok     766
on    1584
or    1845
pa    2582
pe      17
pq      90
pr      33
qc     178
ri     290
sa      30
sc    1076
sd     196
sk      98
tn    1193
ut     743
va    1393
vt     307
wa    4268
wi    1333
wv     486
wy     205
yk       7
yt      13
Name: State, Length: 67, dtype: int64

In [26]:
ufo

Unnamed: 0,Date,City,State,Country,Shape,Duration (seconds),Duration (hours/mins),Description,Date_posted,Latitude,Longitude,Year,Month
0,10/10/1949 20:30,san marcos,TX,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,2004-04-27,29.8830556,-97.941111,2004,4
1,10/10/1949 21:00,lackland afb,TX,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,-98.581082,2005,12
2,10/10/1955 17:00,chester (uk/england),some county idk probably the countryside,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,-2.916667,2008,1
3,10/10/1956 21:00,edna,TX,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.9783333,-96.645833,2004,1
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.4180556,-157.803611,2004,1
5,10/10/1961 19:00,bristol,tn,us,sphere,300,5 minutes,My father is now 89 my brother 52 the girl wit...,2007-04-27,36.5950000,-82.188889,2007,4
6,10/10/1965 21:00,penarth (uk/wales),some county idk probably the countryside,gb,circle,180,about 3 mins,penarth uk circle 3mins stayed 30ft above m...,2006-02-14,51.434722,-3.180000,2006,2
7,10/10/1965 23:45,norwalk,ct,us,disk,1200,20 minutes,A bright orange color changing to reddish colo...,1999-10-02,41.1175000,-73.408333,1999,10
8,10/10/1966 20:00,pell city,al,us,disk,180,3 minutes,Strobe Lighted disk shape object observed clos...,2009-03-19,33.5861111,-86.286111,2009,3
9,10/10/1966 21:00,live oak,fl,us,disk,120,several minutes,Saucer zaps energy from powerline as my pregna...,2005-05-11,30.2947222,-82.984167,2005,5


In [56]:
#data cleaning often involves detecting duplicate rows -- here's a bunch of ways to do that
#users.duplicated()        #True if there are duplicates
# users.duplicated().sum()    # count of duplicates
users[users.duplicated()]   # only show duplicates
# users.drop_duplicates()     # drop duplicate rows
# users.age.duplicated()      # check a single column for duplicates
# users.duplicated(['age', 'gender', 'zip_code']).sum()   # specify columns for finding duplicates

Unnamed: 0_level_0,age,gender,occupation,zip_code,is_male
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
496,21,F,student,55414,0
572,51,M,educator,20003,1
621,17,M,student,60402,1
684,28,M,student,55414,1
733,44,F,other,60630,0
805,27,F,other,20009,0
890,32,M,student,97301,1


In [57]:
#and sometimes, the easiest way to get a quick idea of the data is a cross-tabulation of two Series
pd.crosstab(users.occupation, users.gender)

gender,F,M
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,36,43
artist,13,15
doctor,0,7
educator,26,69
engineer,2,65
entertainment,2,16
executive,3,29
healthcare,11,5
homemaker,6,1
lawyer,2,10


In [58]:
# alternative syntax for boolean filtering 
users.query('age < 20')                 # users[users.age < 20]
users.query("age < 20 and gender=='M'") # users[(users.age < 20) & (users.gender=='M')]
users.query('age < 20 or age > 60')     # users[(users.age < 20) | (users.age > 60)]

Unnamed: 0_level_0,age,gender,occupation,zip_code,is_male
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30,7,M,student,55436,1
36,19,F,student,93117,0
52,18,F,student,55105,0
57,16,M,none,84010,1
67,17,M,student,60402,1
68,19,M,student,22904,1
101,15,M,student,05146,1
106,61,M,retired,55125,1
110,19,M,student,77840,1
142,13,M,other,48118,1


In [59]:
# display the memory usage of a DataFrame
ufo.info()          # total usage
ufo.memory_usage()  # usage by column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 13 columns):
Date                     80332 non-null object
City                     80332 non-null object
State                    76428 non-null object
Country                  70662 non-null object
Shape                    78400 non-null object
Duration (seconds)       80332 non-null object
Duration (hours/mins)    80332 non-null object
Description              80317 non-null object
Date_posted              80332 non-null datetime64[ns]
Latitude                 80332 non-null object
Longitude                80332 non-null float64
Year                     80332 non-null int64
Month                    80332 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(2), object(9)
memory usage: 8.0+ MB


Index                        80
Date                     642656
City                     642656
State                    642656
Country                  642656
Shape                    642656
Duration (seconds)       642656
Duration (hours/mins)    642656
Description              642656
Date_posted              642656
Latitude                 642656
Longitude                642656
Year                     642656
Month                    642656
dtype: int64

In [60]:
# change a Series to the 'category' data type (reduces memory usage and increases performance)
ufo['State'] = ufo.State.astype('category')

In [None]:
# limit which rows are read when reading in a file
pd.read_csv('drinks.csv', nrows=10)           # only read first 10 rows
pd.read_csv('drinks.csv', skiprows=[1, 2])    # skip the first two rows of data

In [None]:
# write a DataFrame out to a CSV
drinks.to_csv('drinks_updated.csv')                 # index is used as first column
drinks.to_csv('drinks_updated.csv', index=False)    # ignore index

In [None]:
# save a DataFrame to disk (aka 'pickle') and read it from disk (aka 'unpickle')
drinks.to_pickle('drinks_pickle')
pd.read_pickle('drinks_pickle')

In [None]:
# randomly sample a DataFrame
train = drinks.sample(frac=0.75, random_state=1)    # will contain 75% of the rows
test = drinks[~drinks.index.isin(train.index)]      # will contain the other 25%


In [None]:
# change the maximum number of rows and columns printed ('None' means unlimited)
pd.set_option('max_rows', None)     # default is 60 rows
pd.set_option('max_columns', None)  # default is 20 columns
print drinks

In [None]:
# reset options to defaults
pd.reset_option('max_rows')
pd.reset_option('max_columns')


In [None]:
# change the options temporarily (settings are restored when you exit the 'with' block)
with pd.option_context('max_rows', None, 'max_columns', None):
    print drinks

In [63]:
#combine everything into one function that runs your most commonly used EDA calls for you
def eda(dataframe):
    print ("missing values \n", dataframe.isnull().sum())
    print ("dataframe index \n", dataframe.index)
    print ("dataframe types \n", dataframe.dtypes)
    print ("dataframe shape \n", dataframe.shape)
    print ("dataframe describe \n", dataframe.describe())
    for item in dataframe:
        print (item)
        print (dataframe[item].nunique())

eda(ufo)

missing values 
 Date                        0
City                        0
State                    3904
Country                  9670
Shape                    1932
Duration (seconds)          0
Duration (hours/mins)       0
Description                15
Date_posted                 0
Latitude                    0
Longitude                   0
Year                        0
Month                       0
dtype: int64
dataframe index 
 RangeIndex(start=0, stop=80332, step=1)
dataframe types 
 Date                             object
City                             object
State                          category
Country                          object
Shape                            object
Duration (seconds)               object
Duration (hours/mins)            object
Description                      object
Date_posted              datetime64[ns]
Latitude                         object
Longitude                       float64
Year                              int64
Month                   