### Here are a bunch more ways to use pandas to explore some interesting datasets. 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
plt.style.use('ggplot')
% matplotlib inline

In [None]:
#load the three dataframes used in this notebook straight from GitHub
#drinks: the alcohol-consumption csv from the fivethirtyeight data repo
drinks = pd.read_csv(
    'https://raw.githubusercontent.com/fivethirtyeight/data/master/alcohol-consumption/drinks.csv'
)
#users: a pipe-separated file, indexed by its user_id column
users = pd.read_csv(
    'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user',
    sep='|',
    index_col='user_id',
)
#ufo: first attempt, read with pandas' default header handling
ufo = pd.read_csv(
    'https://raw.githubusercontent.com/planetsig/ufo-reports/master/csv-data/ufo-scrubbed-geocoded-time-standardized.csv'
)

In [None]:
#the UFO file doesn't have column headings, so we'll need to sort that out
ufo.head()

In [None]:
#one way to do that is to write out the column names yourself, in file order...
columns = [
    'Date',
    'City',
    'State',
    'Country',
    'Shape',
    'Duration (seconds)',
    'Duration (hours/mins)',
    'Description',
    'Date_posted',
    'Latitude',
    'Longitude',
]

In [None]:
#...and hand that list to read_csv as `names`, so it is used for the column headings on import
ufo = pd.read_csv(
    'https://raw.githubusercontent.com/planetsig/ufo-reports/master/csv-data/ufo-scrubbed-geocoded-time-standardized.csv',
    names=columns,
)

In [None]:
#that's better
ufo.head()

In [None]:
#pandas makes it easy for you to take a look at multiple columns at once
ufo[['City', 'State']]         

In [None]:
#there are also a bunch of ways to slice and dice a pandas dataframe to get the information you need. (and here's a great stackoverflow page on the merits of .loc vs. .iloc: https://stackoverflow.com/questions/31593201/pandas-iloc-vs-ix-vs-loc-explanation)
#you can slice the rows by specifying which ones you want to see. Here we're looking at the cities in rows 3 through 6 (.loc slices by label and includes both endpoints)
ufo.loc[3:6, 'City']              

In [None]:
#and here are the cities and states for rows three through six 
ufo.loc[3:6, ['City','State']]

In [None]:
# pandas can also take care of quite a bit of pre-processing
#for example: translating the existing values of a column into a different set of values
users['is_male'] = users['gender'].map({
    'F': 0,
    'M': 1,
})

In [None]:
users

In [None]:
# when dealing with categorical data, you can also encode strings as integer values using .factorize (this automatically starts at 0)
users.occupation.factorize()

In [None]:
#pandas lets you inspect the distinct values held in a column
print(users['occupation'].nunique())   # how many distinct values there are
users['occupation'].unique()           # what those distinct values are

In [None]:
# you can also do some data cleaning using pandas, like replacing all instances of a value in a column
#here we're capitalizing TX
#assign the result back to the column instead of calling replace(..., inplace=True) on ufo.State:
#the in-place call acts on a Series pulled out of the dataframe (chained assignment), which pandas
#warns about and which does not update ufo once Copy-on-Write is enabled
ufo['State'] = ufo['State'].replace('tx', 'TX')

In [None]:
ufo

In [None]:
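#often, the data that needs the most cleaning is strings. you can access string methods using 'str'
#so let's look at every state abbreviation converted to upper case (this returns a new Series; assign it back to ufo['State'] if you want to keep the change)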
ufo.State.str.upper()                               

In [None]:
#'str' is also handy for querying
#keep the rows whose 'Description' contains the substring 'red'; na=False leaves out rows with no description
ufo[ufo['Description'].str.contains('red', na=False)]

In [None]:
#you can also change information in one column based on something in another (this is particularly useful when you're doing some string cleaning)
ufo.loc[ufo.Country == 'gb', 'State'] = "some county idk probably the countryside"

In [None]:
ufo

In [None]:
#you can also convert a string to the datetime format
#FUN FACT: Pandas was created to help people handle stock information and that's why it has pretty great datetime functionality
ufo['Date_posted'] = pd.to_datetime(ufo['Date_posted'])

In [None]:
#you can also pull out information like the year in the date time column
ufo['Year'] = ufo.Date_posted.dt.year

In [None]:
#or the month
ufo['Month'] = ufo.Date_posted.dt.month

In [None]:
ufo.head()

In [None]:
#you can use groupby statements to split data into groups based on some criteria, apply a function to that group, then combine the results of that function into a data structure
#in this example, we're splitting information into years and countries with UFO sightings and then counting how many sightings there were in each year-country unit
ufo.groupby('Year').Country.value_counts()

In [None]:
#what about plotting this bit of information to look at it visually?
#we'll first take a stab at comparing countries over time
ct_ufo = pd.crosstab(ufo.Year, ufo.Country)

In [None]:
ct_ufo

In [None]:
#set the font and figure size BEFORE plotting: rcParams are only picked up by figures
#created afterwards, so changing them after plt.plot() leaves this chart untouched
#(these are global settings, so they also apply to any figure drawn later)

#change font size
#'sans-serif' is a valid font family; 'normal' is not and only triggers a findfont fallback warning
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 22}
plt.rc('font', **font)

#change figure size ([width, height] in inches)
fig_size = [50, 30]
plt.rcParams["figure.figsize"] = fig_size

#one line per country (the columns of ct_ufo), with the year (its index) on the x-axis
plt.plot(ct_ufo, linewidth=4.0)
plt.legend(ct_ufo.columns)

In [None]:
#you can also sort pandas dataframes by their index
ufo.State.value_counts().sort_index()

In [None]:
#data cleaning often involves detecting duplicate rows -- here are several ways to do that
# users.duplicated()          # boolean Series: True for every row that repeats an earlier row
# users.duplicated().sum()    # count of duplicates
users.loc[users.duplicated()]   # only show duplicates
# users.drop_duplicates()     # drop duplicate rows
# users.age.duplicated()      # check a single column for duplicates
# users.duplicated(['age', 'gender', 'zip_code']).sum()   # specify columns for finding duplicates

In [None]:
#and sometimes, the easiest way to get a quick idea of the data is a cross-tabulation of two Series
pd.crosstab(users.occupation, users.gender)

In [None]:
# alternative syntax for boolean filtering 
users.query('age < 20')                 # users[users.age < 20]
users.query("age < 20 and gender=='M'") # users[(users.age < 20) & (users.gender=='M')]
users.query('age < 20 or age > 60')     # users[(users.age < 20) | (users.age > 60)]

In [None]:
# display the memory usage of a DataFrame
ufo.info()          # total usage
ufo.memory_usage()  # usage by column

In [None]:
# change a Series to the 'category' data type (reduces memory usage and increases performance)
ufo['State'] = ufo.State.astype('category')

In [None]:
# limit which rows are read when reading in a file
# read from the URL the drinks data came from: this notebook never writes a local
# 'drinks.csv', so the bare filename only works if that file already exists on disk
drinks_url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/alcohol-consumption/drinks.csv'
pd.read_csv(drinks_url, nrows=10)           # only read first 10 rows
pd.read_csv(drinks_url, skiprows=[1, 2])    # skip the first two rows of data

In [None]:
# write a DataFrame out to a CSV
drinks.to_csv('drinks_updated.csv')                 # index is used as first column
drinks.to_csv('drinks_updated.csv', index=False)    # ignore index

In [None]:
# save a DataFrame to disk (aka 'pickle') and read it from disk (aka 'unpickle')
drinks.to_pickle('drinks_pickle')
pd.read_pickle('drinks_pickle')

In [None]:
# randomly split a DataFrame into two parts
# random_state pins the seed so the same rows are drawn on every run
train = drinks.sample(frac=0.75, random_state=1)        # will contain 75% of the rows
test = drinks.loc[~drinks.index.isin(train.index)]      # every row whose index label was not sampled (the other 25%)


In [None]:
# change the maximum number of rows and columns printed ('None' means unlimited)
# use the full 'display.*' option names: the short forms 'max_rows' / 'max_columns' also match
# the 'styler.render.*' options on newer pandas, which makes set_option raise an OptionError
pd.set_option('display.max_rows', None)     # default is 60 rows
pd.set_option('display.max_columns', None)  # default is 20 columns
# print is a function in Python 3 (matches the other print(...) calls in this notebook)
print(drinks)

In [None]:
# reset options to defaults
# use the full 'display.*' option names: the short forms 'max_rows' / 'max_columns' match more
# than one option on newer pandas, which makes reset_option raise an OptionError
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')


In [None]:
# change the options temporarily (settings are restored when you exit the 'with' block)
# use the full 'display.*' option names: the short forms 'max_rows' / 'max_columns' match more
# than one option on newer pandas, which makes option_context raise an OptionError
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # print is a function in Python 3 (matches the other print(...) calls in this notebook)
    print(drinks)

In [None]:
#bundle your most commonly used EDA calls into one function you can run on any dataframe
def eda(dataframe):
    """Print a quick exploratory summary of ``dataframe``.

    Prints, in this order: the null count per column, the index, the
    column dtypes, the shape and the ``describe()`` table, followed by
    each column name and its number of distinct values. Returns None.
    """
    # each summary is computed only when its turn comes, so output appears
    # section by section in the order listed here
    summaries = (
        ("missing values \n", lambda frame: frame.isnull().sum()),
        ("dataframe index \n", lambda frame: frame.index),
        ("dataframe types \n", lambda frame: frame.dtypes),
        ("dataframe shape \n", lambda frame: frame.shape),
        ("dataframe describe \n", lambda frame: frame.describe()),
    )
    for heading, summarize in summaries:
        print(heading, summarize(dataframe))

    # iterating a dataframe yields its column labels
    for column_name in dataframe:
        print(column_name)
        print(dataframe[column_name].nunique())

eda(ufo)